* Revert "fix(cloudinit): bump size guardrail 30720 → 32000 bytes (#1517)" This reverts commit05c6edb4fe. * Revert "fix(cloud-init): pre-install hcloud-ccm before Flux (unblocks per-region LB allocation) (#1516)" This reverts commitb7140b9069. --------- Co-authored-by: claude <claude@anthropic.com>
146 lines
5.8 KiB
Plaintext
146 lines
5.8 KiB
Plaintext
#cloud-config
|
||
# Catalyst Sovereign worker bootstrap.
|
||
# Sovereign: ${sovereign_fqdn}
|
||
#
|
||
# This script:
#   1. Installs OS hardening (SSH password-auth off, fail2ban, unattended-upgrades).
#   2. Disables swap and waits for a private-network route to the control plane.
#   3. Joins the cluster as a k3s agent via the control plane's private IP.
#   4. Touches /var/lib/catalyst/cloud-init-complete for the provisioner.
|
||
|
||
# Refresh the package index on first boot; do not upgrade installed packages.
package_update: true
package_upgrade: false

# Packages installed on every worker, plus optional hardening packages that
# are only rendered when the matching template flag is set.
# Sequence items are kept at column 0 on purpose: the `~}` strip marker
# removes all whitespace after a directive, including the indentation of the
# next line, so indented items would render mis-aligned.
packages:
- curl
- iptables
- ca-certificates
%{ if enable_fail2ban ~}
- fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
- unattended-upgrades
- apt-listchanges
%{ endif ~}
|
||
|
||
write_files:
# ── OS hardening: SSH daemon ──────────────────────────────────────────
# Same drop-in as the control plane (Phase-0 baseline). Operators can
# tighten it further via Crossplane Composition once Phase 1 completes.
# The drop-in turns off password / keyboard-interactive logins, restricts
# root to key-based login, disables X11/agent/TCP forwarding and shortens
# the authentication and idle-session limits.
- path: /etc/ssh/sshd_config.d/99-catalyst-hardening.conf
  permissions: '0644'
  content: |
    # Managed by Catalyst Sovereign cloud-init — do not edit by hand.
    PasswordAuthentication no
    KbdInteractiveAuthentication no
    ChallengeResponseAuthentication no
    PermitRootLogin prohibit-password
    PermitEmptyPasswords no
    UsePAM yes
    X11Forwarding no
    AllowAgentForwarding no
    AllowTcpForwarding no
    ClientAliveInterval 300
    ClientAliveCountMax 2
    MaxAuthTries 3
    LoginGraceTime 30
|
||
|
||
%{ if enable_unattended_upgrades ~}
# Rendered only when enable_unattended_upgrades is set.
# 20auto-upgrades switches on the periodic apt jobs.
- path: /etc/apt/apt.conf.d/20auto-upgrades
  permissions: '0644'
  content: |
    APT::Periodic::Update-Package-Lists "1";
    APT::Periodic::Unattended-Upgrade "1";
    APT::Periodic::AutocleanInterval "7";
# 52unattended-upgrades-catalyst limits upgrades to the security origins and
# enables automatic reboots at 02:30. `$$` is the template escape for a
# literal `$`, so apt receives the distro_id / distro_codename placeholders
# unexpanded.
- path: /etc/apt/apt.conf.d/52unattended-upgrades-catalyst
  permissions: '0644'
  content: |
    Unattended-Upgrade::Allowed-Origins {
        "$${distro_id}:$${distro_codename}-security";
        "$${distro_id}ESMApps:$${distro_codename}-apps-security";
        "$${distro_id}ESM:$${distro_codename}-infra-security";
    };
    Unattended-Upgrade::Automatic-Reboot "true";
    Unattended-Upgrade::Automatic-Reboot-Time "02:30";
    Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
    Unattended-Upgrade::Remove-Unused-Dependencies "true";
%{ endif ~}
|
||
|
||
%{ if enable_fail2ban ~}
# Rendered only when enable_fail2ban is set: sshd jail that bans a source
# for 1h after 5 failures within 10m, reading logs from the systemd journal.
- path: /etc/fail2ban/jail.d/catalyst-sshd.local
  permissions: '0644'
  content: |
    [sshd]
    enabled = true
    port = ssh
    filter = sshd
    maxretry = 5
    findtime = 10m
    bantime = 1h
    backend = systemd
%{ endif ~}
|
||
|
||
runcmd:
# Turn swap off now and delete every /etc/fstab line that mentions swap.
- swapoff -a
- sed -i '/swap/d' /etc/fstab
# Select the legacy iptables/ip6tables alternatives; best-effort (|| true).
- update-alternatives --set iptables /usr/sbin/iptables-legacy || true
- update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy || true
# Reload the SSH daemon under whichever unit name exists; best-effort.
- systemctl reload ssh || systemctl reload sshd || true
%{ if enable_fail2ban ~}
- systemctl enable --now fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
- systemctl enable --now unattended-upgrades
%{ endif ~}
|
||
# ── Private NIC bring-up BEFORE k3s agent join (prov #71 root cause) ────
#
# Same Hetzner private-NIC hot-attach race as the control-plane template
# — see cloudinit-control-plane.tftpl for the full rationale. Without
# this wait, k3s agent's `K3S_URL=https://${cp_private_ip}:6443` dials
# the CP via a private IP that has no route, agent join silently fails,
# the worker never becomes Ready, autoscaler times out the scale-up.
#
# The readiness test deliberately ignores a route that leaves through the
# primary NIC (eth0): when eth0 carries a default route, `ip route get`
# resolves ANY address through it, so a plain "is there a route" test
# succeeds on the first iteration even though no private NIC is attached.
# NOTE(review): assumes eth0, when present, is the public NIC — confirm.
#
# Polls up to 60 times, 2s apart (~120s). While waiting, if an extra NIC is
# present but not mentioned in any netplan file, a DHCP stanza is written
# for it and applied. Exits 1 when no private route appears.
- |
  PRIMARY_NIC=$(ip -br link | awk '$1 == "eth0" {print $1; exit}')

  # Print the egress device used to reach the CP, but only if it is an
  # eth*/en* device other than the primary NIC. Prints nothing otherwise.
  # With an empty PRIMARY_NIC any eth*/en* device is accepted.
  private_route_dev() {
    ip -4 route get ${cp_private_ip} 2>/dev/null | awk -v primary="$${PRIMARY_NIC}" '{for (f = 1; f < NF; f++) if ($f == "dev" && $(f+1) != primary && $(f+1) ~ /^(eth|en|ens|enp)/) {print $(f+1); exit}}'
  }

  echo "[private-nic] waiting for private NIC to reach the CP ${cp_private_ip}..."
  for i in $(seq 1 60); do
    if [ -n "$(private_route_dev)" ]; then
      SRC=$(ip -4 route get ${cp_private_ip} | awk '/src/ {for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}')
      echo "[private-nic] route to ${cp_private_ip} is up via src $${SRC} (iter $${i})"
      break
    fi
    # First link that is neither loopback, the primary NIC, nor a
    # container/overlay/VPN interface.
    EXTRA=$(ip -br link | awk -v primary="$${PRIMARY_NIC}" '$1 != "lo" && $1 != primary && $1 !~ /^(docker|veth|tun|wg|cni|flannel|cilium)/ {print $1; exit}')
    if [ -n "$${EXTRA}" ] && ! grep -q "$${EXTRA}:" /etc/netplan/*.yaml 2>/dev/null; then
      echo "[private-nic] generating netplan stanza for $${EXTRA}"
      # Heredoc body and terminator sit at script column 0 so the shell
      # recognises the NETPLAN terminator.
      cat >/etc/netplan/60-private-network.yaml <<NETPLAN
  network:
    version: 2
    ethernets:
      $${EXTRA}:
        dhcp4: true
        dhcp4-overrides:
          use-dns: false
          use-routes: false
  NETPLAN
      chmod 0600 /etc/netplan/60-private-network.yaml
      netplan apply || true
    fi
    sleep 2
  done
  if [ -z "$(private_route_dev)" ]; then
    echo "[private-nic] FATAL: no route to CP ${cp_private_ip} after 120s; k3s agent join will fail" >&2
    ip -br addr >&2
    exit 1
  fi
|
||
|
||
# Join the control plane over the private network (10.0.1.2 — the first
# control-plane node in the network subnet). k3s_version is pinned so every
# worker in this Sovereign runs the same Kubernetes minor as the CP.
#
# --kubelet-arg=max-pods=220 doubles the kubelet default of 110. With the
# default, the qa-fixtures + full bootstrap-kit pod set (catalyst-api,
# helm-controller, source-controller, plus the 45-HR install chain → Helm
# hooks → bp-* runtime pods) fills a node completely. Seen on prov #63
# (cpx52 × 3): CP at 110/110 pods, bp-catalyst-platform's catalyst-api pod
# stuck on "Too many pods" → install hook timed out.
#
# NOTE(review): nothing here checks the installer's exit status, so the
# completion marker below is presumably written even if the join fails —
# confirm that is intended.
- |
  curl -sfL https://get.k3s.io | \
    INSTALL_K3S_VERSION=${k3s_version} \
    K3S_URL=https://${cp_private_ip}:6443 \
    K3S_TOKEN=${k3s_token} \
    INSTALL_K3S_EXEC="agent --kubelet-arg=max-pods=220 --node-label catalyst.openova.io/role=worker" \
    sh -
# Completion marker for the provisioner.
- mkdir -p /var/lib/catalyst
- touch /var/lib/catalyst/cloud-init-complete

final_message: "Catalyst worker bootstrap complete after $UPTIME seconds"
|