openova/infra/hetzner/cloudinit-worker.tftpl
e3mrah 766890510b
Revert PR #1516 + #1517 — Gap A hcloud-ccm pre-install hangs cloud-init (#1518)
* Revert "fix(cloudinit): bump size guardrail 30720 → 32000 bytes (#1517)"

This reverts commit 05c6edb4fe.

* Revert "fix(cloud-init): pre-install hcloud-ccm before Flux (unblocks per-region LB allocation) (#1516)"

This reverts commit b7140b9069.

---------

Co-authored-by: claude <claude@anthropic.com>
2026-05-16 13:32:18 +04:00

146 lines
5.8 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#cloud-config
# Catalyst Sovereign worker bootstrap.
# Sovereign: ${sovereign_fqdn}
#
# This cloud-config:
# 1. Installs OS hardening (SSH password-auth off; fail2ban and
#    unattended-upgrades when their template flags are enabled).
# 2. Waits for a route to the control plane's private IP, then joins the
#    cluster as a k3s agent via that IP.
# 3. Touches /var/lib/catalyst/cloud-init-complete for the provisioner.
# Refresh the package index on first boot, but do not upgrade the packages
# already present on the image.
package_update: true
package_upgrade: false
# Packages installed on every worker. curl is used by the k3s installer
# download and iptables by the update-alternatives calls in runcmd. The
# fail2ban and unattended-upgrades entries are rendered only when the
# matching template flag is enabled.
packages:
- curl
- iptables
- ca-certificates
%{ if enable_fail2ban ~}
- fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
- unattended-upgrades
- apt-listchanges
%{ endif ~}
write_files:
# ── OS hardening: SSH daemon ──────────────────────────────────────────
# Phase-0 baseline, kept identical to the control-plane drop-in. Operators
# tighten it further via Crossplane Composition once Phase 1 completes.
# The drop-in disables password / keyboard-interactive logins, restricts
# root to key-based auth, turns off X11, agent and TCP forwarding, and
# bounds idle sessions and authentication attempts.
# NOTE(review): sshd keeps the first value it reads for each keyword, so a
# file sorting earlier in sshd_config.d could take precedence over this
# one — confirm no such file sets these keywords on the target image.
- path: /etc/ssh/sshd_config.d/99-catalyst-hardening.conf
  permissions: '0644'
  content: |
    # Managed by Catalyst Sovereign cloud-init — do not edit by hand.
    PasswordAuthentication no
    KbdInteractiveAuthentication no
    ChallengeResponseAuthentication no
    PermitRootLogin prohibit-password
    PermitEmptyPasswords no
    UsePAM yes
    X11Forwarding no
    AllowAgentForwarding no
    AllowTcpForwarding no
    ClientAliveInterval 300
    ClientAliveCountMax 2
    MaxAuthTries 3
    LoginGraceTime 30
%{ if enable_unattended_upgrades ~}
# Periodic apt settings: refresh package lists and run unattended-upgrade
# every day, autoclean the package cache every 7 days.
- path: /etc/apt/apt.conf.d/20auto-upgrades
  permissions: '0644'
  content: |
    APT::Periodic::Update-Package-Lists "1";
    APT::Periodic::Unattended-Upgrade "1";
    APT::Periodic::AutocleanInterval "7";
# Limit unattended upgrades to the distro security origins (including the
# ESM apps/infra ones), allow an automatic reboot at 02:30, and remove
# unused kernels and dependencies. The doubled dollar signs are template
# escapes: apt receives single-dollar placeholders and expands them itself.
- path: /etc/apt/apt.conf.d/52unattended-upgrades-catalyst
  permissions: '0644'
  content: |
    Unattended-Upgrade::Allowed-Origins {
        "$${distro_id}:$${distro_codename}-security";
        "$${distro_id}ESMApps:$${distro_codename}-apps-security";
        "$${distro_id}ESM:$${distro_codename}-infra-security";
    };
    Unattended-Upgrade::Automatic-Reboot "true";
    Unattended-Upgrade::Automatic-Reboot-Time "02:30";
    Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
    Unattended-Upgrade::Remove-Unused-Dependencies "true";
%{ endif ~}
%{ if enable_fail2ban ~}
# sshd jail: 5 failures within 10 minutes trigger a 1 hour ban; the jail
# uses the systemd backend rather than a log file.
- path: /etc/fail2ban/jail.d/catalyst-sshd.local
  permissions: '0644'
  content: |
    [sshd]
    enabled = true
    port = ssh
    filter = sshd
    maxretry = 5
    findtime = 10m
    bantime = 1h
    backend = systemd
%{ endif ~}
runcmd:
# Turn swap off now and delete every fstab line mentioning swap so it stays
# off after a reboot.
- swapoff -a
- sed -i '/swap/d' /etc/fstab
# Switch iptables/ip6tables to the legacy backends. "|| true" keeps the
# bootstrap going when the alternative cannot be set.
- update-alternatives --set iptables /usr/sbin/iptables-legacy || true
- update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy || true
# Reload the SSH daemon so the hardening drop-in written by write_files
# takes effect. Both unit names are tried and a failed reload is tolerated.
- systemctl reload ssh || systemctl reload sshd || true
# Optional services: enabled and started only when the matching template
# flag is set.
%{ if enable_fail2ban ~}
- systemctl enable --now fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
- systemctl enable --now unattended-upgrades
%{ endif ~}
# ── Private NIC bring-up BEFORE k3s agent join (prov #71 root cause) ────
#
# Same Hetzner private-NIC hot-attach race as the control-plane template;
# cloudinit-control-plane.tftpl carries the full rationale. If the agent
# dials K3S_URL before the private IP of the CP is routable, the join
# silently fails, the worker never becomes Ready, and the autoscaler times
# out the scale-up.
#
# The script polls up to 60 times, 2 s apart, for a route to the CP. While
# waiting, it looks for a NIC other than lo, eth0 and container/overlay
# devices; if that NIC has no netplan entry yet, it writes a DHCP stanza
# (ignoring DHCP-provided DNS and routes) and applies it. If no route is
# found at the end, it dumps the addresses and exits 1 — presumably also
# skipping the join below, since cloud-init runs runcmd entries as one
# script; confirm.
# NOTE(review): the readiness test accepts any route whose device name
# starts with eth/en, so a default route over the primary NIC would also
# satisfy it — confirm that cannot happen on these hosts.
# The heredoc body and its terminator must stay at the script's left margin.
- |
  PRIMARY_NIC=$(ip -br link | awk '$1 == "eth0" {print $1; exit}')
  echo "[private-nic] waiting for private NIC to reach the CP ${cp_private_ip}..."
  for i in $(seq 1 60); do
    if ip -4 route get ${cp_private_ip} 2>/dev/null | grep -qE "dev (eth|en|ens|enp)"; then
      SRC=$(ip -4 route get ${cp_private_ip} | awk '/src/ {for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}')
      echo "[private-nic] route to ${cp_private_ip} is up via src $${SRC} (iter $${i})"
      break
    fi
    EXTRA=$(ip -br link | awk -v primary="$${PRIMARY_NIC}" '$1 != "lo" && $1 != primary && $1 !~ /^(docker|veth|tun|wg|cni|flannel|cilium)/ {print $1; exit}')
    if [ -n "$${EXTRA}" ] && ! grep -q "$${EXTRA}:" /etc/netplan/*.yaml 2>/dev/null; then
      echo "[private-nic] generating netplan stanza for $${EXTRA}"
      cat >/etc/netplan/60-private-network.yaml <<NETPLAN
  network:
    version: 2
    ethernets:
      $${EXTRA}:
        dhcp4: true
        dhcp4-overrides:
          use-dns: false
          use-routes: false
  NETPLAN
      chmod 0600 /etc/netplan/60-private-network.yaml
      netplan apply || true
    fi
    sleep 2
  done
  if ! ip -4 route get ${cp_private_ip} 2>/dev/null | grep -qE "dev (eth|en|ens|enp)"; then
    echo "[private-nic] FATAL: no route to CP ${cp_private_ip} after 120s; k3s agent join will fail" >&2
    ip -br addr >&2
    exit 1
  fi
# Join the control plane via private network IP (10.0.1.2 — the first
# control-plane node in the network subnet). k3s_version is pinned so all
# workers in this Sovereign land on the same Kubernetes minor as the CP.
# --kubelet-arg=max-pods=220 doubles the kubelet default of 110. Without
# it the qa-fixtures + full bootstrap-kit pod set saturates one node
# (catalyst-api, helm-controller, source-controller, plus the
# 45-HR install chain → Helm hooks → bp-* runtime pods). Caught on
# prov #63 (cpx52 × 3): CP at 110/110 pods, bp-catalyst-platform's
# catalyst-api pod stuck "Too many pods" → install hook timed out.
# The node label marks the node as a Catalyst worker.
# NOTE(review): the join token is interpolated unquoted into the shell
# command — confirm it can never contain shell metacharacters or quotes.
- 'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${k3s_version} K3S_URL=https://${cp_private_ip}:6443 K3S_TOKEN=${k3s_token} INSTALL_K3S_EXEC="agent --kubelet-arg=max-pods=220 --node-label catalyst.openova.io/role=worker" sh -'
# Completion marker for the provisioner.
# NOTE(review): the marker is created without checking the installer's
# exit status — confirm the provisioner verifies node readiness separately.
- mkdir -p /var/lib/catalyst
- touch /var/lib/catalyst/cloud-init-complete
# Logged by cloud-init when the run finishes; $UPTIME is presumably
# substituted by cloud-init, not by the template — confirm.
final_message: "Catalyst worker bootstrap complete after $UPTIME seconds"