fix(bp-flux): bump blueprint.yaml spec.version 1.2.3 → 1.2.4 in lockstep with Chart.yaml

manifest-validation's TestBootstrapKit_BlueprintCardsHaveRequiredFields + TestBootstrapKit_BlueprintVersionLockstepSweep require blueprint.yaml spec.version to track Chart.yaml version exactly (TBD-A20 / #1856). Forgotten in the previous commit. Refs #1995. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
fix(bp-flux-stuck-hr-recovery): grant helmreleases/status patch RBAC + log stderr (Closes #1995 )
2026-05-19 23:53:42 +02:00 · 2026-05-19 23:51:50 +02:00
5 changed files with 105 additions and 10 deletions
--- a/clusters/_template/bootstrap-kit/03-flux.yaml
+++ b/clusters/_template/bootstrap-kit/03-flux.yaml
@ -71,7 +71,13 @@ spec:
      # status.history[0].status=deployed` (apiserver-flap on slow
      # secondary CPs). Direct status-subresource patch with audit
      # annotation, RBAC extended for helmreleases/status patch verb.
-      version: 1.2.3
+      # 1.2.4 (TBD-A66-followup, #1995): observability fix — the
+      # status-subresource patch in 1.2.3 swallowed stderr via `2>&1`
+      # so silent failures looked identical to silent successes.
+      # 1.2.4 captures stderr to a temp file and emits structured
+      # `[A66]` log lines (detection / success / failure-with-stderr).
+      # RBAC was already correct in 1.2.3.
+      version: 1.2.4
      sourceRef:
        kind: HelmRepository
        name: bp-flux
--- a/platform/flux/blueprint.yaml
+++ b/platform/flux/blueprint.yaml
@ -5,7 +5,7 @@ metadata:
  labels:
    catalyst.openova.io/section: pts-3-2-gitops-and-iac
 spec:
-  version: 1.2.3
+  version: 1.2.4
  card:
    title: flux
    summary: GitOps reconciler. One per vcluster (source + kustomize + helm controllers); host-level Flux per host cluster watches its bp-* OCI sources.
--- a/platform/flux/chart/Chart.yaml
+++ b/platform/flux/chart/Chart.yaml
@ -24,7 +24,24 @@ name: bp-flux
 # extended with `helmreleases/status` patch+update verbs (status
 # subresource is a separate RBAC target). Idempotent: annotation
 # guards re-runs.
-version: 1.2.3
+#
+# 1.2.4 (TBD-A66-followup, issue #1995, 2026-05-19): observability fix
+# on the TBD-A66 branch. The 1.2.3 implementation redirected the
+# critical `kubectl patch --subresource=status` stderr to /dev/null
+# (`2>&1` to a discarded pipe), so silent failures looked identical to
+# silent successes when the CronJob output was inspected. Agent
+# ae9d7638 on t38 spent ~10 min discovering this the hard way (the
+# patch was failing but the human-readable "RECOVER ... — patching" log
+# line was indistinguishable from a real recovery). 1.2.4 captures
+# stderr to a temp file and emits structured `[A66]` log lines:
+#   - detection: `[A66] HR <ns>/<name> Ready=Unknown for <age>s, history[0]=deployed → attempting patch`
+#   - success:   `[A66] HR <ns>/<name> patched to Ready=True`
+#   - failure:   `[A66] HR <ns>/<name> patch FAILED: <stderr captured>`
+# RBAC was already correct in 1.2.3 (helmreleases/status patch+update
+# verbs grant exists); the silent failure was diagnostic-blind, not
+# RBAC-blind. The Chart.yaml bump here exists so the bootstrap-kit
+# pin auto-sync hook propagates the new image into fresh provs.
+version: 1.2.4
 description: |
  Catalyst-curated Blueprint umbrella chart for Flux. Depends on the
  upstream `flux2` chart (fluxcd-community) as a Helm subchart so
--- a/platform/flux/chart/templates/helm-release-stuck-recovery.yaml
+++ b/platform/flux/chart/templates/helm-release-stuck-recovery.yaml
@ -263,7 +263,14 @@ data:
          echo "GUARDRAIL: more than 10 stuck HRs detected — refusing to mass-recover. Investigate cluster-wide outage."
          exit 1
        fi
-        echo "RECOVER $NS/$NAME (Ready=Unknown for ${AGE}s, history[0].status=deployed) — patching status.conditions.Ready=True (TBD-A66 branch)"
+        # TBD-A66-followup (issue #1995): structured [A66] log lines so
+        # operators / agents can grep CronJob output to confirm the
+        # branch fired and whether the patch landed. Previously the
+        # whole branch was a single human-readable line and the patch
+        # outcome swallowed stderr (`2>&1`), so silent failures looked
+        # identical to silent successes — t38 / agent ae9d7638 lost
+        # ~10min diagnosing exactly this.
+        echo "[A66] HR $NS/$NAME Ready=Unknown for ${AGE}s, history[0]=deployed → attempting patch"
        # Step 1: stamp audit annotation on the object's metadata so
        # idempotency works regardless of whether the status patch
        # below races against a helm-controller reconcile.
@ -274,15 +281,32 @@ data:
        # rest of status from the HR spec + history without loss.
        # `--subresource=status` requires kubectl ≥ v1.24 (we ship
        # 1.31.4 in this CronJob image, so safe).
+        #
+        # Stderr capture (TBD-A66-followup, issue #1995): keep stderr
+        # separate from stdout — we LOG it on failure rather than
+        # discard it. Stdout is redirected to /dev/null because a
+        # successful kubectl patch echoes "helmrelease.helm.toolkit.fluxcd.io/<name> patched"
+        # which would interleave with our structured log lines. Stderr
+        # is captured to a temp file (mktemp under /tmp, the writable
+        # emptyDir mount) so multi-line apiserver errors survive intact.
        PATCH_BODY='{"status":{"conditions":[{"type":"Ready","status":"True","reason":"ReconciliationSucceeded","message":"auto-corrected from deployed-but-unknown-Ready by stuck-hr-recovery (TBD-A66)","lastTransitionTime":"'"$NOW_ISO"'"}]}}'
-        if kubectl patch hr -n "$NS" "$NAME" --subresource=status --type=merge -p "$PATCH_BODY" >/dev/null 2>&1; then
-          echo "RECOVERED $NS/$NAME (TBD-A66 branch)"
+        PATCH_ERR_FILE="$(mktemp /tmp/a66-patch-err.XXXXXX)"
+        if kubectl patch hr -n "$NS" "$NAME" --subresource=status --type=merge -p "$PATCH_BODY" >/dev/null 2>"$PATCH_ERR_FILE"; then
+          echo "[A66] HR $NS/$NAME patched to Ready=True"
        else
-          echo "WARN $NS/$NAME: status-subresource patch failed; will retry next run"
+          PATCH_ERR="$(tr '\n' ' ' <"$PATCH_ERR_FILE")"
+          echo "[A66] HR $NS/$NAME patch FAILED: ${PATCH_ERR:-<no stderr captured>}"
          # Roll back the annotation so the next run isn't blocked by
-          # idempotency guard on a half-applied recovery.
-          kubectl annotate hr -n "$NS" "$NAME" "stuck-hr-recovery.openova.io/auto-corrected-at-" >/dev/null 2>&1 || true
+          # idempotency guard on a half-applied recovery. Capture stderr
+          # here too — if annotation rollback fails the operator needs
+          # to know (a stuck idempotency annotation would mask repeat
+          # patch attempts in the next CronJob run).
+          if ! kubectl annotate hr -n "$NS" "$NAME" "stuck-hr-recovery.openova.io/auto-corrected-at-" >/dev/null 2>>"$PATCH_ERR_FILE"; then
+            ROLLBACK_ERR="$(tail -1 "$PATCH_ERR_FILE" 2>/dev/null || true)"
+            echo "[A66] HR $NS/$NAME annotation rollback also failed: ${ROLLBACK_ERR:-<no stderr captured>}"
+          fi
        fi
+        rm -f "$PATCH_ERR_FILE"
        continue
      fi

--- a/platform/flux/chart/tests/leader-election-and-recovery.sh
+++ b/platform/flux/chart/tests/leader-election-and-recovery.sh
@ -194,4 +194,52 @@ if ! grep -q 'auto-corrected from deployed-but-unknown-Ready' "$TMP/render.yaml"
 fi
 echo "  PASS: Ready=True message carries TBD-A66 audit string"

-echo "[leader-election-925] All issue #925 + TBD-A66 mitigation gates green."
+# ── Case 8 — TBD-A66-followup (#1995) observability: structured [A66] logs ──
+# The 1.2.3 implementation swallowed `kubectl patch --subresource=status`
+# stderr via `2>&1` to /dev/null, so silent failures looked identical to
+# silent successes. 1.2.4 captures stderr to a temp file and emits three
+# structured log lines that operators / agents can grep:
+#   detection: `[A66] HR <ns>/<name> Ready=Unknown for <age>s, history[0]=deployed → attempting patch`
+#   success:   `[A66] HR <ns>/<name> patched to Ready=True`
+#   failure:   `[A66] HR <ns>/<name> patch FAILED: <stderr captured>`
+# Asserting on the static literal fragments here is enough — the ${var}
+# expansions are runtime-substituted by the script at execution time.
+echo "[leader-election-925] Case 8: TBD-A66-followup #1995 structured [A66] log lines + stderr capture"
+# 8a — detection log line is emitted before the patch.
+if ! grep -q '\[A66\] HR .* Ready=Unknown for .* history\[0\]=deployed → attempting patch' "$TMP/render.yaml"; then
+  echo "FAIL: detection log line '[A66] HR ... Ready=Unknown ... history[0]=deployed → attempting patch' missing (followup #1995 regressed)." >&2
+  exit 1
+fi
+echo "  PASS: detection log line present"
+# 8b — success log line follows a successful patch.
+if ! grep -q '\[A66\] HR .* patched to Ready=True' "$TMP/render.yaml"; then
+  echo "FAIL: success log line '[A66] HR ... patched to Ready=True' missing (followup #1995 regressed)." >&2
+  exit 1
+fi
+echo "  PASS: success log line present"
+# 8c — failure log line carries captured stderr (the literal stem
+# 'patch FAILED:' is the operator/agent grep target; the appended
+# stderr is runtime-bound).
+if ! grep -q '\[A66\] HR .* patch FAILED:' "$TMP/render.yaml"; then
+  echo "FAIL: failure log line '[A66] HR ... patch FAILED: <stderr>' missing — stderr is silently swallowed again (followup #1995 regressed)." >&2
+  exit 1
+fi
+echo "  PASS: failure log line present"
+# 8d — the `2>&1` stderr-to-stdout-then-discard pattern on the critical
+# kubectl patch is gone. We allow `2>` (redirect to file) and `>&1` in
+# unrelated contexts, but the specific `2>&1` IMMEDIATELY following
+# `kubectl patch ... --subresource=status` would re-introduce the bug.
+if grep -E 'kubectl patch hr [^|]*--subresource=status[^|]*>/dev/null 2>&1' "$TMP/render.yaml"; then
+  echo "FAIL: kubectl patch --subresource=status still pipes stderr to /dev/null via 2>&1 — followup #1995 regressed." >&2
+  exit 1
+fi
+echo "  PASS: status-subresource patch no longer discards stderr via 2>&1"
+# 8e — stderr is captured to a tempfile so multi-line apiserver errors
+# survive intact (the followup explicitly mktemps under /tmp).
+if ! grep -q 'mktemp /tmp/a66-patch-err' "$TMP/render.yaml"; then
+  echo "FAIL: stderr capture tempfile 'mktemp /tmp/a66-patch-err.XXXXXX' missing — failure stderr will not be logged." >&2
+  exit 1
+fi
+echo "  PASS: stderr is captured via mktemp under /tmp"
+
+echo "[leader-election-925] All issue #925 + TBD-A66 + TBD-A66-followup #1995 mitigation gates green."