Compare commits
2 Commits
3d20ee35bf
...
9539c8ddee
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9539c8ddee | ||
|
|
571dfc4275 |
@ -71,7 +71,13 @@ spec:
|
||||
# status.history[0].status=deployed` (apiserver-flap on slow
|
||||
# secondary CPs). Direct status-subresource patch with audit
|
||||
# annotation, RBAC extended for helmreleases/status patch verb.
|
||||
version: 1.2.3
|
||||
# 1.2.4 (TBD-A66-followup, #1995): observability fix — the
|
||||
# status-subresource patch in 1.2.3 swallowed stderr via `2>&1`
|
||||
# so silent failures looked identical to silent successes.
|
||||
# 1.2.4 captures stderr to a temp file and emits structured
|
||||
# `[A66]` log lines (detection / success / failure-with-stderr).
|
||||
# RBAC was already correct in 1.2.3.
|
||||
version: 1.2.4
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: bp-flux
|
||||
|
||||
@ -5,7 +5,7 @@ metadata:
|
||||
labels:
|
||||
catalyst.openova.io/section: pts-3-2-gitops-and-iac
|
||||
spec:
|
||||
version: 1.2.3
|
||||
version: 1.2.4
|
||||
card:
|
||||
title: flux
|
||||
summary: GitOps reconciler. One per vcluster (source + kustomize + helm controllers); host-level Flux per host cluster watches its bp-* OCI sources.
|
||||
|
||||
@ -24,7 +24,24 @@ name: bp-flux
|
||||
# extended with `helmreleases/status` patch+update verbs (status
|
||||
# subresource is a separate RBAC target). Idempotent: annotation
|
||||
# guards re-runs.
|
||||
version: 1.2.3
|
||||
#
|
||||
# 1.2.4 (TBD-A66-followup, issue #1995, 2026-05-19): observability fix
|
||||
# on the TBD-A66 branch. The 1.2.3 implementation redirected the
|
||||
# critical `kubectl patch --subresource=status` stderr to /dev/null
|
||||
# (`2>&1` to a discarded pipe), so silent failures looked identical to
|
||||
# silent successes when the CronJob output was inspected. Agent
|
||||
# ae9d7638 on t38 spent ~10 min discovering this the hard way (the
|
||||
# patch was failing but the human-readable "RECOVER ... — patching" log
|
||||
# line was indistinguishable from a real recovery). 1.2.4 captures
|
||||
# stderr to a temp file and emits structured `[A66]` log lines:
|
||||
# - detection: `[A66] HR <ns>/<name> Ready=Unknown for <age>s, history[0]=deployed → attempting patch`
|
||||
# - success: `[A66] HR <ns>/<name> patched to Ready=True`
|
||||
# - failure: `[A66] HR <ns>/<name> patch FAILED: <stderr captured>`
|
||||
# RBAC was already correct in 1.2.3 (helmreleases/status patch+update
|
||||
# verbs grant exists); the silent failure was diagnostic-blind, not
|
||||
# RBAC-blind. The Chart.yaml bump here exists so the bootstrap-kit
|
||||
# pin auto-sync hook propagates the new image into fresh provs.
|
||||
version: 1.2.4
|
||||
description: |
|
||||
Catalyst-curated Blueprint umbrella chart for Flux. Depends on the
|
||||
upstream `flux2` chart (fluxcd-community) as a Helm subchart so
|
||||
|
||||
@ -263,7 +263,14 @@ data:
|
||||
echo "GUARDRAIL: more than 10 stuck HRs detected — refusing to mass-recover. Investigate cluster-wide outage."
|
||||
exit 1
|
||||
fi
|
||||
echo "RECOVER $NS/$NAME (Ready=Unknown for ${AGE}s, history[0].status=deployed) — patching status.conditions.Ready=True (TBD-A66 branch)"
|
||||
# TBD-A66-followup (issue #1995): structured [A66] log lines so
|
||||
# operators / agents can grep CronJob output to confirm the
|
||||
# branch fired and whether the patch landed. Previously the
|
||||
# whole branch was a single human-readable line and the patch
|
||||
# outcome swallowed stderr (`2>&1`), so silent failures looked
|
||||
# identical to silent successes — t38 / agent ae9d7638 lost
|
||||
# ~10min diagnosing exactly this.
|
||||
echo "[A66] HR $NS/$NAME Ready=Unknown for ${AGE}s, history[0]=deployed → attempting patch"
|
||||
# Step 1: stamp audit annotation on the object's metadata so
|
||||
# idempotency works regardless of whether the status patch
|
||||
# below races against a helm-controller reconcile.
|
||||
@ -274,15 +281,32 @@ data:
|
||||
# rest of status from the HR spec + history without loss.
|
||||
# `--subresource=status` requires kubectl ≥ v1.24 (we ship
|
||||
# 1.31.4 in this CronJob image, so safe).
|
||||
#
|
||||
# Stderr capture (TBD-A66-followup, issue #1995): keep stderr
|
||||
# separate from stdout — we LOG it on failure rather than
|
||||
# discard it. Stdout is redirected to /dev/null because a
|
||||
# successful kubectl patch echoes "helmrelease.helm.toolkit.fluxcd.io/<name> patched"
|
||||
# which would interleave with our structured log lines. Stderr
|
||||
# is captured to a temp file (mktemp under /tmp, the writable
|
||||
# emptyDir mount) so multi-line apiserver errors survive intact.
|
||||
PATCH_BODY='{"status":{"conditions":[{"type":"Ready","status":"True","reason":"ReconciliationSucceeded","message":"auto-corrected from deployed-but-unknown-Ready by stuck-hr-recovery (TBD-A66)","lastTransitionTime":"'"$NOW_ISO"'"}]}}'
|
||||
if kubectl patch hr -n "$NS" "$NAME" --subresource=status --type=merge -p "$PATCH_BODY" >/dev/null 2>&1; then
|
||||
echo "RECOVERED $NS/$NAME (TBD-A66 branch)"
|
||||
PATCH_ERR_FILE="$(mktemp /tmp/a66-patch-err.XXXXXX)"
|
||||
if kubectl patch hr -n "$NS" "$NAME" --subresource=status --type=merge -p "$PATCH_BODY" >/dev/null 2>"$PATCH_ERR_FILE"; then
|
||||
echo "[A66] HR $NS/$NAME patched to Ready=True"
|
||||
else
|
||||
echo "WARN $NS/$NAME: status-subresource patch failed; will retry next run"
|
||||
PATCH_ERR="$(tr '\n' ' ' <"$PATCH_ERR_FILE")"
|
||||
echo "[A66] HR $NS/$NAME patch FAILED: ${PATCH_ERR:-<no stderr captured>}"
|
||||
# Roll back the annotation so the next run isn't blocked by
|
||||
# idempotency guard on a half-applied recovery.
|
||||
kubectl annotate hr -n "$NS" "$NAME" "stuck-hr-recovery.openova.io/auto-corrected-at-" >/dev/null 2>&1 || true
|
||||
# idempotency guard on a half-applied recovery. Capture stderr
|
||||
# here too — if annotation rollback fails the operator needs
|
||||
# to know (a stuck idempotency annotation would mask repeat
|
||||
# patch attempts in the next CronJob run).
|
||||
if ! kubectl annotate hr -n "$NS" "$NAME" "stuck-hr-recovery.openova.io/auto-corrected-at-" >/dev/null 2>>"$PATCH_ERR_FILE"; then
|
||||
ROLLBACK_ERR="$(tail -1 "$PATCH_ERR_FILE" 2>/dev/null || true)"
|
||||
echo "[A66] HR $NS/$NAME annotation rollback also failed: ${ROLLBACK_ERR:-<no stderr captured>}"
|
||||
fi
|
||||
fi
|
||||
rm -f "$PATCH_ERR_FILE"
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
@ -194,4 +194,52 @@ if ! grep -q 'auto-corrected from deployed-but-unknown-Ready' "$TMP/render.yaml"
|
||||
fi
|
||||
echo " PASS: Ready=True message carries TBD-A66 audit string"
|
||||
|
||||
echo "[leader-election-925] All issue #925 + TBD-A66 mitigation gates green."
|
||||
# ── Case 8 — TBD-A66-followup (#1995) observability: structured [A66] logs ──
|
||||
# The 1.2.3 implementation swallowed `kubectl patch --subresource=status`
|
||||
# stderr via `2>&1` to /dev/null, so silent failures looked identical to
|
||||
# silent successes. 1.2.4 captures stderr to a temp file and emits three
|
||||
# structured log lines that operators / agents can grep:
|
||||
# detection: `[A66] HR <ns>/<name> Ready=Unknown for <age>s, history[0]=deployed → attempting patch`
|
||||
# success: `[A66] HR <ns>/<name> patched to Ready=True`
|
||||
# failure: `[A66] HR <ns>/<name> patch FAILED: <stderr captured>`
|
||||
# Asserting on the static literal fragments here is enough — the ${var}
|
||||
# expansions are runtime-substituted by the script at execution time.
|
||||
echo "[leader-election-925] Case 8: TBD-A66-followup #1995 structured [A66] log lines + stderr capture"
|
||||
# 8a — detection log line is emitted before the patch.
|
||||
if ! grep -q '\[A66\] HR .* Ready=Unknown for .* history\[0\]=deployed → attempting patch' "$TMP/render.yaml"; then
|
||||
echo "FAIL: detection log line '[A66] HR ... Ready=Unknown ... history[0]=deployed → attempting patch' missing (followup #1995 regressed)." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo " PASS: detection log line present"
|
||||
# 8b — success log line follows a successful patch.
|
||||
if ! grep -q '\[A66\] HR .* patched to Ready=True' "$TMP/render.yaml"; then
|
||||
echo "FAIL: success log line '[A66] HR ... patched to Ready=True' missing (followup #1995 regressed)." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo " PASS: success log line present"
|
||||
# 8c — failure log line carries captured stderr (the literal stem
|
||||
# 'patch FAILED:' is the operator/agent grep target; the appended
|
||||
# stderr is runtime-bound).
|
||||
if ! grep -q '\[A66\] HR .* patch FAILED:' "$TMP/render.yaml"; then
|
||||
echo "FAIL: failure log line '[A66] HR ... patch FAILED: <stderr>' missing — stderr is silently swallowed again (followup #1995 regressed)." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo " PASS: failure log line present"
|
||||
# 8d — the `2>&1` stderr-to-stdout-then-discard pattern on the critical
|
||||
# kubectl patch is gone. We allow `2>` (redirect to file) and `>&1` in
|
||||
# unrelated contexts, but the specific `2>&1` IMMEDIATELY following
|
||||
# `kubectl patch ... --subresource=status` would re-introduce the bug.
|
||||
if grep -E 'kubectl patch hr [^|]*--subresource=status[^|]*>/dev/null 2>&1' "$TMP/render.yaml"; then
|
||||
echo "FAIL: kubectl patch --subresource=status still pipes stderr to /dev/null via 2>&1 — followup #1995 regressed." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo " PASS: status-subresource patch no longer discards stderr via 2>&1"
|
||||
# 8e — stderr is captured to a tempfile so multi-line apiserver errors
|
||||
# survive intact (the followup explicitly mktemps under /tmp).
|
||||
if ! grep -q 'mktemp /tmp/a66-patch-err' "$TMP/render.yaml"; then
|
||||
echo "FAIL: stderr capture tempfile 'mktemp /tmp/a66-patch-err.XXXXXX' missing — failure stderr will not be logged." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo " PASS: stderr is captured via mktemp under /tmp"
|
||||
|
||||
echo "[leader-election-925] All issue #925 + TBD-A66 + TBD-A66-followup #1995 mitigation gates green."
|
||||
|
||||
Loading…
Reference in New Issue
Block a user