From 4edc829b3f6afc5026e4c1a628b24464b2b2568c Mon Sep 17 00:00:00 2001 From: "privilegedescalation-engineer[bot]" Date: Tue, 24 Mar 2026 21:57:58 +0000 Subject: [PATCH] ci(e2e): add deployment diagnostics step on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the E2E deploy step fails (rollout timeout, pod not ready, etc.), diagnosing the root cause previously required manual cluster investigation. This heartbeat had to grep CI logs and query kubectl separately to identify a :latest image drift issue. The new step captures pod state, pod describe output, and recent namespace events immediately when a failure occurs — surfacing the root cause directly in the CI run log. Co-Authored-By: Paperclip --- .github/workflows/e2e.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 06ccf99..37f33a9 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -72,6 +72,16 @@ jobs: HEADLAMP_URL: ${{ env.HEADLAMP_URL }} HEADLAMP_TOKEN: ${{ env.HEADLAMP_TOKEN }} + - name: Collect deployment diagnostics on failure + if: failure() + run: | + echo "=== Pod state ===" + kubectl get pods -n "$E2E_NAMESPACE" -l "app.kubernetes.io/instance=$E2E_RELEASE" 2>&1 || true + echo "=== Pod describe ===" + kubectl describe pods -n "$E2E_NAMESPACE" -l "app.kubernetes.io/instance=$E2E_RELEASE" 2>&1 || true + echo "=== Recent namespace events ===" + kubectl get events -n "$E2E_NAMESPACE" --sort-by='.lastTimestamp' 2>&1 | tail -20 || true + - name: Teardown E2E instance if: always() run: scripts/teardown-e2e-headlamp.sh