From 4edc829b3f6afc5026e4c1a628b24464b2b2568c Mon Sep 17 00:00:00 2001
From: "privilegedescalation-engineer[bot]"
 <privilegedescalation-engineer[bot]@users.noreply.github.com>
Date: Tue, 24 Mar 2026 21:57:58 +0000
Subject: [PATCH] ci(e2e): add deployment diagnostics step on failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the E2E deploy step fails (rollout timeout, pod not ready, etc.),
diagnosing the root cause previously required manual cluster
investigation. In this heartbeat, it took grepping the CI logs and
querying kubectl separately to identify a :latest image drift issue.

The new step captures pod state, pod describe output, and recent namespace
events immediately when a failure occurs — surfacing the root cause
directly in the CI run log.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 .github/workflows/e2e.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 06ccf99..37f33a9 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -72,6 +72,22 @@ jobs:
           HEADLAMP_URL: ${{ env.HEADLAMP_URL }}
           HEADLAMP_TOKEN: ${{ env.HEADLAMP_TOKEN }}
 
+      # Best-effort diagnostics, run only after an earlier step has failed and
+      # ahead of the teardown step: prints the release's pods and recent events.
+      - name: Collect deployment diagnostics on failure
+        if: failure()
+        run: |
+          # Each command ends in "|| true" so a failing kubectl call neither
+          # stops the remaining commands nor adds a failure of its own.
+          echo "=== Pod state ==="
+          kubectl get pods -n "$E2E_NAMESPACE" -l "app.kubernetes.io/instance=$E2E_RELEASE" 2>&1 || true
+          echo "=== Pod describe ==="
+          kubectl describe pods -n "$E2E_NAMESPACE" -l "app.kubernetes.io/instance=$E2E_RELEASE" 2>&1 || true
+          echo "=== Recent namespace events ==="
+          # Sort on metadata.creationTimestamp, which every event carries;
+          # lastTimestamp may be unset, which would let "tail -20" keep the wrong rows.
+          kubectl get events -n "$E2E_NAMESPACE" --sort-by='.metadata.creationTimestamp' 2>&1 | tail -20 || true
+
       - name: Teardown E2E instance
         if: always()
         run: scripts/teardown-e2e-headlamp.sh