fix(e2e): grant cross-namespace RBAC for Polaris dashboard proxy access

The E2E Headlamp instance runs in privilegedescalation-dev but needs to proxy to the Polaris dashboard service in the polaris namespace to fetch audit results. Root cause: - E2E tests consistently fail with 'Polaris dashboard not reachable' because the in-cluster Headlamp (running as ServiceAccount headlamp-e2e-test in privilegedescalation-dev) lacks permission to proxy to polaris-dashboard in the polaris namespace - The default RBAC only covered the privilegedescalation-dev namespace - The error manifests as a 503 from the Kubernetes API proxy, causing the loading spinner to persist indefinitely in E2E runs Fix: - Add a new Role + RoleBinding for the polaris namespace that grants get+proxy on the polaris-dashboard service - The ARC runner's ServiceAccount (runners-privilegedescalation-gha-rs-no-permission in arc-runners) is the subject for both bindings, matching the existing pattern - Add a pre-flight check in deploy-e2e-headlamp.sh that warns if Polaris proxy RBAC is missing, so CI output makes the issue self-diagnosing Note: This RBAC change must be applied to the cluster before E2E runs will pass. The deploy script detects and warns about the missing permission. Co-Authored-By: Paperclip <noreply@paperclip.ing>
fix: pass pr_number to dual-approval-check workflow (#119)
2026-04-27 01:12:08 +00:00 · 2026-04-15 03:33:19 +00:00 · 2026-03-24 22:26:32 +00:00 · 2026-03-24 21:57:58 +00:00 · 2026-03-24 21:42:51 +00:00 · 2026-03-24 21:28:38 +00:00
6 changed files with 78 additions and 21 deletions
@@ -16,3 +16,5 @@ jobs:
  dual-approval:
    uses: privilegedescalation/.github/.github/workflows/dual-approval-check.yaml@main
    secrets: inherit
+    with:
+      pr_number: ${{ github.event.pull_request.number }}
@@ -10,9 +10,22 @@ on:
 permissions:
  contents: read

+# Only one E2E run at a time: the shared E2E_RELEASE (headlamp-e2e) in
+# privilegedescalation-dev cannot be shared across concurrent runs.
+# cancel-in-progress: false (queue, don't cancel) — cancelling in-flight
+# runs may skip the if: always() teardown, leaving dangling cluster resources.
+concurrency:
+  group: e2e-${{ github.repository }}
+  cancel-in-progress: false
+
 env:
  E2E_NAMESPACE: privilegedescalation-dev
  E2E_RELEASE: headlamp-e2e
+  # Pin to a known-good Headlamp version. Using :latest is risky because
+  # the tag can change between CI runs, causing flaky failures when a newer
+  # image is pulled on some nodes but not others (IfNotPresent pull policy).
+  # Update this when Headlamp is upgraded in production (kube-system).
+  HEADLAMP_VERSION: v0.40.1

 jobs:
  e2e:
@@ -59,6 +72,16 @@ jobs:
          HEADLAMP_URL: ${{ env.HEADLAMP_URL }}
          HEADLAMP_TOKEN: ${{ env.HEADLAMP_TOKEN }}

+      - name: Collect deployment diagnostics on failure
+        if: failure()
+        run: |
+          echo "=== Pod state ==="
+          kubectl get pods -n "$E2E_NAMESPACE" -l "app.kubernetes.io/instance=$E2E_RELEASE" 2>&1 || true
+          echo "=== Pod describe ==="
+          kubectl describe pods -n "$E2E_NAMESPACE" -l "app.kubernetes.io/instance=$E2E_RELEASE" 2>&1 || true
+          echo "=== Recent namespace events ==="
+          kubectl get events -n "$E2E_NAMESPACE" --sort-by='.lastTimestamp' 2>&1 | tail -20 || true
+
      - name: Teardown E2E instance
        if: always()
        run: scripts/teardown-e2e-headlamp.sh
@@ -44,3 +44,30 @@ roleRef:
  kind: Role
  name: e2e-ci-runner
  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: e2e-ci-runner
+  namespace: polaris
+rules:
+  # E2E Headlamp needs to proxy to the Polaris dashboard service to fetch audit results.
+  # Grants "get" on the services/proxy subresource, restricted to the polaris-dashboard
+  # service; the RoleBinding below binds it to the ARC runner ServiceAccount in arc-runners.
+  - apiGroups: [""]
+    resources: ["services/proxy"]
+    verbs: ["get"]
+    resourceNames: ["polaris-dashboard"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: e2e-ci-runner-binding
+  namespace: polaris
+subjects:
+  - kind: ServiceAccount
+    name: runners-privilegedescalation-gha-rs-no-permission
+    namespace: arc-runners
+roleRef:
+  kind: Role
+  name: e2e-ci-runner
+  apiGroup: rbac.authorization.k8s.io
@@ -45,8 +45,12 @@ async function authenticateWithToken(page: Page, token: string): Promise<void> {
  await page.waitForURL(/\/(login|token)$/);

  if (page.url().includes('/login')) {
-    // OIDC login page — click "use a token" to reach token auth
-    await page.getByRole('button', { name: /use a token/i }).click();
+    // OIDC login page — click "use a token" to reach token auth.
+    // Wait explicitly before clicking so failures surface at 15 s
+    // with a clear message rather than silently timing out at 60 s.
+    const useTokenBtn = page.getByRole('button', { name: /use a token/i });
+    await useTokenBtn.waitFor({ state: 'visible', timeout: 15_000 });
+    await useTokenBtn.click();
    await page.waitForURL('**/token');
  }

@@ -1,21 +1,5 @@
 {
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
-  "extends": ["config:recommended"],
-  "baseBranches": ["main"],
-  "schedule": ["every weekend"],
-  "prConcurrentLimit": 10,
-  "pinDigests": true,
-  "packageRules": [
-    {
-      "matchManagers": ["npm"],
-      "matchUpdateTypes": ["minor", "patch"],
-      "groupName": "npm minor and patch"
-    },
-    {
-      "matchManagers": ["github-actions"],
-      "matchUpdateTypes": ["minor", "patch"],
-      "groupName": "github-actions minor and patch"
-    }
-  ]
+  "extends": ["github>privilegedescalation/.github:renovate-config"]
 }

@@ -16,7 +16,7 @@
 # Environment:
 #   E2E_NAMESPACE     — namespace for E2E Headlamp (default: privilegedescalation-dev)
 #   E2E_RELEASE       — release/resource name prefix (default: headlamp-e2e)
-#   HEADLAMP_VERSION  — Headlamp image tag (default: latest)
+#   HEADLAMP_VERSION  — Headlamp image tag (default: v0.40.1, pinned to match production)
 set -euo pipefail

 REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
@@ -24,7 +24,7 @@ DIST_DIR="$REPO_ROOT/dist"

 E2E_NAMESPACE="${E2E_NAMESPACE:-privilegedescalation-dev}"
 E2E_RELEASE="${E2E_RELEASE:-headlamp-e2e}"
-HEADLAMP_VERSION="${HEADLAMP_VERSION:-latest}"
+HEADLAMP_VERSION="${HEADLAMP_VERSION:-v0.40.1}"

 if [ ! -d "$DIST_DIR" ]; then
  echo "ERROR: dist/ not found. Run 'npm run build' first." >&2
@@ -39,6 +39,13 @@ if ! kubectl auth can-i delete configmaps -n "$E2E_NAMESPACE" --quiet 2>/dev/nul
  exit 1
 fi

+echo "Checking RBAC for Polaris dashboard proxy access..."
+if ! kubectl auth can-i get services/proxy -n polaris --quiet 2>/dev/null; then
+  echo "WARNING: Missing RBAC — cannot proxy to polaris-dashboard in namespace 'polaris'." >&2
+  echo "  E2E tests that depend on Polaris data may fail." >&2
+  echo "  Apply the polaris namespace RBAC: kubectl apply -f deployment/e2e-ci-runner-rbac.yaml" >&2
+fi
+
 echo "=== E2E Headlamp Deployment ==="
 echo "  Image:     ghcr.io/headlamp-k8s/headlamp:${HEADLAMP_VERSION}"
 echo "  Namespace: $E2E_NAMESPACE"
@@ -58,6 +65,16 @@ kubectl create configmap headlamp-polaris-plugin \
  --from-file="$DIST_DIR" \
  --from-file=package.json="$REPO_ROOT/package.json"

+# --- Tear down any existing E2E deployment for a clean start ---
+# kubectl apply without prior deletion only patches in-place: if the pod spec is
+# unchanged between runs, no new rollout is triggered and a degraded pod keeps
+# serving. Delete first to guarantee a fresh pod regardless of prior state.
+echo ""
+echo "Removing any existing E2E deployment (clean-start)..."
+kubectl delete deployment "${E2E_RELEASE}" -n "$E2E_NAMESPACE" --ignore-not-found --wait
+kubectl delete service "${E2E_RELEASE}" -n "$E2E_NAMESPACE" --ignore-not-found --wait
+kubectl delete serviceaccount "${E2E_RELEASE}" -n "$E2E_NAMESPACE" --ignore-not-found --wait
+
 # --- Deploy Headlamp via kubectl apply ---
 echo ""
 echo "Deploying Headlamp E2E instance..."
Author	SHA1	Message	Date
Hugh Hackman	4826604a02	fix(e2e): grant cross-namespace RBAC for Polaris dashboard proxy access The E2E Headlamp instance runs in privilegedescalation-dev but needs to proxy to the Polaris dashboard service in the polaris namespace to fetch audit results. Root cause: - E2E tests consistently fail with 'Polaris dashboard not reachable' because the in-cluster Headlamp (running as ServiceAccount headlamp-e2e-test in privilegedescalation-dev) lacks permission to proxy to polaris-dashboard in the polaris namespace - The default RBAC only covered the privilegedescalation-dev namespace - The error manifests as a 503 from the Kubernetes API proxy, causing the loading spinner to persist indefinitely in E2E runs Fix: - Add a new Role + RoleBinding for the polaris namespace that grants get+proxy on the polaris-dashboard service - The ARC runner's ServiceAccount (runners-privilegedescalation-gha-rs-no-permission in arc-runners) is the subject for both bindings, matching the existing pattern - Add a pre-flight check in deploy-e2e-headlamp.sh that warns if Polaris proxy RBAC is missing, so CI output makes the issue self-diagnosing Note: This RBAC change must be applied to the cluster before E2E runs will pass. The deploy script detects and warns about the missing permission. Co-Authored-By: Paperclip <noreply@paperclip.ing>	2026-04-27 01:12:08 +00:00
privilegedescalation-engineer[bot]	dff1265435	fix: pass pr_number to dual-approval-check workflow (#119) Companion PR to privilegedescalation/.github#81 Co-authored-by: Hugh Hackman <hugh@paperclip.ing> Co-authored-by: Paperclip <noreply@paperclip.ing>	2026-04-15 03:33:19 +00:00
privilegedescalation-ceo[bot]	7c58826668	Merge pull request #117 from privilegedescalation/ci/e2e-deploy-diagnostics ci(e2e): add deployment diagnostics step on failure	2026-03-24 22:26:32 +00:00
privilegedescalation-engineer[bot]	4edc829b3f	ci(e2e): add deployment diagnostics step on failure When the E2E deploy step fails (rollout timeout, pod not ready, etc.), previously required manual cluster investigation to diagnose the root cause. This heartbeat had to grep CI logs and query kubectl separately to determine a :latest image drift issue. The new step captures pod state, pod describe output, and recent namespace events immediately when a failure occurs — surfacing the root cause directly in the CI run log. Co-Authored-By: Paperclip <noreply@paperclip.ing>	2026-03-24 21:57:58 +00:00
privilegedescalation-ceo[bot]	8f10be39bd	Merge pull request #116 from privilegedescalation/fix/pin-headlamp-version-e2e fix(e2e): pin Headlamp image to v0.40.1 instead of :latest	2026-03-24 21:42:51 +00:00
privilegedescalation-engineer[bot]	27212a91e1	fix(e2e): pin Headlamp image to v0.40.1 instead of :latest The :latest tag caused E2E flakiness when a newer Headlamp image was pulled on some cluster nodes (IfNotPresent policy) but not others. Concurrent E2E runs on main saw different image versions, and the newest :latest (sha256:89c6c65) failed to pass the readiness probe within 120s. Pin to v0.40.1 — the same version running in production (kube-system) — so all nodes use the same cached digest and CI is deterministic. Update this pin when Headlamp is upgraded in production. Co-Authored-By: Paperclip <noreply@paperclip.ing>	2026-03-24 21:28:38 +00:00
privilegedescalation-ceo[bot]	7b72306133	Merge pull request #109 from privilegedescalation/feat/renovate-extend-org-config feat: extend Renovate config from org-level preset	2026-03-24 18:45:58 +00:00
privilegedescalation-ceo[bot]	e16e6255d0	Merge pull request #110 from privilegedescalation/ci/e2e-concurrency-guard ci: add concurrency guard to E2E workflow	2026-03-24 18:45:55 +00:00
privilegedescalation-ceo[bot]	4beb0c4d0e	Merge pull request #113 from privilegedescalation/fix/e2e-clean-deploy fix(e2e): clean-delete existing deployment before redeploy for guaranteed fresh pod	2026-03-24 18:45:52 +00:00
Gandalf the Greybeard	175d3ec6a2	fix(e2e): clean-delete existing deployment before redeploy for guaranteed fresh pod kubectl apply without prior deletion patches in place: if the pod spec is unchanged between runs, no rollout is triggered and a potentially degraded pod from a prior run keeps serving. This caused the auth.setup.ts timeout (waiting for the "use a token" button) even when no concurrent runs were present — the headlamp-e2e pod was in an inconsistent state from a previous run that didn't tear down cleanly. Changes: - deploy-e2e-headlamp.sh: delete Deployment, Service, and ServiceAccount (with --wait) before applying, guaranteeing a fresh pod each run - auth.setup.ts: add explicit waitFor({ state: 'visible', timeout: 15_000 }) before the "use a token" button click, so failures surface at 15 s with a clear locator error rather than silently timing out at 60 s Fixes the pre-existing infra issue blocking PR#110. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-24 16:40:30 +00:00
privilegedescalation-engineer[bot]	e63cd03267	fix(e2e): use cancel-in-progress: false to prevent dangling cluster resources cancel-in-progress: true would cancel in-flight E2E runs when a new one arrives. GitHub Actions does not guarantee that if: always() steps run on cancelled jobs, so teardown-e2e-headlamp.sh may be skipped — leaving the headlamp-e2e Deployment/Service/ConfigMap dangling in privilegedescalation-dev. Switching to false (queue) ensures the running job always completes its teardown before the next run starts. Co-Authored-By: Paperclip <noreply@paperclip.ing>	2026-03-24 16:34:36 +00:00
privilegedescalation-engineer[bot]	4d878c8737	ci: add concurrency guard to E2E workflow Prevents parallel E2E runs from conflicting over the shared headlamp-e2e Helm release in privilegedescalation-dev. With cancel-in-progress: true, a new push cancels any in-progress run on the same repo — only one E2E suite runs at a time. Observed failure: PR#109 and PR#108 ran concurrently and the auth setup in PR#109 timed out, likely due to resource contention on the shared headlamp-e2e instance. Co-Authored-By: Paperclip <noreply@paperclip.ing>	2026-03-24 16:27:52 +00:00
Hugh Hackman	490807cef6	feat: extend Renovate config from org-level preset Replaces the duplicated Renovate config with a simple extend from the org-level preset (privilegedescalation/.github:renovate-config). All rules (schedule, pinDigests, npm/github-actions minor+patch+major groups) are now inherited from the org config, which was updated in PR #66 to add major-version update rules for GitHub Actions. This eliminates config drift between repos and reduces maintenance toil — future rule changes only need to be made in one place. Co-Authored-By: Paperclip <noreply@paperclip.ing>	2026-03-24 16:16:15 +00:00