#!/usr/bin/env bash
# ci-health-check.sh — Scan all privilegedescalation repos for CI/CD health
# Run from: /paperclip/privilegedescalation/engineering/hugh
# Requires: GH_TOKEN set (use: export GH_TOKEN=$(bash ./get-github-token.sh))
#
# Plugin repo discovery
# ---------------------
# PLUGIN_REPOS is populated dynamically from the GitHub org so newly created
# plugin repos are picked up automatically. The filter is:
#   - non-archived, public repos in the privilegedescalation org
#   - name starts with "headlamp-"
#   - excludes "headlamp-agent-skills" (skills bundle, not a Headlamp plugin)
# If discovery fails (network error, GH_TOKEN missing, API outage), we fall
# back to a hardcoded list so the health check still produces a useful report.
#
# Failure Categories:
#   - code:    test/lint/build/typecheck failures on main
#   - infra:   startup_failure, timed_out, runner issues
#   - pending: action_required (awaiting review/approval) - informational only
#
# Summary counters
# ----------------
# "With failures", "With warnings" and "With pending approval" each count
# REPOS, so a repo is added at most once per counter no matter how many
# individual findings it has.
#
# Exit status: 1 if at least one repo has a code or infra failure, else 0.

set -euo pipefail

ORG="privilegedescalation"

# Hardcoded fallback — kept in sync manually as a safety net for discovery failures.
PLUGIN_REPOS_FALLBACK=(
  headlamp-polaris-plugin
  headlamp-rook-plugin
  headlamp-sealed-secrets-plugin
  headlamp-intel-gpu-plugin
  headlamp-tns-csi-plugin
  headlamp-kube-vip-plugin
  headlamp-plugin-template
  headlamp-argocd-plugin
)

# jq select() expressions, one per category. Each is used for both the count
# and the listing of that category so the two can never drift apart.
readonly SEL_CODE='select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests")'
readonly SEL_INFRA='select(.conclusion=="startup_failure" or .conclusion=="timed_out")'
readonly SEL_PENDING='select(.conclusion=="action_required")'
readonly SEL_E2E='select(.headBranch=="main" and .name=="E2E Tests" and .conclusion=="failure")'
readonly SEL_RELEASE='select(.name=="Release" and .conclusion=="failure")'

# Per-counter number of repos affected (see "Summary counters" above).
failures=0
warnings=0
process_pending=0

#######################################
# Print how many runs match a jq select() expression.
# Arguments: $1 - JSON array of workflow runs
#            $2 - jq select(...) expression
# Outputs:   the match count on stdout
#######################################
count_runs() {
  jq "[.[] | ${2}] | length" <<<"$1"
}

#######################################
# Print one formatted line per run matching a jq select() expression.
# Arguments: $1 - JSON array of workflow runs
#            $2 - jq select(...) expression
#            $3 - jq string expression producing the line for each run
# Outputs:   the formatted lines on stdout
#######################################
list_runs() {
  jq -r ".[] | ${2} | ${3}" <<<"$1"
}

#######################################
# Report the CI health of one repository and update the summary counters.
# Globals:   ORG (read); failures, warnings, process_pending (written)
# Arguments: $1 - repository name (without the org prefix)
# Outputs:   the per-repo report on stdout
# Returns:   0 (findings are reported via the counters, not the status)
#######################################
check_repo() {
  local repo=$1
  local runs total latest_release
  local code_failures infra_failures action_required e2e_failures release_failures
  # Per-repo flags: a repo contributes at most 1 to each summary counter.
  local repo_failed=0 repo_warned=0

  echo "--- ${repo} ---"

  # Get last 10 runs (wider window to catch intermittent failures).
  # A failing gh call is reported as such rather than as "no runs", so an
  # API/auth/network problem is not mistaken for a repo without workflows.
  if ! runs=$(gh run list --repo "${ORG}/${repo}" --limit 10 \
    --json name,conclusion,headBranch,updatedAt 2>/dev/null); then
    echo " WARNING: Could not fetch workflow runs (gh run list failed)"
    warnings=$((warnings + 1))
    return 0
  fi

  if [[ -z "$runs" || "$runs" == "[]" ]]; then
    echo " WARNING: No workflow runs found"
    warnings=$((warnings + 1))
    return 0
  fi

  total=$(jq 'length' <<<"$runs")

  # Categorize failures:
  # - code failures: test/lint/build on main
  # - infra failures: startup_failure, timed_out
  # - process pending: action_required
  code_failures=$(count_runs "$runs" "$SEL_CODE")
  infra_failures=$(count_runs "$runs" "$SEL_INFRA")
  action_required=$(count_runs "$runs" "$SEL_PENDING")

  if ((code_failures > 0)); then
    echo " FAIL (code): ${code_failures} CI failure(s) in last ${total} runs on main:"
    list_runs "$runs" "$SEL_CODE" '" - \(.name) (\(.updatedAt))"'
    repo_failed=1
  fi

  if ((infra_failures > 0)); then
    echo " FAIL (infra): ${infra_failures} infrastructure failure(s):"
    list_runs "$runs" "$SEL_INFRA" '" - \(.name): \(.conclusion) (\(.updatedAt))"'
    repo_failed=1
  fi

  if ((code_failures == 0 && infra_failures == 0)); then
    echo " OK: CI passing on main"
  fi

  # Process pending — informational only (awaiting review/approval)
  if ((action_required > 0)); then
    echo " INFO: ${action_required} workflow run(s) awaiting action (dual approval, review, etc.):"
    list_runs "$runs" "$SEL_PENDING" '" - \(.name) on \(.headBranch) (\(.updatedAt))"'
    process_pending=$((process_pending + 1))
  fi

  # Surface E2E test failures as warnings (infra blocker: RBAC not yet applied — PRI-494)
  e2e_failures=$(count_runs "$runs" "$SEL_E2E")
  if ((e2e_failures > 0)); then
    echo " WARN: E2E Tests failing on main (${e2e_failures} failure(s)) — RBAC bootstrap pending (PRI-494)"
    repo_warned=1
  fi

  # Surface Release failures as warnings — with graceful skip in place, these indicate real errors
  release_failures=$(count_runs "$runs" "$SEL_RELEASE")
  if ((release_failures > 0)); then
    echo " WARN: Release workflow has ${release_failures} failure(s) — investigate (PRI-380 secrets still pending)"
    repo_warned=1
  fi

  # Check latest release; best-effort, a failed lookup is shown as "error".
  latest_release=$(gh api "repos/${ORG}/${repo}/releases" --jq '.[0].tag_name // "none"' 2>/dev/null || echo "error")
  echo " Latest release: ${latest_release}"
  echo ""

  failures=$((failures + repo_failed))
  warnings=$((warnings + repo_warned))
  return 0
}

# Discover plugin repos. Errors are deliberately silenced here: an empty
# result triggers the hardcoded fallback below.
mapfile -t PLUGIN_REPOS < <(
  gh api --paginate "orgs/${ORG}/repos" \
    --jq '.[] | select(.archived == false and .visibility == "public" and (.name | startswith("headlamp-")) and .name != "headlamp-agent-skills") | .name' \
    2>/dev/null | sort
)

if ((${#PLUGIN_REPOS[@]} == 0)); then
  echo "WARNING: dynamic repo discovery returned no results — using hardcoded fallback" >&2
  PLUGIN_REPOS=("${PLUGIN_REPOS_FALLBACK[@]}")
fi

echo "=== CI/CD Health Check — $(date -u '+%Y-%m-%d %H:%M UTC') ==="
echo ""

for repo in "${PLUGIN_REPOS[@]}"; do
  check_repo "$repo"
done

echo "=== Summary ==="
echo "Repos scanned: ${#PLUGIN_REPOS[@]}"
echo "With failures: ${failures}"
echo "With warnings: ${warnings}"
echo "With pending approval: ${process_pending}"

if ((failures > 0)); then
  exit 1
fi