#!/usr/bin/env bash
# ci-health-check.sh — Scan all privilegedescalation repos for CI/CD health
# Run from: /paperclip/privilegedescalation/engineering/hugh
# Requires: GH_TOKEN set (use: export GH_TOKEN=$(bash ./get-github-token.sh))
#
# Plugin repo discovery
# ---------------------
# PLUGIN_REPOS is populated dynamically from the GitHub org so newly created
# plugin repos are picked up automatically. The filter is:
#   - non-archived, public repos in the privilegedescalation org
#   - name starts with "headlamp-"
#   - excludes "headlamp-agent-skills" (skills bundle, not a Headlamp plugin)
# If discovery fails (network error, GH_TOKEN missing, API outage), we fall
# back to a hardcoded list so the health check still produces a useful report.
#
# Failure Categories:
#   - code: test/lint/build/typecheck failures on main
#   - infra: startup_failure, timed_out, runner issues
#   - pending: action_required (awaiting review/approval) - informational only

# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# GitHub organization whose repositories are scanned.
ORG="privilegedescalation"

# Hardcoded fallback — kept in sync manually as a safety net for discovery failures.
PLUGIN_REPOS_FALLBACK=(
  headlamp-polaris-plugin
  headlamp-rook-plugin
  headlamp-sealed-secrets-plugin
  headlamp-intel-gpu-plugin
  headlamp-tns-csi-plugin
  headlamp-kube-vip-plugin
  headlamp-plugin-template
  headlamp-argocd-plugin
)

#######################################
# List the organization's Headlamp plugin repositories via the GitHub API.
# Globals:   ORG (read)
# Arguments: none
# Outputs:   sorted repository names on stdout, one per line
# Returns:   0 when at least one repo was listed; 1 when the gh call failed
#            or produced no names
#######################################
discover_plugin_repos() {
  local names

  # The exit status is checked explicitly: a failure partway through
  # --paginate would otherwise leave a silently truncated repo list.
  # stderr is discarded on purpose; the caller prints the fallback warning.
  names=$(
    gh api --paginate "orgs/${ORG}/repos" \
      --jq '.[] | select(.archived == false and .visibility == "public" and (.name | startswith("headlamp-")) and .name != "headlamp-agent-skills") | .name' \
      2>/dev/null
  ) || return 1

  [[ -n "$names" ]] || return 1
  sort <<<"$names"
}

PLUGIN_REPOS=()
if discovered=$(discover_plugin_repos); then
  mapfile -t PLUGIN_REPOS <<<"$discovered"
else
  echo "WARNING: dynamic repo discovery failed or returned no results — using hardcoded fallback" >&2
  PLUGIN_REPOS=("${PLUGIN_REPOS_FALLBACK[@]}")
fi

# Private repos not visible to dynamic discovery
PLUGIN_REPOS+=("infra")

# Print how many runs in the JSON array $1 match the jq select() expression $2.
_count_runs() {
  jq "[.[] | ${2}] | length" <<<"$1"
}

# Print one line per run in the JSON array $1 that matches the jq select()
# expression $2, rendered with the jq string expression $3.
_list_runs() {
  jq -r ".[] | ${2} | ${3}" <<<"$1"
}

#######################################
# Report the CI/CD health of a single repository.
# Globals:   ORG (read); failures, warnings, process_pending (each is
#            incremented at most once per call, so the summary counts repos)
# Arguments: $1 - repository name, without the org prefix
# Outputs:   human-readable report on stdout
# Returns:   0; a failing jq call aborts the script under `set -e`
#######################################
check_repo() {
  local repo=$1
  local runs total latest_release
  local code_failures infra_failures action_required e2e_failures release_failures
  local has_failure=0 has_warning=0

  # Each selector is defined once and shared by the count and the listing.
  local -r sel_code='select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests")'
  local -r sel_infra='select(.conclusion=="startup_failure" or .conclusion=="timed_out")'
  local -r sel_pending='select(.conclusion=="action_required")'
  local -r sel_e2e='select(.headBranch=="main" and .name=="E2E Tests" and .conclusion=="failure")'
  local -r sel_release='select(.name=="Release" and .conclusion=="failure")'

  echo "--- ${repo} ---"

  # Get last 10 runs (wider window to catch intermittent failures).
  # A failed API call is reported separately from "no runs" so that an
  # outage or missing permission does not look like an idle repository.
  if ! runs=$(gh run list --repo "${ORG}/${repo}" --limit 10 \
    --json name,conclusion,headBranch,updatedAt 2>/dev/null); then
    echo " WARNING: Could not list workflow runs (gh run list failed)"
    warnings=$((warnings + 1))
    return 0
  fi

  if [[ -z "$runs" || "$runs" == "[]" ]]; then
    echo " WARNING: No workflow runs found"
    warnings=$((warnings + 1))
    return 0
  fi

  total=$(jq 'length' <<<"$runs")

  # Categorize failures:
  # - code failures: test/lint/build on main
  # - infra failures: startup_failure, timed_out
  # - process pending: action_required
  code_failures=$(_count_runs "$runs" "$sel_code")
  infra_failures=$(_count_runs "$runs" "$sel_infra")
  action_required=$(_count_runs "$runs" "$sel_pending")

  if ((code_failures > 0)); then
    echo " FAIL (code): ${code_failures} CI failure(s) in last ${total} runs on main:"
    _list_runs "$runs" "$sel_code" '" - \(.name) (\(.updatedAt))"'
    has_failure=1
  fi

  if ((infra_failures > 0)); then
    echo " FAIL (infra): ${infra_failures} infrastructure failure(s):"
    _list_runs "$runs" "$sel_infra" '" - \(.name): \(.conclusion) (\(.updatedAt))"'
    has_failure=1
  fi

  if ((has_failure == 0)); then
    echo " OK: CI passing on main"
  fi

  # Process pending — informational only (awaiting review/approval)
  if ((action_required > 0)); then
    echo " INFO: ${action_required} workflow run(s) awaiting action (dual approval, review, etc.):"
    _list_runs "$runs" "$sel_pending" '" - \(.name) on \(.headBranch) (\(.updatedAt))"'
    process_pending=$((process_pending + 1))
  fi

  # Surface E2E test failures as warnings (infra blocker: RBAC not yet applied — PRI-494)
  e2e_failures=$(_count_runs "$runs" "$sel_e2e")
  if ((e2e_failures > 0)); then
    echo " WARN: E2E Tests failing on main (${e2e_failures} failure(s)) — RBAC bootstrap pending (PRI-494)"
    has_warning=1
  fi

  # Surface Release failures as warnings — with graceful skip in place, these indicate real errors
  release_failures=$(_count_runs "$runs" "$sel_release")
  if ((release_failures > 0)); then
    echo " WARN: Release workflow has ${release_failures} failure(s) — investigate (PRI-380 secrets still pending)"
    has_warning=1
  fi

  # Count the repo once per category, however many checks tripped.
  failures=$((failures + has_failure))
  warnings=$((warnings + has_warning))

  # Check latest release; best-effort, so an API error is shown as "error".
  latest_release=$(gh api "repos/${ORG}/${repo}/releases" --jq '.[0].tag_name // "none"' 2>/dev/null || echo "error")
  echo " Latest release: ${latest_release}"

  echo ""
}

echo "=== CI/CD Health Check — $(date -u '+%Y-%m-%d %H:%M UTC') ==="
echo ""

failures=0
warnings=0
process_pending=0

for repo in "${PLUGIN_REPOS[@]}"; do
  check_repo "$repo"
done

#######################################
# Print the end-of-run totals.
# Globals:   PLUGIN_REPOS, failures, warnings, process_pending (read)
# Arguments: none
# Outputs:   five summary lines on stdout
#######################################
print_summary() {
  echo "=== Summary ==="
  echo "Repos scanned: ${#PLUGIN_REPOS[@]}"
  echo "With failures: ${failures}"
  echo "With warnings: ${warnings}"
  echo "With pending approval: ${process_pending}"
}

print_summary

# Only failures make the run exit non-zero; warnings and pending approvals
# are informational.
if ((failures > 0)); then
  exit 1
fi