Files
org/.github/scripts/ci-health-check.sh
T
Chris Farhood 0ff52c20fd ci-health-check: complete dynamic repo discovery (PRI-331)
PR #115's first commit landed dynamic discovery via gh api but missed
three of the five issue requirements. This commit completes them:

- Move headlamp- prefix filtering into jq via startswith() and add
  explicit exclusion for headlamp-agent-skills (skills bundle, not a
  plugin), instead of relying on grep -E '^headlamp-.+'.
- Add PLUGIN_REPOS_FALLBACK with the previously hardcoded list and
  use it when discovery returns empty, instead of exiting with error.
- Add header comment documenting the discovery filter and the
  headlamp-agent-skills exclusion.

Verified jq filter against live API: returns 8 plugin repos, all
prefixed headlamp-, headlamp-agent-skills correctly excluded.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-05-04 15:51:13 +00:00

110 lines
4.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# ci-health-check.sh — Scan privilegedescalation Headlamp plugin repos (headlamp-*) for CI/CD health
# Run from: /paperclip/privilegedescalation/engineering/hugh
# Requires: GH_TOKEN set (use: export GH_TOKEN=$(bash ./get-github-token.sh))
#
# Plugin repo discovery
# ---------------------
# PLUGIN_REPOS is populated dynamically from the GitHub org so newly created
# plugin repos are picked up automatically. The filter is:
# - non-archived, public repos in the privilegedescalation org
# - name starts with "headlamp-"
# - excludes "headlamp-agent-skills" (skills bundle, not a Headlamp plugin)
# If discovery fails (network error, GH_TOKEN missing, API outage), we fall
# back to a hardcoded list so the health check still produces a useful report.
set -euo pipefail

ORG="privilegedescalation"

# Hardcoded fallback — kept in sync manually as a safety net for discovery failures.
PLUGIN_REPOS_FALLBACK=(
  headlamp-polaris-plugin
  headlamp-rook-plugin
  headlamp-sealed-secrets-plugin
  headlamp-intel-gpu-plugin
  headlamp-tns-csi-plugin
  headlamp-kube-vip-plugin
  headlamp-plugin-template
  headlamp-argocd-plugin
)

#######################################
# Populate PLUGIN_REPOS from the GitHub org, or from the fallback list.
#
# The gh exit status is checked explicitly and its output is only trusted
# when the call succeeded. Anything gh wrote to stdout before failing (an
# error body, or the first pages of an interrupted --paginate) is therefore
# never mistaken for repo names.
#
# Globals:   ORG (read), PLUGIN_REPOS_FALLBACK (read), PLUGIN_REPOS (written)
# Arguments: none
# Outputs:   a WARNING line on stderr whenever the fallback list is used
# Returns:   0 always
#######################################
load_plugin_repos() {
  local names

  PLUGIN_REPOS=()

  # gh's own stderr is silenced on purpose; failures are reported by the
  # WARNING lines below instead.
  if names=$(
    gh api --paginate "orgs/${ORG}/repos" \
      --jq '.[] | select(.archived == false and .visibility == "public" and (.name | startswith("headlamp-")) and .name != "headlamp-agent-skills") | .name' \
      2>/dev/null
  ); then
    if [[ -n "${names}" ]]; then
      mapfile -t PLUGIN_REPOS < <(sort <<<"${names}")
      return 0
    fi
    echo "WARNING: dynamic repo discovery returned no results — using hardcoded fallback" >&2
  else
    echo "WARNING: dynamic repo discovery failed — using hardcoded fallback" >&2
  fi

  PLUGIN_REPOS=("${PLUGIN_REPOS_FALLBACK[@]}")
}

load_plugin_repos
echo "=== CI/CD Health Check — $(date -u '+%Y-%m-%d %H:%M UTC') ==="
echo ""

# Tallies reported in the summary: number of repos with at least one failure
# and number of repos with at least one warning.
failures=0
warnings=0

#######################################
# Print the CI/CD health report for one repo and update the tallies.
#
# Each tally is bumped at most once per repo, however many FAIL/WARN lines
# the repo produces, so the totals can never exceed the number of repos.
#
# Globals:   ORG (read), failures (incremented), warnings (incremented)
# Arguments: $1 - repository name, without the org prefix
# Outputs:   human-readable report on stdout
# Returns:   0 (a jq failure still aborts the script under set -e)
#######################################
check_repo() {
  local repo=$1
  local runs total latest_release
  local main_failures e2e_failures release_failures action_required_count
  local repo_failed=0 repo_warned=0
  # CI runs that failed on main — E2E and Release are tracked separately below.
  local ci_failed_on_main='.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests")'
  local approval_blocked='.[] | select(.conclusion=="action_required")'

  echo "--- ${repo} ---"

  # Get last 10 runs (wider window to catch intermittent failures).
  # A failed query is reported as such rather than as "no runs", so that an
  # auth or network problem is not mistaken for a repo without workflows.
  if ! runs=$(gh run list --repo "${ORG}/${repo}" --limit 10 \
    --json name,conclusion,headBranch,updatedAt 2>/dev/null); then
    echo " WARNING: Could not query workflow runs (gh run list failed)"
    warnings=$((warnings + 1))
    return 0
  fi
  if [[ -z "${runs}" || "${runs}" == "[]" ]]; then
    echo " WARNING: No workflow runs found"
    warnings=$((warnings + 1))
    return 0
  fi

  # Count CI failures on main.
  main_failures=$(jq "[${ci_failed_on_main}] | length" <<<"${runs}")
  total=$(jq 'length' <<<"${runs}")
  if [ "${main_failures}" -gt 0 ]; then
    echo " FAIL: ${main_failures} CI failure(s) in last ${total} runs on main:"
    jq -r "${ci_failed_on_main}"' | " - \(.name) (\(.updatedAt))"' <<<"${runs}"
    repo_failed=1
  else
    echo " OK: CI passing on main"
  fi

  # Surface E2E test failures as warnings (infra blocker: RBAC not yet applied — PRI-494)
  e2e_failures=$(jq '[.[] | select(.headBranch=="main" and .name=="E2E Tests" and .conclusion=="failure")] | length' <<<"${runs}")
  if [ "${e2e_failures}" -gt 0 ]; then
    echo " WARN: E2E Tests failing on main (${e2e_failures} failure(s)) — RBAC bootstrap pending (PRI-494)"
    repo_warned=1
  fi

  # Surface Release failures as warnings — with graceful skip in place, these indicate real errors
  release_failures=$(jq '[.[] | select(.name=="Release" and .conclusion=="failure")] | length' <<<"${runs}")
  if [ "${release_failures}" -gt 0 ]; then
    echo " WARN: Release workflow has ${release_failures} failure(s) — investigate (PRI-380 secrets still pending)"
    repo_warned=1
  fi

  # Check for action_required — GitHub's "Require approval for first-time contributors" setting
  # blocks workflow runs from GitHub App bot accounts. This is a CI pipeline blocker (see PRI-44).
  action_required_count=$(jq "[${approval_blocked}] | length" <<<"${runs}")
  if [ "${action_required_count}" -gt 0 ]; then
    echo " FAIL: ${action_required_count} workflow run(s) with action_required (GitHub App PR approval blocked):"
    jq -r "${approval_blocked}"' | " - \(.name) on \(.headBranch) (\(.updatedAt))"' <<<"${runs}"
    repo_failed=1
  fi

  # Check latest release. On failure, drop anything gh printed before
  # exiting so the report shows exactly "error".
  if ! latest_release=$(gh api "repos/${ORG}/${repo}/releases" \
    --jq '.[0].tag_name // "none"' 2>/dev/null); then
    latest_release="error"
  fi
  echo " Latest release: ${latest_release}"
  echo ""

  failures=$((failures + repo_failed))
  warnings=$((warnings + repo_warned))
}

for repo in "${PLUGIN_REPOS[@]}"; do
  check_repo "${repo}"
done
# Final report: one line per tally, in the order scanned / failures / warnings.
printf '%s\n' \
  "=== Summary ===" \
  "Repos scanned: ${#PLUGIN_REPOS[@]}" \
  "With failures: ${failures}" \
  "With warnings: ${warnings}"

# Exit non-zero when any failure was recorded so callers can gate on the result.
if ((failures > 0)); then
  exit 1
fi