2df48640bb
- Replace node -e JSON parsing with jq (available on our runners)
- Exclude Release workflow failures from the FAIL count — these fail at the post-release PR-creation step due to missing RELEASE_APP org secrets (tracked in PRI-380), not actual CI breakage
- Demote Release failures to WARN so the health check exits 0 when only Release is broken, giving a clean signal for real CI problems
- Increase the run limit from 5 to 10 for better intermittent-failure detection
- Remove the unnecessary Node.js setup step from the workflow

Co-Authored-By: Paperclip <noreply@paperclip.ing>
69 lines
2.3 KiB
Bash
Executable File
69 lines
2.3 KiB
Bash
Executable File
#!/usr/bin/env bash
# ci-health-check.sh — Scan all privilegedescalation repos for CI/CD health
# Run from: /paperclip/privilegedescalation/engineering/hugh
# Requires: GH_TOKEN set (use: export GH_TOKEN=$(bash ./get-github-token.sh))
#           and the gh and jq CLIs on PATH.
#
# Output: a per-repo report followed by a summary, all on stdout.
# Exit status: 1 when at least one repo has a non-Release CI failure on main,
#              0 otherwise. Warnings never change the exit status.
set -euo pipefail

readonly ORG="privilegedescalation"
readonly PLUGIN_REPOS=(
  headlamp-polaris-plugin
  headlamp-rook-plugin
  headlamp-sealed-secrets-plugin
  headlamp-intel-gpu-plugin
  headlamp-tns-csi-plugin
  headlamp-kube-vip-plugin
)

# Shared jq predicates, defined once so the failure count and the failure
# listing can never drift apart.
#   ci_failure      — failed run on main, excluding the Release workflow.
#                     Release failures happen at the post-release PR-creation
#                     step (tracked separately via PRI-380).
#   release_failure — failed run of the Release workflow (any branch).
readonly JQ_DEFS='
  def ci_failure:
    select(.headBranch=="main" and .conclusion=="failure" and .name!="Release");
  def release_failure:
    select(.name=="Release" and .conclusion=="failure");
'

echo "=== CI/CD Health Check — $(date -u '+%Y-%m-%d %H:%M UTC') ==="
echo ""

failures=0  # repos with at least one non-Release CI failure on main
warnings=0  # repos that could not be queried, have no runs, or only Release failures

for repo in "${PLUGIN_REPOS[@]}"; do
  echo "--- ${repo} ---"

  # Get last 10 runs (wider window to catch intermittent failures).
  # A failing gh call (missing/expired token, network, missing CLI) is
  # reported separately from a genuinely empty run list so an outage is not
  # mistaken for "this repo has no workflows". It stays a warning so the
  # exit status contract is unchanged.
  if ! runs=$(gh run list --repo "${ORG}/${repo}" --limit 10 \
    --json name,conclusion,headBranch,updatedAt 2>/dev/null); then
    echo " WARNING: Could not query workflow runs (gh failed — check GH_TOKEN)"
    warnings=$((warnings + 1))
    continue
  fi

  # Empty output is treated like an empty list so the numeric tests below
  # never see an empty string.
  if [[ -z "$runs" || "$runs" == "[]" ]]; then
    echo " WARNING: No workflow runs found"
    warnings=$((warnings + 1))
    continue
  fi

  main_failures=$(jq "${JQ_DEFS}"'[.[] | ci_failure] | length' <<<"$runs")
  total=$(jq 'length' <<<"$runs")

  if ((main_failures > 0)); then
    echo " FAIL: ${main_failures} CI failure(s) in last ${total} runs on main:"
    jq -r "${JQ_DEFS}"'.[] | ci_failure | " - \(.name) (\(.updatedAt))"' <<<"$runs"
    failures=$((failures + 1))
  else
    echo " OK: All recent CI runs passing"
    # Surface any Release failures as a warning (known issue: PRI-380)
    release_failures=$(jq "${JQ_DEFS}"'[.[] | release_failure] | length' <<<"$runs")
    if ((release_failures > 0)); then
      echo " WARN: Release workflow has ${release_failures} failure(s) — see PRI-380 (missing RELEASE_APP org secrets)"
      warnings=$((warnings + 1))
    fi
  fi

  # Check latest release. Best-effort and informational only: a failed lookup
  # prints "error" and a repo without releases prints "none"; neither affects
  # the counters.
  latest_release=$(gh api "repos/${ORG}/${repo}/releases" \
    --jq '.[0].tag_name // "none"' 2>/dev/null || echo "error")
  echo " Latest release: ${latest_release}"

  echo ""
done

echo "=== Summary ==="
echo "Repos scanned: ${#PLUGIN_REPOS[@]}"
echo "With failures: ${failures}"
echo "With warnings: ${warnings}"

if ((failures > 0)); then
  exit 1
fi