fix: categorize CI failures to distinguish expected process failures from real infra issues
This commit updates ci-health-check.sh to categorize CI failures:

- Code failures: test/lint/build failures on main → FAIL
- Infra failures: startup_failure, timed_out → FAIL
- Pending (process): action_required (awaiting review) → INFO only

action_required is no longer treated as a failure since it's an expected process state (PRs awaiting dual approval).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -12,6 +12,11 @@
|
|||||||
# - excludes "headlamp-agent-skills" (skills bundle, not a Headlamp plugin)
|
# - excludes "headlamp-agent-skills" (skills bundle, not a Headlamp plugin)
|
||||||
# If discovery fails (network error, GH_TOKEN missing, API outage), we fall
|
# If discovery fails (network error, GH_TOKEN missing, API outage), we fall
|
||||||
# back to a hardcoded list so the health check still produces a useful report.
|
# back to a hardcoded list so the health check still produces a useful report.
|
||||||
|
#
|
||||||
|
# Failure Categories:
|
||||||
|
# - code: test/lint/build/typecheck failures on main
|
||||||
|
# - infra: startup_failure, timed_out, runner issues
|
||||||
|
# - pending: action_required (awaiting review/approval) - informational only
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
ORG="privilegedescalation"
|
ORG="privilegedescalation"
|
||||||
@@ -44,6 +49,7 @@ echo ""
|
|||||||
|
|
||||||
failures=0
|
failures=0
|
||||||
warnings=0
|
warnings=0
|
||||||
|
process_pending=0
|
||||||
|
|
||||||
for repo in "${PLUGIN_REPOS[@]}"; do
|
for repo in "${PLUGIN_REPOS[@]}"; do
|
||||||
echo "--- ${repo} ---"
|
echo "--- ${repo} ---"
|
||||||
@@ -57,18 +63,40 @@ for repo in "${PLUGIN_REPOS[@]}"; do
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Count CI failures on main — exclude E2E and Release (tracked separately below)
|
|
||||||
main_failures=$(echo "$runs" | jq '[.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests")] | length')
|
|
||||||
total=$(echo "$runs" | jq 'length')
|
total=$(echo "$runs" | jq 'length')
|
||||||
|
|
||||||
if [ "$main_failures" -gt 0 ]; then
|
# Categorize failures:
|
||||||
echo " FAIL: ${main_failures} CI failure(s) in last ${total} runs on main:"
|
# - code failures: test/lint/build on main
|
||||||
|
# - infra failures: startup_failure, timed_out
|
||||||
|
# - process pending: action_required
|
||||||
|
|
||||||
|
code_failures=$(echo "$runs" | jq '[.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests")] | length')
|
||||||
|
infra_failures=$(echo "$runs" | jq '[.[] | select(.conclusion=="startup_failure" or .conclusion=="timed_out")] | length')
|
||||||
|
action_required=$(echo "$runs" | jq '[.[] | select(.conclusion=="action_required")] | length')
|
||||||
|
|
||||||
|
if [ "$code_failures" -gt 0 ]; then
|
||||||
|
echo " FAIL (code): ${code_failures} CI failure(s) in last ${total} runs on main:"
|
||||||
echo "$runs" | jq -r '.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests") | " - \(.name) (\(.updatedAt))"'
|
echo "$runs" | jq -r '.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests") | " - \(.name) (\(.updatedAt))"'
|
||||||
((failures++)) || true
|
((failures++)) || true
|
||||||
else
|
fi
|
||||||
|
|
||||||
|
if [ "$infra_failures" -gt 0 ]; then
|
||||||
|
echo " FAIL (infra): ${infra_failures} infrastructure failure(s):"
|
||||||
|
echo "$runs" | jq -r '.[] | select(.conclusion=="startup_failure" or .conclusion=="timed_out") | " - \(.name): \(.conclusion) (\(.updatedAt))"'
|
||||||
|
((failures++)) || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$code_failures" -eq 0 ] && [ "$infra_failures" -eq 0 ]; then
|
||||||
echo " OK: CI passing on main"
|
echo " OK: CI passing on main"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Process pending — informational only (awaiting review/approval)
|
||||||
|
if [ "$action_required" -gt 0 ]; then
|
||||||
|
echo " INFO: ${action_required} workflow run(s) awaiting action (dual approval, review, etc.):"
|
||||||
|
echo "$runs" | jq -r '.[] | select(.conclusion=="action_required") | " - \(.name) on \(.headBranch) (\(.updatedAt))"'
|
||||||
|
((process_pending++)) || true
|
||||||
|
fi
|
||||||
|
|
||||||
# Surface E2E test failures as warnings (infra blocker: RBAC not yet applied — PRI-494)
|
# Surface E2E test failures as warnings (infra blocker: RBAC not yet applied — PRI-494)
|
||||||
e2e_failures=$(echo "$runs" | jq '[.[] | select(.headBranch=="main" and .name=="E2E Tests" and .conclusion=="failure")] | length')
|
e2e_failures=$(echo "$runs" | jq '[.[] | select(.headBranch=="main" and .name=="E2E Tests" and .conclusion=="failure")] | length')
|
||||||
if [ "$e2e_failures" -gt 0 ]; then
|
if [ "$e2e_failures" -gt 0 ]; then
|
||||||
@@ -83,15 +111,6 @@ for repo in "${PLUGIN_REPOS[@]}"; do
|
|||||||
((warnings++)) || true
|
((warnings++)) || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check for action_required — GitHub's "Require approval for first-time contributors" setting
|
|
||||||
# blocks workflow runs from GitHub App bot accounts. This is a CI pipeline blocker (see PRI-44).
|
|
||||||
action_required_count=$(echo "$runs" | jq '[.[] | select(.conclusion=="action_required")] | length')
|
|
||||||
if [ "$action_required_count" -gt 0 ]; then
|
|
||||||
echo " FAIL: ${action_required_count} workflow run(s) with action_required (GitHub App PR approval blocked):"
|
|
||||||
echo "$runs" | jq -r '.[] | select(.conclusion=="action_required") | " - \(.name) on \(.headBranch) (\(.updatedAt))"'
|
|
||||||
((failures++)) || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check latest release
|
# Check latest release
|
||||||
latest_release=$(gh api "repos/${ORG}/${repo}/releases" --jq '.[0].tag_name // "none"' 2>/dev/null || echo "error")
|
latest_release=$(gh api "repos/${ORG}/${repo}/releases" --jq '.[0].tag_name // "none"' 2>/dev/null || echo "error")
|
||||||
echo " Latest release: ${latest_release}"
|
echo " Latest release: ${latest_release}"
|
||||||
@@ -103,7 +122,8 @@ echo "=== Summary ==="
|
|||||||
echo "Repos scanned: ${#PLUGIN_REPOS[@]}"
|
echo "Repos scanned: ${#PLUGIN_REPOS[@]}"
|
||||||
echo "With failures: ${failures}"
|
echo "With failures: ${failures}"
|
||||||
echo "With warnings: ${warnings}"
|
echo "With warnings: ${warnings}"
|
||||||
|
echo "With pending approval: ${process_pending}"
|
||||||
|
|
||||||
if [ "$failures" -gt 0 ]; then
|
if [ "$failures" -gt 0 ]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
Reference in New Issue
Block a user