Merge pull request #143 from privilegedescalation/hugh/ci-failure-categorization

fix: categorize CI failures to distinguish expected process failures from infra issues
This commit is contained in:
2026-05-10 16:52:05 -07:00
committed by GitHub
2 changed files with 45 additions and 37 deletions
+35 -15
View File
@@ -12,6 +12,11 @@
# - excludes "headlamp-agent-skills" (skills bundle, not a Headlamp plugin)
# If discovery fails (network error, GH_TOKEN missing, API outage), we fall
# back to a hardcoded list so the health check still produces a useful report.
#
# Failure Categories:
# - code: test/lint/build/typecheck failures on main
# - infra: startup_failure or timed_out conclusions (runner/infrastructure problems), counted on any branch
# - pending: action_required (awaiting review/approval) - informational only
set -euo pipefail
ORG="privilegedescalation"
@@ -44,6 +49,7 @@ echo ""
failures=0
warnings=0
process_pending=0
for repo in "${PLUGIN_REPOS[@]}"; do
echo "--- ${repo} ---"
@@ -57,18 +63,40 @@ for repo in "${PLUGIN_REPOS[@]}"; do
continue
fi
# Count CI failures on main — exclude E2E and Release (tracked separately below)
main_failures=$(echo "$runs" | jq '[.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests")] | length')
total=$(echo "$runs" | jq 'length')
if [ "$main_failures" -gt 0 ]; then
echo " FAIL: ${main_failures} CI failure(s) in last ${total} runs on main:"
# Categorize failures:
# - code failures: test/lint/build on main
# - infra failures: startup_failure, timed_out
# - process pending: action_required
code_failures=$(echo "$runs" | jq '[.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests")] | length')
infra_failures=$(echo "$runs" | jq '[.[] | select(.conclusion=="startup_failure" or .conclusion=="timed_out")] | length')
action_required=$(echo "$runs" | jq '[.[] | select(.conclusion=="action_required")] | length')
if [ "$code_failures" -gt 0 ]; then
echo " FAIL (code): ${code_failures} CI failure(s) in last ${total} runs on main:"
echo "$runs" | jq -r '.[] | select(.headBranch=="main" and .conclusion=="failure" and .name!="Release" and .name!="E2E Tests") | " - \(.name) (\(.updatedAt))"'
((failures++)) || true
else
fi
if [ "$infra_failures" -gt 0 ]; then
echo " FAIL (infra): ${infra_failures} infrastructure failure(s):"
echo "$runs" | jq -r '.[] | select(.conclusion=="startup_failure" or .conclusion=="timed_out") | " - \(.name): \(.conclusion) (\(.updatedAt))"'
((failures++)) || true
fi
if [ "$code_failures" -eq 0 ] && [ "$infra_failures" -eq 0 ]; then
echo " OK: CI passing on main"
fi
# Process pending — informational only (awaiting review/approval)
if [ "$action_required" -gt 0 ]; then
echo " INFO: ${action_required} workflow run(s) awaiting action (dual approval, review, etc.):"
echo "$runs" | jq -r '.[] | select(.conclusion=="action_required") | " - \(.name) on \(.headBranch) (\(.updatedAt))"'
((process_pending++)) || true
fi
# Surface E2E test failures as warnings (infra blocker: RBAC not yet applied — PRI-494)
e2e_failures=$(echo "$runs" | jq '[.[] | select(.headBranch=="main" and .name=="E2E Tests" and .conclusion=="failure")] | length')
if [ "$e2e_failures" -gt 0 ]; then
@@ -83,15 +111,6 @@ for repo in "${PLUGIN_REPOS[@]}"; do
((warnings++)) || true
fi
# Check for action_required — GitHub's "Require approval for first-time contributors" setting
# blocks workflow runs from GitHub App bot accounts. This is a CI pipeline blocker (see PRI-44).
action_required_count=$(echo "$runs" | jq '[.[] | select(.conclusion=="action_required")] | length')
if [ "$action_required_count" -gt 0 ]; then
echo " FAIL: ${action_required_count} workflow run(s) with action_required (GitHub App PR approval blocked):"
echo "$runs" | jq -r '.[] | select(.conclusion=="action_required") | " - \(.name) on \(.headBranch) (\(.updatedAt))"'
((failures++)) || true
fi
# Check latest release
latest_release=$(gh api "repos/${ORG}/${repo}/releases" --jq '.[0].tag_name // "none"' 2>/dev/null || echo "error")
echo " Latest release: ${latest_release}"
@@ -103,7 +122,8 @@ echo "=== Summary ==="
echo "Repos scanned: ${#PLUGIN_REPOS[@]}"
echo "With failures: ${failures}"
echo "With warnings: ${warnings}"
echo "With pending approval: ${process_pending}"
if [ "$failures" -gt 0 ]; then
exit 1
fi
fi
+10 -22
View File
@@ -1,22 +1,5 @@
name: Dual Approval Check
# Reusable workflow: verifies that both the CTO and QA bot accounts
# have approved a pull request. Plugin repos call this on
# pull_request_review events to get a required GitHub status check.
#
# Usage in a plugin repo's workflow:
#
# on:
# pull_request_review:
# types: [submitted, dismissed]
# pull_request:
# types: [opened, reopened, synchronize]
#
# jobs:
# dual-approval:
# uses: privilegedescalation/.github/.github/workflows/dual-approval-check.yaml@main
# secrets: inherit
on:
workflow_call:
inputs:
@@ -50,8 +33,8 @@ jobs:
PR_NUMBER: ${{ inputs.pr_number }}
REPO: ${{ github.repository }}
run: |
if [ -z "${PR_NUMBER}" ]; then
echo "::notice::No PR number in context (dismissed review?). Skipping dual approval check — no action needed."
if [ -z "${PR_NUMBER}" ] || [ "${PR_NUMBER}" = "null" ]; then
echo "::notice::No PR number in context (dismissed review or workflow_call without pr_number). Skipping dual approval check — no action needed."
exit 0
fi
@@ -62,11 +45,16 @@ jobs:
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${REPO}/pulls/${PR_NUMBER}/reviews")
if [ -z "${REVIEWS}" ] || [ "${REVIEWS}" = "null" ]; then
echo "::warning::Could not fetch reviews for PR #${PR_NUMBER}. Assuming no approvals yet."
exit 1
fi
CTO_APPROVED=$(echo "${REVIEWS}" | jq -r --arg user "${CTO_REVIEWER}" \
'[.[] | select(.user.login == $user or .user.login == ($user + "[bot]"))] | last | .state == "APPROVED"')
'[.[] | select(.user.login == $user or .user.login == ($user + "[bot]"))] | last | if .state then .state == "APPROVED" else false end')
QA_APPROVED=$(echo "${REVIEWS}" | jq -r --arg user "${QA_REVIEWER}" \
'[.[] | select(.user.login == $user or .user.login == ($user + "[bot]"))] | last | .state == "APPROVED"')
'[.[] | select(.user.login == $user or .user.login == ($user + "[bot]"))] | last | if .state then .state == "APPROVED" else false end')
echo "CTO (${CTO_REVIEWER}) approved: ${CTO_APPROVED}"
echo "QA (${QA_REVIEWER}) approved: ${QA_APPROVED}"
@@ -82,4 +70,4 @@ jobs:
echo " Missing: QA approval from ${QA_REVIEWER}"
fi
exit 1
fi
fi