Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 17a9aa165a | |||
| 3e306b70f8 | |||
| 3aa9c15e80 | |||
| 957cf144a7 | |||
| 52b1429ba0 | |||
| 66575982af | |||
| 66932958b1 | |||
| 0d5f65176b | |||
| 5670c008e1 | |||
| f9325772bd |
@@ -1,64 +0,0 @@
|
|||||||
# Workflow Recovery
#
# Runs every five minutes (and on manual dispatch), lists workflow runs that
# are in the `action_required` state for the privilegedescalation org, and
# asks the GitHub API to re-run each of them.
name: Workflow Recovery

on:
  schedule:
    - cron: '*/5 * * * *'
  workflow_dispatch:

jobs:
  recover-stuck-runs:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      # Only mint an App token when the App is configured; the next step
      # falls back to the default github.token otherwise.
      - name: Generate GitHub App token
        id: app-token
        if: vars.RELEASE_APP_ID != ''
        uses: actions/create-github-app-token@v3
        with:
          app-id: ${{ vars.RELEASE_APP_ID }}
          private-key: ${{ secrets.RELEASE_APP_PRIVATE_KEY }}
          owner: privilegedescalation

      - name: Detect and re-run stuck action_required runs
        env:
          GH_TOKEN: ${{ steps.app-token.outputs.token || github.token }}
        run: |
          echo "Checking for action_required runs in privilegedescalation org..."

          # NOTE(review): the documented "list workflow runs" endpoint is
          # per-repository (/repos/{owner}/{repo}/actions/runs); confirm that
          # this org-level URL actually returns data. If it does not, the
          # fallback below makes the job report "0 runs" on every execution.
          RUNS=$(curl -sf -H "Authorization: Bearer $GH_TOKEN" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/orgs/privilegedescalation/actions/runs?status=action_required&per_page=50" \
            || { echo "Warning: listing action_required runs failed; treating as empty" >&2; echo '{"workflow_runs": []}'; })

          COUNT=$(echo "$RUNS" | jq '.workflow_runs | length')
          echo "Found $COUNT action_required runs"

          if [ "$COUNT" = "0" ] || [ "$COUNT" = "null" ]; then
            echo "No stuck runs found. Exiting."
            exit 0
          fi

          echo "$RUNS" | jq -r '.workflow_runs[] | @json' | while read -r run; do
            RUN_ID=$(echo "$run" | jq -r '.id')
            WORKFLOW_NAME=$(echo "$run" | jq -r '.name')
            REPO=$(echo "$run" | jq -r '.repository.full_name')
            BRANCH=$(echo "$run" | jq -r '.head_branch')
            CREATED_AT=$(echo "$run" | jq -r '.created_at')

            echo "Found stuck run: $WORKFLOW_NAME (#$RUN_ID) on $REPO branch $BRANCH"
            echo "Created at: $CREATED_AT"
            echo "Re-running..."

            # No -f here: with --fail, an HTTP error makes curl exit non-zero,
            # and because the step shell runs with `-e` that would abort the
            # whole loop before the status code could be inspected. Capture
            # only the status code and keep going on transport errors
            # (curl reports those as code 000).
            HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' -X POST \
              -H "Authorization: Bearer $GH_TOKEN" \
              -H "Accept: application/vnd.github+json" \
              "https://api.github.com/repos/$REPO/actions/runs/$RUN_ID/rerun" \
              || true)

            if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "204" ]; then
              echo "Successfully re-ran $WORKFLOW_NAME (#$RUN_ID)"
            else
              echo "Failed to re-run $WORKFLOW_NAME (#$RUN_ID): $HTTP_CODE"
            fi
          done
|
|
||||||
@@ -66,7 +66,7 @@ test.describe('Intel GPU plugin smoke tests', () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
await page.goto('/c/main/intel-gpu/nodes');
|
await page.goto('/c/main/intel-gpu/nodes');
|
||||||
await expect(page.getByRole('heading', { name: /node/i })).toBeVisible({ timeout: 15_000 });
|
await expect(page.getByRole('heading', { name: /intel gpu.*nodes/i })).toBeVisible({ timeout: 15_000 });
|
||||||
|
|
||||||
await page.goto('/c/main/intel-gpu/pods');
|
await page.goto('/c/main/intel-gpu/pods');
|
||||||
await expect(page.getByRole('heading', { name: /pod/i })).toBeVisible({ timeout: 15_000 });
|
await expect(page.getByRole('heading', { name: /pod/i })).toBeVisible({ timeout: 15_000 });
|
||||||
|
|||||||
@@ -151,4 +151,27 @@ describe('IntelGpuDataProvider', () => {
|
|||||||
expect(callCountAfter).toBeGreaterThan(callCountBefore);
|
expect(callCountAfter).toBeGreaterThan(callCountBefore);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Verifies that a CRD request which never settles is abandoned once the
// 2000 ms request timeout elapses: the context must then report the CRD as
// unavailable and leave the loading state.
it('treats a hanging CRD request as unavailable after 2s timeout', async () => {
  vi.useFakeTimers();
  try {
    const nodeWrapper = { jsonData: {} };
    vi.mocked(K8s.ResourceClasses.Node.useList).mockReturnValue([[nodeWrapper], null] as any);
    vi.mocked(K8s.ResourceClasses.Pod.useList).mockReturnValue([[nodeWrapper], null] as any);
    vi.mocked(ApiProxy.request)
      // First call (the CRD list) never settles, so only the timeout can end it.
      .mockReturnValueOnce(new Promise(() => {}))
      // Subsequent calls resolve immediately with empty lists.
      .mockResolvedValueOnce({ items: [] })
      .mockResolvedValueOnce({ items: [] })
      .mockResolvedValueOnce({ items: [] });

    const { result } = renderHook(() => useIntelGpuContext(), { wrapper: Wrapper });

    expect(result.current.loading).toBe(true);

    // Fire the timeout, then let the pending promise callbacks and the
    // resulting state updates flush inside act().
    vi.advanceTimersByTime(2000);
    await act(async () => {});
    expect(result.current.crdAvailable).toBe(false);
    expect(result.current.loading).toBe(false);
  } finally {
    // Restore real timers even when an assertion above throws; otherwise the
    // fake clock would leak into every test that runs after this one.
    vi.useRealTimers();
  }
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -69,6 +69,18 @@ export function useIntelGpuContext(): IntelGpuContextValue {
|
|||||||
// Helpers
|
// Helpers
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Default upper bound, in milliseconds, for a single API request. */
const DEFAULT_REQUEST_TIMEOUT_MS = 2_000;

/**
 * Wraps a promise with a timeout.
 *
 * The returned promise settles with the outcome of `promise` if that happens
 * within `ms` milliseconds; otherwise it rejects with
 * `Error("Request timed out after <ms>ms")`. The wrapped operation itself is
 * not cancelled — only its result is ignored after the deadline.
 *
 * @param promise - The operation to bound.
 * @param ms - Timeout in milliseconds.
 * @returns A promise that mirrors `promise` or rejects on timeout.
 */
function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
  return new Promise<T>((resolve, reject) => {
    const timer = setTimeout(
      () => reject(new Error(`Request timed out after ${ms}ms`)),
      ms
    );

    // Promise.resolve keeps the tolerance Promise.race had for non-promise
    // values (e.g. a mock returning a plain value) and is a no-op for native
    // promises. The timer is cleared as soon as the operation settles so a
    // fast request does not leave a live timer (and a late, ignored
    // rejection) behind.
    Promise.resolve(promise).then(
      value => {
        clearTimeout(timer);
        resolve(value);
      },
      (error: unknown) => {
        clearTimeout(timer);
        reject(error);
      }
    );
  });
}
|
||||||
|
|
||||||
/** Extract raw Kubernetes JSON from Headlamp KubeObject wrappers. */
|
/** Extract raw Kubernetes JSON from Headlamp KubeObject wrappers. */
|
||||||
const extractJsonData = (items: unknown[]): unknown[] =>
|
const extractJsonData = (items: unknown[]): unknown[] =>
|
||||||
items.map(item =>
|
items.map(item =>
|
||||||
@@ -108,8 +120,11 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
|||||||
try {
|
try {
|
||||||
// GpuDevicePlugin CRDs — graceful degradation if CRD not installed
|
// GpuDevicePlugin CRDs — graceful degradation if CRD not installed
|
||||||
try {
|
try {
|
||||||
const pluginList = await ApiProxy.request(
|
const pluginList = await withTimeout(
|
||||||
`/apis/${INTEL_DEVICE_PLUGIN_API_GROUP}/${INTEL_DEVICE_PLUGIN_API_VERSION}/gpudeviceplugins`
|
ApiProxy.request(
|
||||||
|
`/apis/${INTEL_DEVICE_PLUGIN_API_GROUP}/${INTEL_DEVICE_PLUGIN_API_VERSION}/gpudeviceplugins`
|
||||||
|
),
|
||||||
|
DEFAULT_REQUEST_TIMEOUT_MS
|
||||||
);
|
);
|
||||||
if (!cancelled && isKubeList(pluginList)) {
|
if (!cancelled && isKubeList(pluginList)) {
|
||||||
setCrdAvailable(true);
|
setCrdAvailable(true);
|
||||||
@@ -139,7 +154,7 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
|||||||
|
|
||||||
for (const url of pluginPodSelectors) {
|
for (const url of pluginPodSelectors) {
|
||||||
try {
|
try {
|
||||||
const list = await ApiProxy.request(url);
|
const list = await withTimeout(ApiProxy.request(url), DEFAULT_REQUEST_TIMEOUT_MS);
|
||||||
if (!cancelled && isKubeList(list)) {
|
if (!cancelled && isKubeList(list)) {
|
||||||
const gpuPluginPods = filterIntelGpuPluginPods(list.items);
|
const gpuPluginPods = filterIntelGpuPluginPods(list.items);
|
||||||
foundPluginPods.push(...gpuPluginPods);
|
foundPluginPods.push(...gpuPluginPods);
|
||||||
|
|||||||
Reference in New Issue
Block a user