From be84428226378ad966db14ea5241c543d41e865b Mon Sep 17 00:00:00 2001 From: Chris Farhood Date: Sun, 26 Apr 2026 21:05:04 +0000 Subject: [PATCH] fix: enrich k8s_job_deleted_externally error with forensics + verify Job presence on grace fire (FAR-107) The error previously fired with no diagnostic context, making it impossible to distinguish (a) self-delete by our SIGTERM/cancel path, (b) TTL after a missed Complete condition, or (c) actual external deletion without cluster shell access. Two changes: 1. Grace-period verification: when the log stream exits and the 30s grace timer fires, do a one-shot readNamespacedJob before declaring the Job gone. If it's still there, settle as gracePeriodFired (not jobGone) so we don't mis-classify K8s condition propagation lag as deletion. 2. Forensic capture: track which of the three detection paths (completion-poll-404, grace-period-verify-404, recheck-poll-404) first observed the 404, the last successful Job conditions read, the poll count, elapsed time since pod-running, and stdout size. Append all of it to the errorMessage so the next occurrence is self-diagnosing. Co-Authored-By: Paperclip --- src/server/execute.test.ts | 3 +- src/server/execute.ts | 91 +++++++++++++++++++++++++++++++++++--- 2 files changed, 87 insertions(+), 7 deletions(-) diff --git a/src/server/execute.test.ts b/src/server/execute.test.ts index c27730b..cf7c63a 100644 --- a/src/server/execute.test.ts +++ b/src/server/execute.test.ts @@ -1019,7 +1019,8 @@ describe("execute: happy path", () => { const result = await executePromise; expect(result.errorCode).toBe("k8s_job_deleted_externally"); - expect(result.errorMessage).toBe("K8s Job was deleted externally before Claude could complete"); + expect(result.errorMessage).toMatch(/^K8s Job was deleted externally before Claude could complete \[/); + expect(result.errorMessage).toContain("detected_via="); expect(result.exitCode).toBeNull(); }); diff --git a/src/server/execute.ts b/src/server/execute.ts index b2f1c72..8473ce2 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -582,11 +582,14 @@ async function readPodLogs( * is treated as a soft terminal: succeeded=false, timedOut=false, jobGone=true. * The caller should log this and fall through to stdout parsing. */ +type JobConditionSnapshot = { type?: string; status?: string; reason?: string; message?: string }; + async function waitForJobCompletion( namespace: string, jobName: string, timeoutMs: number, kubeconfigPath?: string, + observer?: { lastConditions: JobConditionSnapshot[] | null; pollCount: number }, ): Promise<{ succeeded: boolean; timedOut: boolean; jobGone?: boolean }> { const batchApi = getBatchApi(kubeconfigPath); const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : 0; @@ -605,6 +608,12 @@ async function waitForJobCompletion( throw err; } const conditions = job.status?.conditions ?? []; + if (observer) { + observer.pollCount += 1; + observer.lastConditions = conditions.map((c) => ({ + type: c.type, status: c.status, reason: c.reason, message: c.message, + })); + } const complete = conditions.find((c) => c.type === "Complete" && c.status === "True"); if (complete) return { succeeded: true, timedOut: false }; @@ -1112,6 +1121,17 @@ export async function execute(ctx: AdapterExecutionContext): Promise | null = null; const completionWithGrace = new Promise((resolve, reject) => { let settled = false; @@ -1297,11 +1318,37 @@ export async function execute(ctx: AdapterExecutionContext): Promise { if (logExitTime !== null && Date.now() - logExitTime >= LOG_EXIT_COMPLETION_GRACE_MS) { - void onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago without K8s Job condition update — proceeding with captured output (FAR-23)\n`).catch(() => {}); - settleOk({ succeeded: false, timedOut: false, jobGone: true }); + // Stop the grace poller immediately so we don't double-fire while the + // verification read below is in flight. + if (gracePoller) { clearInterval(gracePoller); gracePoller = null; } + // The log stream exiting only means the container stopped producing + // output — it does NOT prove the Job was deleted. Verify Job + // presence with a one-shot read so we can distinguish: + // (a) Job 404 → truly gone (TTL or external deletion) + // (b) Job still present → K8s condition propagation lag (FAR-23) + // Without this check we mis-classify (b) as "deleted externally" and + // emit a false-positive k8s_job_deleted_externally error (FAR-107). + void (async () => { + try { + await getBatchApi(kubeconfigPath).readNamespacedJob({ name: jobName, namespace }); + await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago without K8s Job condition update; Job ${jobName} still present — proceeding with captured output (FAR-23)\n`).catch(() => {}); + settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true }); + } catch (err: unknown) { + if (isK8s404(err)) { + jobGoneDetectionPath = "grace-period-verify-404"; + jobGoneAt = Date.now(); + await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago and Job ${jobName} is gone (TTL or external deletion) — proceeding with captured output (FAR-23)\n`).catch(() => {}); + settleOk({ succeeded: false, timedOut: false, jobGone: true }); + } else { + // K8s API hiccup — bail out without claiming external deletion. + await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago; Job state unverifiable (${err instanceof Error ? err.message : String(err)}) — proceeding with captured output (FAR-23)\n`).catch(() => {}); + settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true }); + } + } + })(); } }, 1_000); }); @@ -1369,6 +1416,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise 0) { + const summary = lastConds + .map((c) => `${c.type}=${c.status}${c.reason ? `(${c.reason})` : ""}`) + .join(","); + detailParts.push(`last_job_conditions=[${summary}]`); + } else { + detailParts.push("last_job_conditions=none_observed"); + } + detailParts.push(`stdout_bytes=${stdout.length}`); + const stdoutLines = stdout.split("\n").filter((l) => l.trim()).length; + detailParts.push(`stdout_nonempty_lines=${stdoutLines}`); return { exitCode, signal: null, timedOut: false, - errorMessage: "K8s Job was deleted externally before Claude could complete", + errorMessage: `K8s Job was deleted externally before Claude could complete [${detailParts.join(", ")}]`, errorCode: "k8s_job_deleted_externally", resultJson: { stdout }, };