diff --git a/src/server/execute.test.ts b/src/server/execute.test.ts index c27730b..cf7c63a 100644 --- a/src/server/execute.test.ts +++ b/src/server/execute.test.ts @@ -1019,7 +1019,8 @@ describe("execute: happy path", () => { const result = await executePromise; expect(result.errorCode).toBe("k8s_job_deleted_externally"); - expect(result.errorMessage).toBe("K8s Job was deleted externally before Claude could complete"); + expect(result.errorMessage).toMatch(/^K8s Job was deleted externally before Claude could complete \[/); + expect(result.errorMessage).toContain("detected_via="); expect(result.exitCode).toBeNull(); }); diff --git a/src/server/execute.ts b/src/server/execute.ts index b2f1c72..8473ce2 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -582,11 +582,14 @@ async function readPodLogs( * is treated as a soft terminal: succeeded=false, timedOut=false, jobGone=true. * The caller should log this and fall through to stdout parsing. */ +type JobConditionSnapshot = { type?: string; status?: string; reason?: string; message?: string }; + async function waitForJobCompletion( namespace: string, jobName: string, timeoutMs: number, kubeconfigPath?: string, + observer?: { lastConditions: JobConditionSnapshot[] | null; pollCount: number }, ): Promise<{ succeeded: boolean; timedOut: boolean; jobGone?: boolean }> { const batchApi = getBatchApi(kubeconfigPath); const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : 0; @@ -605,6 +608,12 @@ async function waitForJobCompletion( throw err; } const conditions = job.status?.conditions ?? []; + if (observer) { + observer.pollCount += 1; + observer.lastConditions = conditions.map((c) => ({ + type: c.type, status: c.status, reason: c.reason, message: c.message, + })); + } const complete = conditions.find((c) => c.type === "Complete" && c.status === "True"); if (complete) return { succeeded: true, timedOut: false }; @@ -1112,6 +1121,17 @@ export async function execute(ctx: AdapterExecutionContext): Promise | null = null; const completionWithGrace = new Promise((resolve, reject) => { let settled = false; @@ -1297,11 +1318,37 @@ export async function execute(ctx: AdapterExecutionContext): Promise { if (logExitTime !== null && Date.now() - logExitTime >= LOG_EXIT_COMPLETION_GRACE_MS) { - void onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago without K8s Job condition update — proceeding with captured output (FAR-23)\n`).catch(() => {}); - settleOk({ succeeded: false, timedOut: false, jobGone: true }); + // Stop the grace poller immediately so we don't double-fire while the + // verification read below is in flight. + if (gracePoller) { clearInterval(gracePoller); gracePoller = null; } + // The log stream exiting only means the container stopped producing + // output — it does NOT prove the Job was deleted. Verify Job + // presence with a one-shot read so we can distinguish: + // (a) Job 404 → truly gone (TTL or external deletion) + // (b) Job still present → K8s condition propagation lag (FAR-23) + // Without this check we mis-classify (b) as "deleted externally" and + // emit a false-positive k8s_job_deleted_externally error (FAR-107). + void (async () => { + try { + await getBatchApi(kubeconfigPath).readNamespacedJob({ name: jobName, namespace }); + await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago without K8s Job condition update; Job ${jobName} still present — proceeding with captured output (FAR-23)\n`).catch(() => {}); + settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true }); + } catch (err: unknown) { + if (isK8s404(err)) { + jobGoneDetectionPath = "grace-period-verify-404"; + jobGoneAt = Date.now(); + await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago and Job ${jobName} is gone (TTL or external deletion) — proceeding with captured output (FAR-23)\n`).catch(() => {}); + settleOk({ succeeded: false, timedOut: false, jobGone: true }); + } else { + // K8s API hiccup — bail out without claiming external deletion. + await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago; Job state unverifiable (${err instanceof Error ? err.message : String(err)}) — proceeding with captured output (FAR-23)\n`).catch(() => {}); + settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true }); + } + } + })(); } }, 1_000); }); @@ -1369,6 +1416,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise 0) { + const summary = lastConds + .map((c) => `${c.type}=${c.status}${c.reason ? `(${c.reason})` : ""}`) + .join(","); + detailParts.push(`last_job_conditions=[${summary}]`); + } else { + detailParts.push("last_job_conditions=none_observed"); + } + detailParts.push(`stdout_bytes=${stdout.length}`); + const stdoutLines = stdout.split("\n").filter((l) => l.trim()).length; + detailParts.push(`stdout_nonempty_lines=${stdoutLines}`); return { exitCode, signal: null, timedOut: false, - errorMessage: "K8s Job was deleted externally before Claude could complete", + errorMessage: `K8s Job was deleted externally before Claude could complete [${detailParts.join(", ")}]`, errorCode: "k8s_job_deleted_externally", resultJson: { stdout }, };