From 8474f78fe1135d04469959969eb0550c82615864 Mon Sep 17 00:00:00 2001 From: Chris Farhood Date: Sun, 26 Apr 2026 01:57:43 +0000 Subject: [PATCH] fix: include pod terminated reason/message in claude_truncated error (FAR-95) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Capture the claude container's terminated state (exit code, reason, message, signal) and surface it in the truncation error so operators see *why* the run was cut short — e.g. "exit code 137, SIGKILL (commonly OOMKilled), reason=OOMKilled, message=Memory cgroup out of memory" instead of just a "truncated" label with no diagnostic context. Co-Authored-By: Paperclip --- src/server/execute.test.ts | 34 ++++++++++++++++++-- src/server/execute.ts | 64 +++++++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 6 deletions(-) diff --git a/src/server/execute.test.ts b/src/server/execute.test.ts index 0aaf02f..31f3d1e 100644 --- a/src/server/execute.test.ts +++ b/src/server/execute.test.ts @@ -60,7 +60,7 @@ vi.mock("@paperclipai/adapter-utils/server-utils", async (importOriginal) => { }); }); -const { isK8s404, buildPartialRunError, classifyOrphan, describePodTerminatedError, streamPodLogsOnce, shouldAbortForCancellation, execute } = await import("./execute.js"); +const { isK8s404, buildPartialRunError, classifyOrphan, describePodTerminatedError, describeTruncationCause, streamPodLogsOnce, shouldAbortForCancellation, execute } = await import("./execute.js"); function makeJob(opts: { runId?: string; @@ -362,6 +362,33 @@ describe("describePodTerminatedError", () => { }); }); +describe("describeTruncationCause", () => { + it("annotates exit code 137 as SIGKILL/OOM", () => { + const msg = describeTruncationCause({ exitCode: 137, reason: "OOMKilled", message: "Memory cgroup out of memory", signal: null }); + expect(msg).toContain("exit code 137"); + expect(msg).toContain("SIGKILL"); + expect(msg).toContain("OOMKilled"); + expect(msg).toContain("Memory cgroup out of memory"); + }); + + it("annotates exit code 143 as SIGTERM", () => { + const msg = describeTruncationCause({ exitCode: 143, reason: null, message: null, signal: null }); + expect(msg).toContain("exit code 143"); + expect(msg).toContain("SIGTERM"); + }); + + it("falls back to 'pod state unavailable' when state is null", () => { + const msg = describeTruncationCause(null); + expect(msg).toContain("pod state unavailable"); + }); + + it("emits 'no exit code' when exitCode is null but state exists", () => { + const msg = describeTruncationCause({ exitCode: null, reason: "Error", message: null, signal: null }); + expect(msg).toContain("no exit code"); + expect(msg).toContain("reason=Error"); + }); +}); + describe("execute: all-invalid agent.id (N4)", () => { it("returns hard error without creating a Job when agent.id sanitizes to null", async () => { const logs: string[] = []; @@ -1019,7 +1046,7 @@ describe("execute: happy path", () => { }, ); mockCoreListPods.mockResolvedValue({ - items: [{ metadata: { name: "pod-abc" }, status: { containerStatuses: [{ name: "claude", state: { terminated: { exitCode: 137 } } }] } }], + items: [{ metadata: { name: "pod-abc" }, status: { containerStatuses: [{ name: "claude", state: { terminated: { exitCode: 137, reason: "OOMKilled", message: "Memory cgroup out of memory" } } }] } }], }); const executePromise = execute(makeCtx()); @@ -1030,6 +1057,9 @@ describe("execute: happy path", () => { expect(result.errorMessage).toContain("truncated mid-stream"); expect(result.errorMessage).toContain("claude-opus-4-7"); expect(result.errorMessage).toContain("exit code 137"); + expect(result.errorMessage).toContain("SIGKILL"); + expect(result.errorMessage).toContain("OOMKilled"); + expect(result.errorMessage).toContain("Memory cgroup out of memory"); }); it("reconnects log stream and logs status when job completion takes > 3s", async () => { diff --git a/src/server/execute.ts b/src/server/execute.ts index ea70f50..558fe81 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -574,6 +574,27 @@ async function waitForJobCompletion( * Get the exit code from the Job's pod. */ async function getPodExitCode(namespace: string, jobName: string, kubeconfigPath?: string): Promise { + const state = await getPodTerminatedState(namespace, jobName, kubeconfigPath); + return state?.exitCode ?? null; +} + +/** + * Get the claude container's terminated state (exit code, reason, message, + * signal) from the Job's pod. Returns null if the pod or container is gone. + * Used by the no-result error path to explain *why* a run was truncated. + */ +export interface PodTerminatedState { + exitCode: number | null; + reason: string | null; + message: string | null; + signal: number | null; +} + +async function getPodTerminatedState( + namespace: string, + jobName: string, + kubeconfigPath?: string, +): Promise { const coreApi = getCoreApi(kubeconfigPath); const podList = await coreApi.listNamespacedPod({ namespace, @@ -583,7 +604,40 @@ async function getPodExitCode(namespace: string, jobName: string, kubeconfigPath if (!pod) return null; const containerStatus = pod.status?.containerStatuses?.find((s) => s.name === "claude"); - return containerStatus?.state?.terminated?.exitCode ?? null; + const terminated = containerStatus?.state?.terminated; + if (!terminated) return null; + return { + exitCode: terminated.exitCode ?? null, + reason: terminated.reason ?? null, + message: (terminated.message ?? "").trim() || null, + signal: terminated.signal ?? null, + }; +} + +/** + * Format a human-readable explanation for a truncated run, including the + * pod's claude-container terminated state when available. Exit code 137 + * is annotated as SIGKILL/OOM since that is the most common cause. + * Exported for unit tests. + */ +export function describeTruncationCause( + state: PodTerminatedState | null, +): string { + if (!state) { + return "pod state unavailable — likely deleted before exit could be read"; + } + const parts: string[] = []; + if (state.exitCode !== null) { + parts.push(`exit code ${state.exitCode}`); + if (state.exitCode === 137) parts.push("SIGKILL (commonly OOMKilled)"); + else if (state.exitCode === 143) parts.push("SIGTERM"); + } else { + parts.push("no exit code"); + } + if (state.signal !== null) parts.push(`signal ${state.signal}`); + if (state.reason) parts.push(`reason=${state.reason}`); + if (state.message) parts.push(`message=${state.message}`); + return parts.join(", "); } /** @@ -998,6 +1052,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise | null = null; // Set when we return a mismatch error so the finally block knows not to @@ -1297,7 +1352,8 @@ export async function execute(ctx: AdapterExecutionContext): Promise