From b1878c684ea40cc2a773b13f86b8a3bd433b87bd Mon Sep 17 00:00:00 2001 From: Chris Farhood Date: Mon, 27 Apr 2026 00:00:56 +0000 Subject: [PATCH] fix: retry-aware pod state lookup + honest truncation cause messages (FAR-107) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The single-shot getPodTerminatedState query lost a real race against kubelet's containerStatus update: when Claude exited cleanly but quickly, listNamespacedPod often returned the pod with phase=Succeeded/Failed but without a populated state.terminated, so describeTruncationCause fell into the catch-all "pod state unavailable — likely deleted before exit could be read" branch. That message is doubly wrong: the pod was not deleted and the exit cause was readable a few hundred ms later. Operators chasing claude_truncated runs (Nancy/Privileged Escalation) had no visibility into the actual exit code, OOMKilled flag, or reason. Two changes: 1. Introduce lookupPodState + getPodLookupWithRetry — the lookup result carries the pod phase and a podMissing flag, and the retry wrapper makes up to 4 attempts spaced 500ms apart (at most ~1.5s of waiting) when the pod is in a terminal phase but containerStatuses lag. When the pod is in a non-terminal phase or genuinely gone we bail immediately without burning the retry budget. 2. describeTruncationCause now distinguishes three states: - "pod is gone" (eviction, preemption, external delete) - "container terminated state not yet observable (pod phase=…)" - the existing populated-state path with exit code / reason / signal The truncation error path re-queries with the retry-aware lookup right before producing the message, so subsequent claude_truncated errors surface the actual exit cause (137=OOMKilled, 143=SIGTERM, kubelet reason text) instead of a misleading deletion claim. 
Co-Authored-By: Paperclip --- src/server/execute.ts | 96 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 10 deletions(-) diff --git a/src/server/execute.ts b/src/server/execute.ts index fed6fb3..640465e 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -650,30 +650,82 @@ export interface PodTerminatedState { signal: number | null; } -async function getPodTerminatedState( +/** + * Result of a pod-state lookup. `state` is the terminated state when available; + * `phase` and `podMissing` give the caller enough context to render an honest + * truncation-cause message instead of guessing "likely deleted" (FAR-107). + */ +export interface PodLookupResult { + state: PodTerminatedState | null; + phase: string | null; + podMissing: boolean; +} + +async function lookupPodState( namespace: string, jobName: string, kubeconfigPath?: string, -): Promise { +): Promise { const coreApi = getCoreApi(kubeconfigPath); const podList = await coreApi.listNamespacedPod({ namespace, labelSelector: `job-name=${jobName}`, }); const pod = podList.items[0]; - if (!pod) return null; + if (!pod) return { state: null, phase: null, podMissing: true }; + const phase = pod.status?.phase ?? null; const containerStatus = pod.status?.containerStatuses?.find((s) => s.name === "claude"); const terminated = containerStatus?.state?.terminated; - if (!terminated) return null; + if (!terminated) return { state: null, phase, podMissing: false }; return { - exitCode: terminated.exitCode ?? null, - reason: terminated.reason ?? null, - message: (terminated.message ?? "").trim() || null, - signal: terminated.signal ?? null, + state: { + exitCode: terminated.exitCode ?? null, + reason: terminated.reason ?? null, + message: (terminated.message ?? "").trim() || null, + signal: terminated.signal ?? 
null, + }, + phase, + podMissing: false, }; } +/** + * Read the claude container's terminated state, retrying briefly when the pod + * exists in a terminal phase but kubelet has not yet propagated the + * containerStatuses[].state.terminated field. Without this retry, fast + * truncated-stream exits surface as "pod state unavailable" (FAR-107) and + * mask the real exit code / OOMKilled / SIGTERM cause. + */ +async function getPodLookupWithRetry( + namespace: string, + jobName: string, + kubeconfigPath?: string, + attempts = 4, + delayMs = 500, +): Promise { + let last: PodLookupResult = { state: null, phase: null, podMissing: true }; + for (let i = 0; i < attempts; i++) { + last = await lookupPodState(namespace, jobName, kubeconfigPath); + if (last.state) return last; + if (last.podMissing) return last; + // Pod exists but no terminated state. If it is in a terminal phase the + // containerStatuses update is in flight — wait briefly and retry. If it + // is still Running/Pending, retrying is unlikely to help, so bail. + if (last.phase !== "Succeeded" && last.phase !== "Failed") return last; + if (i < attempts - 1) await new Promise((r) => setTimeout(r, delayMs)); + } + return last; +} + +async function getPodTerminatedState( + namespace: string, + jobName: string, + kubeconfigPath?: string, +): Promise { + return (await lookupPodState(namespace, jobName, kubeconfigPath)).state; +} + /** * Format a human-readable explanation for a truncated run, including the * pod's claude-container terminated state when available. 
Exit code 137 @@ -682,9 +734,17 @@ async function getPodTerminatedState( */ export function describeTruncationCause( state: PodTerminatedState | null, + lookup?: PodLookupResult, ): string { if (!state) { - return "pod state unavailable — likely deleted before exit could be read"; + if (lookup?.podMissing) { + return "pod is gone — Job pod was removed (eviction, preemption, or external delete) before exit could be read"; + } + if (lookup && !lookup.podMissing) { + const phaseHint = lookup.phase ? `pod phase=${lookup.phase}` : "pod present"; + return `container terminated state not yet observable (${phaseHint}) — kubelet status update did not land within retry window; exit cause unknown`; + } + return "pod state unavailable — exit cause unknown"; } const parts: string[] = []; if (state.exitCode !== null) { @@ -1554,7 +1614,23 @@ export async function execute(ctx: AdapterExecutionContext): Promise {}); + } + const cause = describeTruncationCause(refreshedState, lookup); const modelHint = parsedStream.model ? ` (model: ${parsedStream.model})` : ""; return { exitCode,