fix: prevent process_lost when K8s Job completes (FAR-10)

Four stacked bugs caused the adapter to hang after K8s Job completion,
allowing the 5-minute reaper to mark runs process_lost even when the Job
actually succeeded.

- streamPodLogsOnce: add stopSignal polling loop that destroys the
  writable every 200ms once the job-completion branch fires, aborting
  any in-flight follow stream that would otherwise hang indefinitely
- waitForPod: treat phase=Failed as a terminal error (throw via
  describePodTerminatedError) instead of entering the log-stream path
  with a dead pod (new helper is exported for unit tests)
- waitForPod: surface cs.state?.terminated in the per-tick detail line
  so operators see exit code / reason without needing kubectl
- keepalive: add POST_TERMINAL_KEEPALIVE_MS (90s) window after Job goes
  terminal so onSpawn keeps refreshing updatedAt during cleanup; if
  execute() genuinely stalls past 90s the reaper will still catch it

Regression tests added for describePodTerminatedError (phase=Failed
with and without claude container status).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Gandalf the Greybeard
2026-04-23 15:59:51 +00:00
parent 78fd702ccb
commit b3c1519cf5
2 changed files with 130 additions and 6 deletions
+60 -1
View File
@@ -1,6 +1,6 @@
import { describe, it, expect } from "vitest";
import type * as k8s from "@kubernetes/client-node";
import { isK8s404, buildPartialRunError, isReattachableOrphan } from "./execute.js";
import { isK8s404, buildPartialRunError, isReattachableOrphan, describePodTerminatedError } from "./execute.js";
function makeJob(opts: {
runId?: string;
@@ -186,3 +186,62 @@ describe("isReattachableOrphan", () => {
expect(isReattachableOrphan(job, { agentId, taskId, sessionId })).toBe(false);
});
});
// Regression: FAR-10 — waitForPod must throw on phase=Failed, not return the pod name.
// These tests cover describePodTerminatedError, the helper that waitForPod uses to build
// the error message before throwing. Verifies that phase=Failed with no claude logs
// produces a structured, actionable error instead of silently entering the log-stream path.
describe("describePodTerminatedError", () => {
it("includes exit code and reason when claude container status is available", () => {
const cs = [
{
name: "claude",
state: { terminated: { exitCode: 137, reason: "OOMKilled" } },
},
] as k8s.V1ContainerStatus[];
const msg = describePodTerminatedError("mypod", "Failed", cs);
expect(msg).toContain("137");
expect(msg).toContain("OOMKilled");
expect(msg).toContain("phase=Failed");
});
it("falls back to message field when reason is absent", () => {
const cs = [
{
name: "claude",
state: { terminated: { exitCode: 1, message: "signal: killed" } },
},
] as k8s.V1ContainerStatus[];
const msg = describePodTerminatedError("mypod", "Failed", cs);
expect(msg).toContain("signal: killed");
expect(msg).toContain("1");
});
it("returns generic message when no claude container status is present", () => {
const msg = describePodTerminatedError("mypod", "Failed", []);
expect(msg).toBe("Pod mypod reached phase=Failed");
});
it("ignores non-claude containers", () => {
const cs = [
{
name: "sidecar",
state: { terminated: { exitCode: 0, reason: "Completed" } },
},
] as k8s.V1ContainerStatus[];
const msg = describePodTerminatedError("mypod", "Failed", cs);
expect(msg).toBe("Pod mypod reached phase=Failed");
});
it("handles null exitCode gracefully", () => {
const cs = [
{
name: "claude",
state: { terminated: { exitCode: null, reason: "Error" } },
},
] as unknown as k8s.V1ContainerStatus[];
const msg = describePodTerminatedError("mypod", "Failed", cs);
expect(msg).toContain("unknown");
expect(msg).toContain("Error");
});
});