fix: prevent process_lost when K8s Job completes (FAR-10)
Four stacked bugs caused the adapter to hang after K8s Job completion, allowing the 5-minute reaper to mark runs process_lost even when the Job actually succeeded.
- streamPodLogsOnce: add a stopSignal polling loop that destroys the writable every 200ms once the job-completion branch fires, aborting any in-flight follow stream that would otherwise hang indefinitely
- waitForPod: treat phase=Failed as a terminal error (throw via describePodTerminatedError) instead of entering the log-stream path with a dead pod (the new describePodTerminatedError helper is exported for unit tests)
- waitForPod: surface cs.state?.terminated in the per-tick detail line so operators see the exit code / reason without needing kubectl
- keepalive: add a POST_TERMINAL_KEEPALIVE_MS (90s) window after the Job goes terminal so onSpawn keeps refreshing updatedAt during cleanup; if execute() genuinely stalls past 90s the reaper will still catch it
Regression tests added for describePodTerminatedError (phase=Failed with and without claude container status).
Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import type * as k8s from "@kubernetes/client-node";
|
||||
import { isK8s404, buildPartialRunError, isReattachableOrphan } from "./execute.js";
|
||||
import { isK8s404, buildPartialRunError, isReattachableOrphan, describePodTerminatedError } from "./execute.js";
|
||||
|
||||
function makeJob(opts: {
|
||||
runId?: string;
|
||||
@@ -186,3 +186,62 @@ describe("isReattachableOrphan", () => {
|
||||
expect(isReattachableOrphan(job, { agentId, taskId, sessionId })).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// Regression: FAR-10 — waitForPod must throw on phase=Failed, not return the pod name.
|
||||
// These tests cover describePodTerminatedError, the helper that waitForPod uses to build
|
||||
// the error message before throwing. Verifies that phase=Failed with no claude logs
|
||||
// produces a structured, actionable error instead of silently entering the log-stream path.
|
||||
describe("describePodTerminatedError", () => {
  // Every case describes the same pod in the same phase; only the container
  // status list passed to the helper changes between cases.
  const podName = "mypod";
  const podPhase = "Failed";

  // Exact text asserted by the cases that supply no status entry named "claude".
  const genericMessage = "Pod mypod reached phase=Failed";

  // Calls the helper under test with the shared pod name and phase.
  const describeFor = (statuses: k8s.V1ContainerStatus[]) =>
    describePodTerminatedError(podName, podPhase, statuses);

  it("includes exit code and reason when claude container status is available", () => {
    // Fixtures are deliberately partial status objects, hence the cast.
    const msg = describeFor([
      { name: "claude", state: { terminated: { exitCode: 137, reason: "OOMKilled" } } },
    ] as k8s.V1ContainerStatus[]);

    for (const fragment of ["137", "OOMKilled", "phase=Failed"]) {
      expect(msg).toContain(fragment);
    }
  });

  it("falls back to message field when reason is absent", () => {
    const msg = describeFor([
      { name: "claude", state: { terminated: { exitCode: 1, message: "signal: killed" } } },
    ] as k8s.V1ContainerStatus[]);

    for (const fragment of ["signal: killed", "1"]) {
      expect(msg).toContain(fragment);
    }
  });

  it("returns generic message when no claude container status is present", () => {
    expect(describeFor([])).toBe(genericMessage);
  });

  it("ignores non-claude containers", () => {
    const msg = describeFor([
      { name: "sidecar", state: { terminated: { exitCode: 0, reason: "Completed" } } },
    ] as k8s.V1ContainerStatus[]);

    expect(msg).toBe(genericMessage);
  });

  it("handles null exitCode gracefully", () => {
    // A null exitCode does not fit the declared status type, so this fixture
    // is cast through unknown.
    const msg = describeFor([
      { name: "claude", state: { terminated: { exitCode: null, reason: "Error" } } },
    ] as unknown as k8s.V1ContainerStatus[]);

    for (const fragment of ["unknown", "Error"]) {
      expect(msg).toContain(fragment);
    }
  });
});
|
||||
|
||||
Reference in New Issue
Block a user