fix: reattach to orphaned K8s Jobs across Paperclip restarts (FAR-124)
When the Paperclip pod restarts mid-run, the in-process setInterval keepalive dies, `updatedAt` goes stale, and the server's orphan reaper fails the run with the (misleading) "child pid 1 is no longer running" message. Paperclip then dispatches a continuation run, whose execute() finds the previous run's K8s Job still happily running and deletes it as an "orphan" — throwing away work and producing the transcript/run cascade reported on FAR-124.

Changes:

- job-manifest: add `paperclip.io/task-id` and `paperclip.io/session-id` labels (sanitized via the new `sanitizeLabelValue` helper) so a later execute() can identify an orphan as the continuation of the same logical unit of work.
- execute: in the concurrency guard, when `reattachOrphanedJobs` is on (default) and an orphan matches agent + task + session and is not terminal, pick it as the reattach target; delete only the other orphans. Branch the build/create/waitForPod block so the reattach path skips manifest building, Secret creation, Job creation, and the scheduling wait — it jumps straight to streaming logs and waiting for the existing pod's completion.
- config-schema: expose the `reattachOrphanedJobs` toggle (default true).
- Tests: `sanitizeLabelValue`, `isReattachableOrphan`, new label presence/absence, config default.

No server-side changes; the misleading reaper message and the lack of a non-local retry path will be addressed in a follow-up upstream PR.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -1,5 +1,29 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { isK8s404, buildPartialRunError } from "./execute.js";
|
||||
import type * as k8s from "@kubernetes/client-node";
|
||||
import { isK8s404, buildPartialRunError, isReattachableOrphan } from "./execute.js";
|
||||
|
||||
/**
 * Builds a minimal Job fixture for the `isReattachableOrphan` tests.
 *
 * The fixture is always named "ac-job" in the "paperclip" namespace and always
 * carries the `paperclip.io/adapter-type` label (defaulting to "claude_k8s").
 * The agent/run/task/session labels are only attached when the matching
 * option is a non-empty string, which lets a test simulate a Job that lacks
 * one of them.
 *
 * @param opts - Label values and terminal state for the fixture.
 * @returns A partial Job object cast to `k8s.V1Job`; only `metadata` and
 *   `status.conditions` are populated.
 */
function makeJob(opts: {
  runId?: string;
  agentId?: string;
  taskId?: string;
  sessionId?: string;
  adapterType?: string;
  terminal?: boolean;
}): k8s.V1Job {
  const labels: Record<string, string> = {
    "paperclip.io/adapter-type": opts.adapterType ?? "claude_k8s",
  };

  // Truthiness checks on purpose: an omitted (or empty) option leaves the
  // label off the fixture entirely rather than setting it to "".
  if (opts.agentId) labels["paperclip.io/agent-id"] = opts.agentId;
  if (opts.runId) labels["paperclip.io/run-id"] = opts.runId;
  if (opts.taskId) labels["paperclip.io/task-id"] = opts.taskId;
  if (opts.sessionId) labels["paperclip.io/session-id"] = opts.sessionId;

  return {
    metadata: { name: "ac-job", namespace: "paperclip", labels },
    // A terminal fixture reports a single Complete=True condition; a
    // non-terminal one reports no conditions at all.
    status: opts.terminal
      ? { conditions: [{ type: "Complete", status: "True" }] }
      : { conditions: [] },
  } as k8s.V1Job;
}
|
||||
|
||||
describe("isK8s404", () => {
|
||||
it("returns false for non-Error values", () => {
|
||||
@@ -106,3 +130,59 @@ describe("buildPartialRunError", () => {
|
||||
expect(msg).toBe("Claude exited with code 1: real error line");
|
||||
});
|
||||
});
|
||||
|
||||
// Suite for isReattachableOrphan: it is expected to return true only when the
// Job's agent, task and session labels all match the expected identity, the
// Job carries the default adapter type, and the Job is not already Complete.
describe("isReattachableOrphan", () => {
  const agentId = "agent-abc";
  const taskId = "task-xyz";
  const sessionId = "sess-123";

  // Identity the caller is looking for; individual cases override one field.
  const expected = { agentId, taskId, sessionId };

  it("returns true when agent/task/session all match and Job is not terminal", () => {
    const job = makeJob({ agentId, taskId, sessionId, runId: "old-run" });
    expect(isReattachableOrphan(job, expected)).toBe(true);
  });

  it("returns false when the Job is already Complete", () => {
    const job = makeJob({ agentId, taskId, sessionId, runId: "old-run", terminal: true });
    expect(isReattachableOrphan(job, expected)).toBe(false);
  });

  // A null expected id must never match, even if the Job carries a label.
  it("returns false when expected taskId is null (caller couldn't derive one)", () => {
    const job = makeJob({ agentId, taskId, sessionId });
    expect(isReattachableOrphan(job, { ...expected, taskId: null })).toBe(false);
  });

  it("returns false when expected sessionId is null", () => {
    const job = makeJob({ agentId, taskId, sessionId });
    expect(isReattachableOrphan(job, { ...expected, sessionId: null })).toBe(false);
  });

  // Single-field mismatches: each label is checked independently.
  it("returns false when agent id doesn't match", () => {
    const job = makeJob({ agentId: "agent-other", taskId, sessionId });
    expect(isReattachableOrphan(job, expected)).toBe(false);
  });

  it("returns false when task id doesn't match", () => {
    const job = makeJob({ agentId, taskId: "task-other", sessionId });
    expect(isReattachableOrphan(job, expected)).toBe(false);
  });

  it("returns false when session id doesn't match", () => {
    const job = makeJob({ agentId, taskId, sessionId: "sess-other" });
    expect(isReattachableOrphan(job, expected)).toBe(false);
  });

  it("returns false when the Job is from a different adapter type", () => {
    const job = makeJob({ agentId, taskId, sessionId, adapterType: "claude_local" });
    expect(isReattachableOrphan(job, expected)).toBe(false);
  });

  // Jobs that lack the task/session labels altogether must not be reattached.
  it("returns false when Job has no task-id label (labels were introduced in FAR-124)", () => {
    const job = makeJob({ agentId, sessionId });
    expect(isReattachableOrphan(job, expected)).toBe(false);
  });

  it("returns false when Job has no session-id label", () => {
    const job = makeJob({ agentId, taskId });
    expect(isReattachableOrphan(job, expected)).toBe(false);
  });
});
|
||||
|
||||
Reference in New Issue
Block a user