From 6923597b3134076afc8ed1245892b9ec493712bc Mon Sep 17 00:00:00 2001 From: Chris Farhood Date: Sun, 26 Apr 2026 21:19:02 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20do=20not=20delete=20active=20Jobs=20on?= =?UTF-8?q?=20SIGTERM=20=E2=80=94=20leave=20for=20orphan=20reattach=20(FAR?= =?UTF-8?q?-107)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of Nancy's k8s_job_deleted_externally false positive: the paperclip server itself receives SIGTERM during rolling deploys, evictions, scale-down, etc. The previous SIGTERM handler iterated activeJobs and deleted every Job before exiting, which surfaced in the in-flight heartbeat as "K8s Job was deleted externally" — even though nothing external touched it. With reattachOrphanedJobs=true (default), this is exactly the wrong behaviour: leaving the Jobs alive lets the next paperclip process discover them via the orphan-classification path and reattach their log streams. With reattachOrphanedJobs=false the operator opted into manual cleanup, so we still must not auto-delete. The Job's ownerReference (FAR-15) keeps the prompt Secret tied to the Job, so both survive together and TTL handles cleanup on natural completion. Test rewritten to assert the new contract: SIGTERM must not touch K8s Jobs. Co-Authored-By: Paperclip --- src/server/execute.test.ts | 14 ++++++++------ src/server/execute.ts | 38 ++++++++++++++------------------------ 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/src/server/execute.test.ts b/src/server/execute.test.ts index cf7c63a..a56b411 100644 --- a/src/server/execute.test.ts +++ b/src/server/execute.test.ts @@ -1771,7 +1771,7 @@ describe("execute: SIGTERM handler best-effort cleanup", () => { vi.useRealTimers(); }); - it("deletes the active Job when SIGTERM fires during execution", async () => { + it("does NOT delete active Jobs on SIGTERM — leaves them for orphan reattach (FAR-107)", async () => { // Mock process.kill to prevent the test process from actually being killed. const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true); @@ -1782,17 +1782,19 @@ describe("execute: SIGTERM handler best-effort cleanup", () => { // Flush microtasks through the async setup chain: getSelfPodInfo, listJobs, // readSkillEntries, prepareBundle, createJob, onLog, activeJobs.add(), and // ensureSigtermHandler() all complete before the try block enters streaming. - // 30 rounds is more than enough for the ~7 sequential await points. for (let i = 0; i < 30; i++) await Promise.resolve(); - // Emit SIGTERM — the process.once handler fires synchronously and kicks off - // async cleanup (deleteNamespacedJob). The mock resolves immediately. + // Reset deleteJob spy after setup so we can detect any SIGTERM-driven calls. + mockBatchDeleteJob.mockClear(); + + // Emit SIGTERM — the handler must re-raise to the default handler without + // touching the K8s Job. Deleting the Job here would surface as + // k8s_job_deleted_externally in the in-flight run (FAR-107). process.emit("SIGTERM"); - // Flush microtasks for deleteJob to resolve and the .then(process.kill) to run. for (let i = 0; i < 10; i++) await Promise.resolve(); - expect(mockBatchDeleteJob).toHaveBeenCalled(); + expect(mockBatchDeleteJob).not.toHaveBeenCalled(); expect(killSpy).toHaveBeenCalledWith(process.pid, "SIGTERM"); killSpy.mockRestore(); diff --git a/src/server/execute.ts b/src/server/execute.ts index 8473ce2..1f0c1c4 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -58,30 +58,20 @@ function ensureSigtermHandler(): void { if (sigtermHandlerRegistered) return; sigtermHandlerRegistered = true; process.once("SIGTERM", () => { - const jobs = [...activeJobs]; - void Promise.allSettled( - jobs.map(async (ref) => { - try { - const batchApi = getBatchApi(ref.kubeconfigPath); - await batchApi.deleteNamespacedJob({ - name: ref.jobName, - namespace: ref.namespace, - body: { propagationPolicy: "Background" }, - }); - } catch { /* best-effort */ } - if (ref.promptSecretName && ref.promptSecretNamespace) { - try { - const coreApi = getCoreApi(ref.kubeconfigPath); - await coreApi.deleteNamespacedSecret({ - name: ref.promptSecretName, - namespace: ref.promptSecretNamespace, - }); - } catch { /* best-effort */ } - } - }), - ).then(() => { - process.kill(process.pid, "SIGTERM"); - }); + // Do NOT delete active K8s Jobs on SIGTERM (FAR-107). Paperclip itself + // receives SIGTERM during rolling deploys, evictions, scale-down, etc. + // Deleting the Jobs we own there causes the in-flight heartbeat to surface + // a false-positive `k8s_job_deleted_externally` error and tears down work + // the user expected to keep running. + // + // The correct behaviour with `reattachOrphanedJobs=true` (default) is to + // leave the Jobs alive: the next paperclip process discovers them via the + // orphan-classification path and reattaches their log streams. When + // `reattachOrphanedJobs=false` the operator explicitly opted into manual + // cleanup and should not have us auto-deleting either. The owning Job's + // ownerReference (FAR-15) keeps the prompt Secret tied to the Job, so + // both survive together and TTL cleans them up after natural completion. + process.kill(process.pid, "SIGTERM"); }); }