diff --git a/src/server/execute.test.ts b/src/server/execute.test.ts index cf7c63a..a56b411 100644 --- a/src/server/execute.test.ts +++ b/src/server/execute.test.ts @@ -1771,7 +1771,7 @@ describe("execute: SIGTERM handler best-effort cleanup", () => { vi.useRealTimers(); }); - it("deletes the active Job when SIGTERM fires during execution", async () => { + it("does NOT delete active Jobs on SIGTERM — leaves them for orphan reattach (FAR-107)", async () => { // Mock process.kill to prevent the test process from actually being killed. const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true); @@ -1782,17 +1782,19 @@ describe("execute: SIGTERM handler best-effort cleanup", () => { // Flush microtasks through the async setup chain: getSelfPodInfo, listJobs, // readSkillEntries, prepareBundle, createJob, onLog, activeJobs.add(), and // ensureSigtermHandler() all complete before the try block enters streaming. - // 30 rounds is more than enough for the ~7 sequential await points. for (let i = 0; i < 30; i++) await Promise.resolve(); - // Emit SIGTERM — the process.once handler fires synchronously and kicks off - // async cleanup (deleteNamespacedJob). The mock resolves immediately. + // Reset deleteJob spy after setup so we can detect any SIGTERM-driven calls. + mockBatchDeleteJob.mockClear(); + + // Emit SIGTERM — the handler must re-raise to the default handler without + // touching the K8s Job. Deleting the Job here would surface as + // k8s_job_deleted_externally in the in-flight run (FAR-107). process.emit("SIGTERM"); - // Flush microtasks for deleteJob to resolve and the .then(process.kill) to run. for (let i = 0; i < 10; i++) await Promise.resolve(); - expect(mockBatchDeleteJob).toHaveBeenCalled(); + expect(mockBatchDeleteJob).not.toHaveBeenCalled(); expect(killSpy).toHaveBeenCalledWith(process.pid, "SIGTERM"); killSpy.mockRestore(); diff --git a/src/server/execute.ts b/src/server/execute.ts index 8473ce2..1f0c1c4 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -58,30 +58,20 @@ function ensureSigtermHandler(): void { if (sigtermHandlerRegistered) return; sigtermHandlerRegistered = true; process.once("SIGTERM", () => { - const jobs = [...activeJobs]; - void Promise.allSettled( - jobs.map(async (ref) => { - try { - const batchApi = getBatchApi(ref.kubeconfigPath); - await batchApi.deleteNamespacedJob({ - name: ref.jobName, - namespace: ref.namespace, - body: { propagationPolicy: "Background" }, - }); - } catch { /* best-effort */ } - if (ref.promptSecretName && ref.promptSecretNamespace) { - try { - const coreApi = getCoreApi(ref.kubeconfigPath); - await coreApi.deleteNamespacedSecret({ - name: ref.promptSecretName, - namespace: ref.promptSecretNamespace, - }); - } catch { /* best-effort */ } - } - }), - ).then(() => { - process.kill(process.pid, "SIGTERM"); - }); + // Do NOT delete active K8s Jobs on SIGTERM (FAR-107). Paperclip itself + // receives SIGTERM during rolling deploys, evictions, scale-down, etc. + // Deleting the Jobs we own there causes the in-flight heartbeat to surface + // a false-positive `k8s_job_deleted_externally` error and tears down work + // the user expected to keep running. + // + // The correct behaviour with `reattachOrphanedJobs=true` (default) is to + // leave the Jobs alive: the next paperclip process discovers them via the + // orphan-classification path and reattaches their log streams. When + // `reattachOrphanedJobs=false` the operator explicitly opted into manual + // cleanup and should not have us auto-deleting either. The owning Job's + // ownerReference (FAR-15) keeps the prompt Secret tied to the Job, so + // both survive together and TTL cleans them up after natural completion. + process.kill(process.pid, "SIGTERM"); }); }