From 49288fa5c737af9737a27c61b1ccd7940638498e Mon Sep 17 00:00:00 2001 From: Chris Farhood Date: Sun, 26 Apr 2026 21:24:11 +0000 Subject: [PATCH] fix: scope cancel-polling to explicit cancellation states only (FAR-107) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit shouldAbortForCancellation previously treated any non-`running` runStatus as a cancellation signal — which made the keepalive's cancel-poll delete the K8s Job whenever the heartbeat-runs API briefly returned a transient or stale status (e.g. queued, pending, succeeded, failed, completed, unknown) for an in-flight run. The follow-up `waitForJobCompletion` poll then observed the 404 and surfaced a spurious `k8s_job_deleted_externally` error to the user, even though no human or external system deleted the Job. Privileged Escalation's "null-pointer-nancy" agent reproduced this on runs that were never cancelled and were not adjacent to a paperclip restart, ruling out the SIGTERM path that 0.1.50 already addressed. Tighten the guard to fire only on `cancelled` / `cancelling`. Other terminal statuses are unreachable in practice while the adapter is still executing (the adapter's own return is what flips them) and even if observed mid-run, they do not justify deleting a Job that may still be doing real work — the natural completion path will tear it down. 
Co-Authored-By: Paperclip --- src/server/execute.test.ts | 20 ++++++++++++++------ src/server/execute.ts | 16 +++++++++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/server/execute.test.ts b/src/server/execute.test.ts index a56b411..b4f435a 100644 --- a/src/server/execute.test.ts +++ b/src/server/execute.test.ts @@ -1561,16 +1561,24 @@ describe("shouldAbortForCancellation", () => { expect(shouldAbortForCancellation("cancelled")).toBe(true); }); - it("returns true when status is 'failed'", () => { - expect(shouldAbortForCancellation("failed")).toBe(true); + it("returns true when status is 'cancelling'", () => { + expect(shouldAbortForCancellation("cancelling")).toBe(true); }); - it("returns true when status is 'completed'", () => { - expect(shouldAbortForCancellation("completed")).toBe(true); + // FAR-107: terminal-but-not-cancelled statuses MUST NOT trigger Job deletion. + // The previous "anything but running" guard caused k8s_job_deleted_externally + // false positives for in-flight runs whenever the API briefly reported a + // transient/stale status. 
+ it("returns false for non-cancellation terminal statuses (FAR-107)", () => { + expect(shouldAbortForCancellation("succeeded")).toBe(false); + expect(shouldAbortForCancellation("failed")).toBe(false); + expect(shouldAbortForCancellation("completed")).toBe(false); }); - it("returns true for any non-running non-empty string", () => { - expect(shouldAbortForCancellation("unknown")).toBe(true); + it("returns false for unknown statuses (FAR-107)", () => { + expect(shouldAbortForCancellation("unknown")).toBe(false); + expect(shouldAbortForCancellation("queued")).toBe(false); + expect(shouldAbortForCancellation("pending")).toBe(false); }); }); diff --git a/src/server/execute.ts b/src/server/execute.ts index 1f0c1c4..fed6fb3 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -90,13 +90,23 @@ export function isK8s404(err: unknown): boolean { } /** - * Returns true when the heartbeat-run status indicates the run is no longer - * active and the K8s Job should be cancelled. + * Returns true when the heartbeat-run status indicates the run was explicitly + * cancelled and the K8s Job must be torn down. + * + * Only `cancelled` / `cancelling` qualify. Treating any non-`running` status + * as cancellation (the previous behaviour) produced spurious + * k8s_job_deleted_externally errors for in-flight runs whenever the API + * briefly reported a transient or stale status — Nancy's runs at + * Privileged Escalation hit this without anyone actually cancelling them + * (FAR-107). Other terminal statuses (`succeeded`/`failed`/`completed`) + * are unreachable in practice while the adapter is still executing + * (the adapter's own return is what flips them) and even if observed, + * they do not warrant our deleting a Job that may still be doing work. * Exported for unit tests. 
*/ export function shouldAbortForCancellation(runStatus: string | undefined): boolean { if (!runStatus) return false; - return runStatus !== "running"; + return runStatus === "cancelled" || runStatus === "cancelling"; } /**