diff --git a/src/server/execute.test.ts b/src/server/execute.test.ts index a56b411..b4f435a 100644 --- a/src/server/execute.test.ts +++ b/src/server/execute.test.ts @@ -1561,16 +1561,24 @@ describe("shouldAbortForCancellation", () => { expect(shouldAbortForCancellation("cancelled")).toBe(true); }); - it("returns true when status is 'failed'", () => { - expect(shouldAbortForCancellation("failed")).toBe(true); + it("returns true when status is 'cancelling'", () => { + expect(shouldAbortForCancellation("cancelling")).toBe(true); }); - it("returns true when status is 'completed'", () => { - expect(shouldAbortForCancellation("completed")).toBe(true); + // FAR-107: terminal-but-not-cancelled statuses MUST NOT trigger Job deletion. + // The previous "anything but running" guard caused k8s_job_deleted_externally + // false positives for in-flight runs whenever the API briefly reported a + // transient/stale status. + it("returns false for non-cancellation terminal statuses (FAR-107)", () => { + expect(shouldAbortForCancellation("succeeded")).toBe(false); + expect(shouldAbortForCancellation("failed")).toBe(false); + expect(shouldAbortForCancellation("completed")).toBe(false); }); - it("returns true for any non-running non-empty string", () => { - expect(shouldAbortForCancellation("unknown")).toBe(true); + it("returns false for unknown statuses (FAR-107)", () => { + expect(shouldAbortForCancellation("unknown")).toBe(false); + expect(shouldAbortForCancellation("queued")).toBe(false); + expect(shouldAbortForCancellation("pending")).toBe(false); }); }); diff --git a/src/server/execute.ts b/src/server/execute.ts index 1f0c1c4..fed6fb3 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -90,13 +90,23 @@ export function isK8s404(err: unknown): boolean { } /** - * Returns true when the heartbeat-run status indicates the run is no longer - * active and the K8s Job should be cancelled. 
+ * Returns true when the heartbeat-run status indicates the run was explicitly + * cancelled (or is being cancelled) and the K8s Job must be torn down. + * + * Only `cancelled` / `cancelling` qualify. Treating any non-`running` status + * as cancellation (the previous behaviour) produced spurious + * k8s_job_deleted_externally errors for in-flight runs whenever the API + * briefly reported a transient or stale status — Nancy's runs at + * Privileged Escalation hit this without anyone actually cancelling them + * (FAR-107). Other terminal statuses (`succeeded`/`failed`/`completed`) + * are unreachable in practice while the adapter is still executing + * (the adapter's own return is what flips them) and, even if observed, + * do not warrant deleting a Job that may still be doing work. * Exported for unit tests. */ export function shouldAbortForCancellation(runStatus: string | undefined): boolean { if (!runStatus) return false; - return runStatus !== "running"; + return runStatus === "cancelled" || runStatus === "cancelling"; } /**