fix: scope cancel-polling to explicit cancellation states only (FAR-107)
shouldAbortForCancellation previously treated any non-`running` runStatus as a cancellation signal — which made the keepalive's cancel-poll delete the K8s Job whenever the heartbeat-runs API briefly returned a transient or stale status (e.g. queued, pending, succeeded, failed, completed, unknown) for an in-flight run. The follow-up `waitForJobCompletion` poll then observed the 404 and surfaced a spurious `k8s_job_deleted_externally` error to the user, even though no human or external system deleted the Job. Privileged Escalation's "null-pointer-nancy" agent reproduced this on runs that were never cancelled and were not adjacent to a paperclip restart, ruling out the SIGTERM path that 0.1.50 already addressed. Tighten the guard to fire only on `cancelled` / `cancelling`. Other terminal statuses are unreachable in practice while the adapter is still executing (the adapter's own return is what flips them), and even if one is observed mid-run, it does not justify deleting a Job that may still be doing real work — the natural completion path will tear it down. Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -1561,16 +1561,24 @@ describe("shouldAbortForCancellation", () => {
|
||||
expect(shouldAbortForCancellation("cancelled")).toBe(true);
|
||||
});
|
||||
|
||||
it("returns true when status is 'failed'", () => {
|
||||
expect(shouldAbortForCancellation("failed")).toBe(true);
|
||||
it("returns true when status is 'cancelling'", () => {
|
||||
expect(shouldAbortForCancellation("cancelling")).toBe(true);
|
||||
});
|
||||
|
||||
it("returns true when status is 'completed'", () => {
|
||||
expect(shouldAbortForCancellation("completed")).toBe(true);
|
||||
// FAR-107: terminal-but-not-cancelled statuses MUST NOT trigger Job deletion.
|
||||
// The previous "anything but running" guard caused k8s_job_deleted_externally
|
||||
// false positives for in-flight runs whenever the API briefly reported a
|
||||
// transient/stale status.
|
||||
it("returns false for non-cancellation terminal statuses (FAR-107)", () => {
|
||||
expect(shouldAbortForCancellation("succeeded")).toBe(false);
|
||||
expect(shouldAbortForCancellation("failed")).toBe(false);
|
||||
expect(shouldAbortForCancellation("completed")).toBe(false);
|
||||
});
|
||||
|
||||
it("returns true for any non-running non-empty string", () => {
|
||||
expect(shouldAbortForCancellation("unknown")).toBe(true);
|
||||
it("returns false for unknown statuses (FAR-107)", () => {
|
||||
expect(shouldAbortForCancellation("unknown")).toBe(false);
|
||||
expect(shouldAbortForCancellation("queued")).toBe(false);
|
||||
expect(shouldAbortForCancellation("pending")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
+13
-3
@@ -90,13 +90,23 @@ export function isK8s404(err: unknown): boolean {
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the heartbeat-run status indicates the run is no longer
|
||||
* active and the K8s Job should be cancelled.
|
||||
* Returns true when the heartbeat-run status indicates the run was explicitly
|
||||
* cancelled and the K8s Job must be torn down.
|
||||
*
|
||||
* Only `cancelled` / `cancelling` qualify. Treating any non-`running` status
|
||||
* as cancellation (the previous behaviour) produced spurious
|
||||
* k8s_job_deleted_externally errors for in-flight runs whenever the API
|
||||
* briefly reported a transient or stale status — Nancy's runs at
|
||||
* Privileged Escalation hit this without anyone actually cancelling them
|
||||
* (FAR-107). Other terminal statuses (`succeeded`/`failed`/`completed`)
|
||||
* are unreachable in practice while the adapter is still executing
|
||||
* (the adapter's own return is what flips them) and even if observed,
|
||||
* they do not warrant our deleting a Job that may still be doing work.
|
||||
* Exported for unit tests.
|
||||
*/
|
||||
export function shouldAbortForCancellation(runStatus: string | undefined): boolean {
|
||||
if (!runStatus) return false;
|
||||
return runStatus !== "running";
|
||||
return runStatus === "cancelled" || runStatus === "cancelling";
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user