Compare commits

...

4 Commits

Author SHA1 Message Date
Chris Farhood 83e105393c 0.1.51 2026-04-26 21:24:15 +00:00
Chris Farhood 49288fa5c7 fix: scope cancel-polling to explicit cancellation states only (FAR-107)
shouldAbortForCancellation previously treated any non-`running` runStatus
as a cancellation signal — which made the keepalive's cancel-poll delete
the K8s Job whenever the heartbeat-runs API briefly returned a transient
or stale status (e.g. queued, pending, succeeded, failed, completed,
unknown) for an in-flight run.  The follow-up `waitForJobCompletion`
poll then observed the 404 and surfaced a spurious
`k8s_job_deleted_externally` error to the user, even though no human
or external system deleted the Job.

Privileged Escalation's "null-pointer-nancy" agent reproduced this on
runs that were never cancelled and were not adjacent to a paperclip
restart, ruling out the SIGTERM path that 0.1.50 already addressed.

Tighten the guard to fire only on `cancelled` / `cancelling`.  Other
terminal statuses are unreachable while the adapter is still executing
(the adapter's own return is what flips them) and even if observed
mid-run, they do not justify deleting a Job that may still be doing
real work — the natural completion path will tear it down.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-26 21:24:11 +00:00
Chris Farhood dae9e18659 0.1.50 2026-04-26 21:19:03 +00:00
Chris Farhood 6923597b31 fix: do not delete active Jobs on SIGTERM — leave for orphan reattach (FAR-107)
Root cause of Nancy's k8s_job_deleted_externally false positive: the
paperclip server itself receives SIGTERM during rolling deploys,
evictions, scale-down, etc.  The previous SIGTERM handler iterated
activeJobs and deleted every Job before exiting, which surfaced in the
in-flight heartbeat as "K8s Job was deleted externally" — even though
nothing external touched it.

With reattachOrphanedJobs=true (default), this is exactly the wrong
behaviour: leaving the Jobs alive lets the next paperclip process
discover them via the orphan-classification path and reattach their
log streams.  With reattachOrphanedJobs=false the operator opted into
manual cleanup, so we still must not auto-delete.

The Job's ownerReference (FAR-15) keeps the prompt Secret tied to the
Job, so both survive together and TTL handles cleanup on natural
completion.  Test rewritten to assert the new contract: SIGTERM must
not touch K8s Jobs.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-26 21:19:02 +00:00
4 changed files with 52 additions and 42 deletions
+2 -2
View File
@@ -1,12 +1,12 @@
{
"name": "paperclip-adapter-claude-k8s",
"version": "0.1.49",
"version": "0.1.51",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "paperclip-adapter-claude-k8s",
"version": "0.1.49",
"version": "0.1.51",
"license": "MIT",
"dependencies": {
"@kubernetes/client-node": "^1.0.0",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "paperclip-adapter-claude-k8s",
"version": "0.1.49",
"version": "0.1.51",
"description": "Paperclip adapter plugin that runs Claude Code agents as Kubernetes Jobs",
"license": "MIT",
"repository": {
+22 -12
View File
@@ -1561,16 +1561,24 @@ describe("shouldAbortForCancellation", () => {
expect(shouldAbortForCancellation("cancelled")).toBe(true);
});
it("returns true when status is 'failed'", () => {
expect(shouldAbortForCancellation("failed")).toBe(true);
it("returns true when status is 'cancelling'", () => {
expect(shouldAbortForCancellation("cancelling")).toBe(true);
});
it("returns true when status is 'completed'", () => {
expect(shouldAbortForCancellation("completed")).toBe(true);
// FAR-107: terminal-but-not-cancelled statuses MUST NOT trigger Job deletion.
// The previous "anything but running" guard caused k8s_job_deleted_externally
// false positives for in-flight runs whenever the API briefly reported a
// transient/stale status.
it("returns false for non-cancellation terminal statuses (FAR-107)", () => {
expect(shouldAbortForCancellation("succeeded")).toBe(false);
expect(shouldAbortForCancellation("failed")).toBe(false);
expect(shouldAbortForCancellation("completed")).toBe(false);
});
it("returns true for any non-running non-empty string", () => {
expect(shouldAbortForCancellation("unknown")).toBe(true);
it("returns false for unknown statuses (FAR-107)", () => {
expect(shouldAbortForCancellation("unknown")).toBe(false);
expect(shouldAbortForCancellation("queued")).toBe(false);
expect(shouldAbortForCancellation("pending")).toBe(false);
});
});
@@ -1771,7 +1779,7 @@ describe("execute: SIGTERM handler best-effort cleanup", () => {
vi.useRealTimers();
});
it("deletes the active Job when SIGTERM fires during execution", async () => {
it("does NOT delete active Jobs on SIGTERM — leaves them for orphan reattach (FAR-107)", async () => {
// Mock process.kill to prevent the test process from actually being killed.
const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true);
@@ -1782,17 +1790,19 @@ describe("execute: SIGTERM handler best-effort cleanup", () => {
// Flush microtasks through the async setup chain: getSelfPodInfo, listJobs,
// readSkillEntries, prepareBundle, createJob, onLog, activeJobs.add(), and
// ensureSigtermHandler() all complete before the try block enters streaming.
// 30 rounds is more than enough for the ~7 sequential await points.
for (let i = 0; i < 30; i++) await Promise.resolve();
// Emit SIGTERM — the process.once handler fires synchronously and kicks off
// async cleanup (deleteNamespacedJob). The mock resolves immediately.
// Reset deleteJob spy after setup so we can detect any SIGTERM-driven calls.
mockBatchDeleteJob.mockClear();
// Emit SIGTERM — the handler must re-raise to the default handler without
// touching the K8s Job. Deleting the Job here would surface as
// k8s_job_deleted_externally in the in-flight run (FAR-107).
process.emit("SIGTERM");
// Flush microtasks for deleteJob to resolve and the .then(process.kill) to run.
for (let i = 0; i < 10; i++) await Promise.resolve();
expect(mockBatchDeleteJob).toHaveBeenCalled();
expect(mockBatchDeleteJob).not.toHaveBeenCalled();
expect(killSpy).toHaveBeenCalledWith(process.pid, "SIGTERM");
killSpy.mockRestore();
+27 -27
View File
@@ -58,30 +58,20 @@ function ensureSigtermHandler(): void {
if (sigtermHandlerRegistered) return;
sigtermHandlerRegistered = true;
process.once("SIGTERM", () => {
const jobs = [...activeJobs];
void Promise.allSettled(
jobs.map(async (ref) => {
try {
const batchApi = getBatchApi(ref.kubeconfigPath);
await batchApi.deleteNamespacedJob({
name: ref.jobName,
namespace: ref.namespace,
body: { propagationPolicy: "Background" },
});
} catch { /* best-effort */ }
if (ref.promptSecretName && ref.promptSecretNamespace) {
try {
const coreApi = getCoreApi(ref.kubeconfigPath);
await coreApi.deleteNamespacedSecret({
name: ref.promptSecretName,
namespace: ref.promptSecretNamespace,
});
} catch { /* best-effort */ }
}
}),
).then(() => {
process.kill(process.pid, "SIGTERM");
});
// Do NOT delete active K8s Jobs on SIGTERM (FAR-107). Paperclip itself
// receives SIGTERM during rolling deploys, evictions, scale-down, etc.
// Deleting the Jobs we own there causes the in-flight heartbeat to surface
// a false-positive `k8s_job_deleted_externally` error and tears down work
// the user expected to keep running.
//
// The correct behaviour with `reattachOrphanedJobs=true` (default) is to
// leave the Jobs alive: the next paperclip process discovers them via the
// orphan-classification path and reattaches their log streams. When
// `reattachOrphanedJobs=false` the operator explicitly opted into manual
// cleanup and should not have us auto-deleting either. The owning Job's
// ownerReference (FAR-15) keeps the prompt Secret tied to the Job, so
// both survive together and TTL cleans them up after natural completion.
process.kill(process.pid, "SIGTERM");
});
}
@@ -100,13 +90,23 @@ export function isK8s404(err: unknown): boolean {
}
/**
* Returns true when the heartbeat-run status indicates the run is no longer
* active and the K8s Job should be cancelled.
* Returns true when the heartbeat-run status indicates the run was explicitly
* cancelled and the K8s Job must be torn down.
*
* Only `cancelled` / `cancelling` qualify. Treating any non-`running` status
* as cancellation (the previous behaviour) produced spurious
* k8s_job_deleted_externally errors for in-flight runs whenever the API
* briefly reported a transient or stale status — Nancy's runs at
* Privileged Escalation hit this without anyone actually cancelling them
* (FAR-107). Other terminal statuses (`succeeded`/`failed`/`completed`)
* are unreachable in practice while the adapter is still executing
* (the adapter's own return is what flips them) and even if observed,
* they do not warrant our deleting a Job that may still be doing work.
* Exported for unit tests.
*/
export function shouldAbortForCancellation(runStatus: string | undefined): boolean {
if (!runStatus) return false;
return runStatus !== "running";
return runStatus === "cancelled" || runStatus === "cancelling";
}
/**