feat: implement cancel support via keepalive poll and SIGTERM handler (FAR-26)

- Poll GET /api/heartbeat-runs/:runId on every keepalive tick (15s); when
  status != 'running', delete the K8s Job, set logStopSignal, and return
  errorCode='cancelled' — Job gone within ~15s of external cancellation.
- SIGTERM handler best-effort deletes all active Jobs/Secrets and re-emits
  the signal to let the process exit naturally.
- Export shouldAbortForCancellation() helper; add tests for helper, cancel
  poll path, and SIGTERM cleanup.
- Guard: PAPERCLIP_API_URL missing logs a warning and skips cancel polling;
  HTTP 5xx from poll treated as transient; reattach path skips cancel poll.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
2026-04-24 15:20:45 +00:00
parent c55d6c61fc
commit f097440f3c
2 changed files with 380 additions and 9 deletions
+260 -1
View File
@@ -60,7 +60,7 @@ vi.mock("@paperclipai/adapter-utils/server-utils", async (importOriginal) => {
});
});
const { isK8s404, buildPartialRunError, classifyOrphan, describePodTerminatedError, streamPodLogsOnce, execute } = await import("./execute.js");
const { isK8s404, buildPartialRunError, classifyOrphan, describePodTerminatedError, streamPodLogsOnce, shouldAbortForCancellation, execute } = await import("./execute.js");
function makeJob(opts: {
runId?: string;
@@ -1220,3 +1220,262 @@ describe("execute: concurrency guard — multiple orphans", () => {
expect(result.errorMessage).toContain("different task");
});
});
// ─── shouldAbortForCancellation ──────────────────────────────────────────────
describe("shouldAbortForCancellation", () => {
it("returns false for undefined", () => {
expect(shouldAbortForCancellation(undefined)).toBe(false);
});
it("returns false for empty string", () => {
expect(shouldAbortForCancellation("")).toBe(false);
});
it("returns false when status is 'running'", () => {
expect(shouldAbortForCancellation("running")).toBe(false);
});
it("returns true when status is 'cancelled'", () => {
expect(shouldAbortForCancellation("cancelled")).toBe(true);
});
it("returns true when status is 'failed'", () => {
expect(shouldAbortForCancellation("failed")).toBe(true);
});
it("returns true when status is 'completed'", () => {
expect(shouldAbortForCancellation("completed")).toBe(true);
});
it("returns true for any non-running non-empty string", () => {
expect(shouldAbortForCancellation("unknown")).toBe(true);
});
});
// ─── execute: cancel-polling path ────────────────────────────────────────────
describe("execute: cancel-polling via keepalive tick", () => {
const mockFetch = vi.fn();
beforeEach(() => {
vi.resetAllMocks();
vi.useFakeTimers();
// Replace global fetch for this suite
vi.stubGlobal("fetch", mockFetch);
process.env.PAPERCLIP_API_URL = "http://paperclip-test.local";
mockReadSkillEntries.mockResolvedValue([]);
mockGetSelfPodInfo.mockResolvedValue(makeSelfPodResult());
mockBatchListJobs.mockResolvedValue({ items: [] });
mockPrepareBundle.mockResolvedValue(makeBundle());
mockBatchCreateJob.mockResolvedValue({ metadata: { uid: "job-uid-1" } });
mockBatchPatchJob.mockResolvedValue({});
mockBatchDeleteJob.mockResolvedValue({});
mockCoreDeleteSecret.mockResolvedValue({});
mockCoreListPods
.mockResolvedValueOnce({
items: [{
metadata: { name: "pod-abc" },
status: { phase: "Running", containerStatuses: [], initContainerStatuses: [] },
}],
})
.mockResolvedValue({
items: [{
metadata: { name: "pod-abc" },
status: { containerStatuses: [{ name: "claude", state: { terminated: { exitCode: 0 } } }] },
}],
});
// Job never reaches terminal on its own (cancel kicks in first)
mockBatchReadJob.mockResolvedValue({ status: { conditions: [] } });
// Log stream never ends (hung — simulates long-running Claude)
mockLogFn.mockImplementation(() => new Promise(() => { /* never resolves */ }));
});
afterEach(() => {
vi.useRealTimers();
vi.unstubAllGlobals();
delete process.env.PAPERCLIP_API_URL;
});
it("returns errorCode=cancelled when poll detects non-running status within one keepalive tick", async () => {
// Use a flag so readJob throws 404 only AFTER deleteJob is called (simulating
// K8s state where the job disappears after deletion).
let jobDeleted = false;
mockBatchDeleteJob.mockImplementation(async () => { jobDeleted = true; return {}; });
mockBatchReadJob.mockImplementation(async () => {
if (jobDeleted) {
throw Object.assign(new Error("Not Found"), { response: { statusCode: 404 } });
}
return { status: { conditions: [] } };
});
// Cancel poll returns "cancelled" status.
mockFetch.mockResolvedValue({
ok: true,
json: async () => ({ status: "cancelled" }),
});
const executePromise = execute(
makeCtx({ authToken: "tok-abc" } as Partial<AdapterExecutionContext>),
);
// Timer sequence:
// t=15100: keepalive fires → pre-check non-terminal → fetch → cancelled →
// deleteJob (jobDeleted=true) → stop signal set
// t=15300: stop poller fires (200ms) → destroys writable → starts 3s bail timer
// t=17100: completion watcher polls → 404 (jobDeleted=true) → jobGone → settles
// t=18300: bail timer fires → streamPodLogsOnce returns → streamPodLogs exits →
// trackedLogStream settles → Promise.allSettled resolves
await vi.advanceTimersByTimeAsync(15_100); // keepalive fires → cancel detected
await vi.advanceTimersByTimeAsync(2_100); // completion watcher polls → 404 → settles
await vi.advanceTimersByTimeAsync(3_100); // bail timer fires → log stream settles
const result = await executePromise;
expect(result.errorCode).toBe("cancelled");
expect(result.errorMessage).toBe("Run cancelled");
expect(result.timedOut).toBe(false);
expect(mockBatchDeleteJob).toHaveBeenCalled();
});
it("treats HTTP 500 on cancel poll as transient and does not cancel", async () => {
// Cancel poll returns 500 → transient, should not cancel.
// After a while the job completes normally.
mockFetch.mockResolvedValue({ ok: false, status: 500 });
// Override: job completes after keepalive tick fires
mockBatchReadJob
.mockResolvedValueOnce({ status: { conditions: [] } }) // first keepalive check: non-terminal
.mockResolvedValue({ status: { conditions: [{ type: "Complete", status: "True" }] } });
mockLogFn.mockImplementation(
async (_ns: string, _pod: string, _ctr: string, writable: import("node:stream").Writable) => {
writable.write(CLAUDE_HAPPY_OUTPUT);
},
);
const executePromise = execute(
makeCtx({ authToken: "tok-abc" } as Partial<AdapterExecutionContext>),
);
await vi.advanceTimersByTimeAsync(15_100); // keepalive fires: 500 → transient, no cancel
await vi.advanceTimersByTimeAsync(3_100); // log reconnect sleep → stopSignal already true
const result = await executePromise;
expect(result.errorCode).toBeUndefined();
expect(result.exitCode).toBe(0);
expect(result.sessionId).toBe("sess_test123");
});
it("skips cancel poll when authToken is absent", async () => {
// No authToken → cancel poll must not be attempted → job completes normally
mockBatchReadJob.mockResolvedValue({
status: { conditions: [{ type: "Complete", status: "True" }] },
});
mockLogFn.mockImplementation(
async (_ns: string, _pod: string, _ctr: string, writable: import("node:stream").Writable) => {
writable.write(CLAUDE_HAPPY_OUTPUT);
},
);
const executePromise = execute(makeCtx()); // no authToken
await vi.advanceTimersByTimeAsync(3_100);
const result = await executePromise;
expect(mockFetch).not.toHaveBeenCalled();
expect(result.exitCode).toBe(0);
});
it("skips cancel poll when PAPERCLIP_API_URL is not set", async () => {
delete process.env.PAPERCLIP_API_URL;
mockBatchReadJob.mockResolvedValue({
status: { conditions: [{ type: "Complete", status: "True" }] },
});
mockLogFn.mockImplementation(
async (_ns: string, _pod: string, _ctr: string, writable: import("node:stream").Writable) => {
writable.write(CLAUDE_HAPPY_OUTPUT);
},
);
const executePromise = execute(
makeCtx({ authToken: "tok-abc" } as Partial<AdapterExecutionContext>),
);
await vi.advanceTimersByTimeAsync(3_100);
const result = await executePromise;
expect(mockFetch).not.toHaveBeenCalled();
expect(result.exitCode).toBe(0);
});
});
// ─── execute: SIGTERM handler ─────────────────────────────────────────────────
describe("execute: SIGTERM handler best-effort cleanup", () => {
beforeEach(() => {
vi.resetAllMocks();
vi.useFakeTimers();
mockReadSkillEntries.mockResolvedValue([]);
mockGetSelfPodInfo.mockResolvedValue(makeSelfPodResult());
mockBatchListJobs.mockResolvedValue({ items: [] });
mockPrepareBundle.mockResolvedValue(makeBundle());
mockBatchCreateJob.mockResolvedValue({ metadata: { uid: "job-uid-1" } });
mockBatchPatchJob.mockResolvedValue({});
mockBatchDeleteJob.mockResolvedValue({});
mockCoreDeleteSecret.mockResolvedValue({});
mockCoreListPods
.mockResolvedValueOnce({
items: [{
metadata: { name: "pod-abc" },
status: { phase: "Running", containerStatuses: [], initContainerStatuses: [] },
}],
})
.mockResolvedValue({
items: [{
metadata: { name: "pod-abc" },
status: { containerStatuses: [{ name: "claude", state: { terminated: { exitCode: 0 } } }] },
}],
});
mockBatchReadJob.mockResolvedValue({ status: { conditions: [] } });
mockLogFn.mockImplementation(() => new Promise(() => { /* never resolves */ }));
});
afterEach(() => {
vi.useRealTimers();
});
it("deletes the active Job when SIGTERM fires during execution", async () => {
// Mock process.kill to prevent the test process from actually being killed.
const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true);
// Start execute() and suppress unhandled rejection (we won't await it).
const executePromise = execute(makeCtx());
executePromise.catch(() => {});
// Flush microtasks through the async setup chain: getSelfPodInfo, listJobs,
// readSkillEntries, prepareBundle, createJob, onLog, activeJobs.add(), and
// ensureSigtermHandler() all complete before the try block enters streaming.
// 30 rounds is more than enough for the ~7 sequential await points.
for (let i = 0; i < 30; i++) await Promise.resolve();
// Emit SIGTERM — the process.once handler fires synchronously and kicks off
// async cleanup (deleteNamespacedJob). The mock resolves immediately.
process.emit("SIGTERM");
// Flush microtasks for deleteJob to resolve and the .then(process.kill) to run.
for (let i = 0; i < 10; i++) await Promise.resolve();
expect(mockBatchDeleteJob).toHaveBeenCalled();
expect(killSpy).toHaveBeenCalledWith(process.pid, "SIGTERM");
killSpy.mockRestore();
// afterEach calls vi.useRealTimers() which clears all pending fake timers,
// so we do not need to settle executePromise.
});
});