Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fd7dce7239 | |||
| b1878c684e | |||
| 83e105393c | |||
| 49288fa5c7 | |||
| dae9e18659 | |||
| 6923597b31 |
Generated
+2
-2
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "paperclip-adapter-claude-k8s",
|
"name": "paperclip-adapter-claude-k8s",
|
||||||
"version": "0.1.49",
|
"version": "0.1.52",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "paperclip-adapter-claude-k8s",
|
"name": "paperclip-adapter-claude-k8s",
|
||||||
"version": "0.1.49",
|
"version": "0.1.52",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@kubernetes/client-node": "^1.0.0",
|
"@kubernetes/client-node": "^1.0.0",
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "paperclip-adapter-claude-k8s",
|
"name": "paperclip-adapter-claude-k8s",
|
||||||
"version": "0.1.49",
|
"version": "0.1.52",
|
||||||
"description": "Paperclip adapter plugin that runs Claude Code agents as Kubernetes Jobs",
|
"description": "Paperclip adapter plugin that runs Claude Code agents as Kubernetes Jobs",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"repository": {
|
"repository": {
|
||||||
|
|||||||
+22
-12
@@ -1561,16 +1561,24 @@ describe("shouldAbortForCancellation", () => {
|
|||||||
expect(shouldAbortForCancellation("cancelled")).toBe(true);
|
expect(shouldAbortForCancellation("cancelled")).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("returns true when status is 'failed'", () => {
|
it("returns true when status is 'cancelling'", () => {
|
||||||
expect(shouldAbortForCancellation("failed")).toBe(true);
|
expect(shouldAbortForCancellation("cancelling")).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("returns true when status is 'completed'", () => {
|
// FAR-107: terminal-but-not-cancelled statuses MUST NOT trigger Job deletion.
|
||||||
expect(shouldAbortForCancellation("completed")).toBe(true);
|
// The previous "anything but running" guard caused k8s_job_deleted_externally
|
||||||
|
// false positives for in-flight runs whenever the API briefly reported a
|
||||||
|
// transient/stale status.
|
||||||
|
it("returns false for non-cancellation terminal statuses (FAR-107)", () => {
|
||||||
|
expect(shouldAbortForCancellation("succeeded")).toBe(false);
|
||||||
|
expect(shouldAbortForCancellation("failed")).toBe(false);
|
||||||
|
expect(shouldAbortForCancellation("completed")).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("returns true for any non-running non-empty string", () => {
|
it("returns false for unknown statuses (FAR-107)", () => {
|
||||||
expect(shouldAbortForCancellation("unknown")).toBe(true);
|
expect(shouldAbortForCancellation("unknown")).toBe(false);
|
||||||
|
expect(shouldAbortForCancellation("queued")).toBe(false);
|
||||||
|
expect(shouldAbortForCancellation("pending")).toBe(false);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1771,7 +1779,7 @@ describe("execute: SIGTERM handler best-effort cleanup", () => {
|
|||||||
vi.useRealTimers();
|
vi.useRealTimers();
|
||||||
});
|
});
|
||||||
|
|
||||||
it("deletes the active Job when SIGTERM fires during execution", async () => {
|
it("does NOT delete active Jobs on SIGTERM — leaves them for orphan reattach (FAR-107)", async () => {
|
||||||
// Mock process.kill to prevent the test process from actually being killed.
|
// Mock process.kill to prevent the test process from actually being killed.
|
||||||
const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true);
|
const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true);
|
||||||
|
|
||||||
@@ -1782,17 +1790,19 @@ describe("execute: SIGTERM handler best-effort cleanup", () => {
|
|||||||
// Flush microtasks through the async setup chain: getSelfPodInfo, listJobs,
|
// Flush microtasks through the async setup chain: getSelfPodInfo, listJobs,
|
||||||
// readSkillEntries, prepareBundle, createJob, onLog, activeJobs.add(), and
|
// readSkillEntries, prepareBundle, createJob, onLog, activeJobs.add(), and
|
||||||
// ensureSigtermHandler() all complete before the try block enters streaming.
|
// ensureSigtermHandler() all complete before the try block enters streaming.
|
||||||
// 30 rounds is more than enough for the ~7 sequential await points.
|
|
||||||
for (let i = 0; i < 30; i++) await Promise.resolve();
|
for (let i = 0; i < 30; i++) await Promise.resolve();
|
||||||
|
|
||||||
// Emit SIGTERM — the process.once handler fires synchronously and kicks off
|
// Reset deleteJob spy after setup so we can detect any SIGTERM-driven calls.
|
||||||
// async cleanup (deleteNamespacedJob). The mock resolves immediately.
|
mockBatchDeleteJob.mockClear();
|
||||||
|
|
||||||
|
// Emit SIGTERM — the handler must re-raise to the default handler without
|
||||||
|
// touching the K8s Job. Deleting the Job here would surface as
|
||||||
|
// k8s_job_deleted_externally in the in-flight run (FAR-107).
|
||||||
process.emit("SIGTERM");
|
process.emit("SIGTERM");
|
||||||
|
|
||||||
// Flush microtasks for deleteJob to resolve and the .then(process.kill) to run.
|
|
||||||
for (let i = 0; i < 10; i++) await Promise.resolve();
|
for (let i = 0; i < 10; i++) await Promise.resolve();
|
||||||
|
|
||||||
expect(mockBatchDeleteJob).toHaveBeenCalled();
|
expect(mockBatchDeleteJob).not.toHaveBeenCalled();
|
||||||
expect(killSpy).toHaveBeenCalledWith(process.pid, "SIGTERM");
|
expect(killSpy).toHaveBeenCalledWith(process.pid, "SIGTERM");
|
||||||
|
|
||||||
killSpy.mockRestore();
|
killSpy.mockRestore();
|
||||||
|
|||||||
+113
-37
@@ -58,30 +58,20 @@ function ensureSigtermHandler(): void {
|
|||||||
if (sigtermHandlerRegistered) return;
|
if (sigtermHandlerRegistered) return;
|
||||||
sigtermHandlerRegistered = true;
|
sigtermHandlerRegistered = true;
|
||||||
process.once("SIGTERM", () => {
|
process.once("SIGTERM", () => {
|
||||||
const jobs = [...activeJobs];
|
// Do NOT delete active K8s Jobs on SIGTERM (FAR-107). Paperclip itself
|
||||||
void Promise.allSettled(
|
// receives SIGTERM during rolling deploys, evictions, scale-down, etc.
|
||||||
jobs.map(async (ref) => {
|
// Deleting the Jobs we own there causes the in-flight heartbeat to surface
|
||||||
try {
|
// a false-positive `k8s_job_deleted_externally` error and tears down work
|
||||||
const batchApi = getBatchApi(ref.kubeconfigPath);
|
// the user expected to keep running.
|
||||||
await batchApi.deleteNamespacedJob({
|
//
|
||||||
name: ref.jobName,
|
// The correct behaviour with `reattachOrphanedJobs=true` (default) is to
|
||||||
namespace: ref.namespace,
|
// leave the Jobs alive: the next paperclip process discovers them via the
|
||||||
body: { propagationPolicy: "Background" },
|
// orphan-classification path and reattaches their log streams. When
|
||||||
});
|
// `reattachOrphanedJobs=false` the operator explicitly opted into manual
|
||||||
} catch { /* best-effort */ }
|
// cleanup and should not have us auto-deleting either. The owning Job's
|
||||||
if (ref.promptSecretName && ref.promptSecretNamespace) {
|
// ownerReference (FAR-15) keeps the prompt Secret tied to the Job, so
|
||||||
try {
|
// both survive together and TTL cleans them up after natural completion.
|
||||||
const coreApi = getCoreApi(ref.kubeconfigPath);
|
process.kill(process.pid, "SIGTERM");
|
||||||
await coreApi.deleteNamespacedSecret({
|
|
||||||
name: ref.promptSecretName,
|
|
||||||
namespace: ref.promptSecretNamespace,
|
|
||||||
});
|
|
||||||
} catch { /* best-effort */ }
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
).then(() => {
|
|
||||||
process.kill(process.pid, "SIGTERM");
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,13 +90,23 @@ export function isK8s404(err: unknown): boolean {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true when the heartbeat-run status indicates the run is no longer
|
* Returns true when the heartbeat-run status indicates the run was explicitly
|
||||||
* active and the K8s Job should be cancelled.
|
* cancelled and the K8s Job must be torn down.
|
||||||
|
*
|
||||||
|
* Only `cancelled` / `cancelling` qualify. Treating any non-`running` status
|
||||||
|
* as cancellation (the previous behaviour) produced spurious
|
||||||
|
* k8s_job_deleted_externally errors for in-flight runs whenever the API
|
||||||
|
* briefly reported a transient or stale status — Nancy's runs at
|
||||||
|
* Privileged Escalation hit this without anyone actually cancelling them
|
||||||
|
* (FAR-107). Other terminal statuses (`succeeded`/`failed`/`completed`)
|
||||||
|
* are unreachable in practice while the adapter is still executing
|
||||||
|
* (the adapter's own return is what flips them) and even if observed,
|
||||||
|
* they do not warrant our deleting a Job that may still be doing work.
|
||||||
* Exported for unit tests.
|
* Exported for unit tests.
|
||||||
*/
|
*/
|
||||||
export function shouldAbortForCancellation(runStatus: string | undefined): boolean {
|
export function shouldAbortForCancellation(runStatus: string | undefined): boolean {
|
||||||
if (!runStatus) return false;
|
if (!runStatus) return false;
|
||||||
return runStatus !== "running";
|
return runStatus === "cancelled" || runStatus === "cancelling";
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -650,30 +650,82 @@ export interface PodTerminatedState {
|
|||||||
signal: number | null;
|
signal: number | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function getPodTerminatedState(
|
/**
|
||||||
|
* Result of a pod-state lookup. `state` is the terminated state when available;
|
||||||
|
* `phase` and `podMissing` give the caller enough context to render an honest
|
||||||
|
* truncation-cause message instead of guessing "likely deleted" (FAR-107).
|
||||||
|
*/
|
||||||
|
export interface PodLookupResult {
|
||||||
|
state: PodTerminatedState | null;
|
||||||
|
phase: string | null;
|
||||||
|
podMissing: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function lookupPodState(
|
||||||
namespace: string,
|
namespace: string,
|
||||||
jobName: string,
|
jobName: string,
|
||||||
kubeconfigPath?: string,
|
kubeconfigPath?: string,
|
||||||
): Promise<PodTerminatedState | null> {
|
): Promise<PodLookupResult> {
|
||||||
const coreApi = getCoreApi(kubeconfigPath);
|
const coreApi = getCoreApi(kubeconfigPath);
|
||||||
const podList = await coreApi.listNamespacedPod({
|
const podList = await coreApi.listNamespacedPod({
|
||||||
namespace,
|
namespace,
|
||||||
labelSelector: `job-name=${jobName}`,
|
labelSelector: `job-name=${jobName}`,
|
||||||
});
|
});
|
||||||
const pod = podList.items[0];
|
const pod = podList.items[0];
|
||||||
if (!pod) return null;
|
if (!pod) return { state: null, phase: null, podMissing: true };
|
||||||
|
|
||||||
|
const phase = pod.status?.phase ?? null;
|
||||||
const containerStatus = pod.status?.containerStatuses?.find((s) => s.name === "claude");
|
const containerStatus = pod.status?.containerStatuses?.find((s) => s.name === "claude");
|
||||||
const terminated = containerStatus?.state?.terminated;
|
const terminated = containerStatus?.state?.terminated;
|
||||||
if (!terminated) return null;
|
if (!terminated) return { state: null, phase, podMissing: false };
|
||||||
return {
|
return {
|
||||||
exitCode: terminated.exitCode ?? null,
|
state: {
|
||||||
reason: terminated.reason ?? null,
|
exitCode: terminated.exitCode ?? null,
|
||||||
message: (terminated.message ?? "").trim() || null,
|
reason: terminated.reason ?? null,
|
||||||
signal: terminated.signal ?? null,
|
message: (terminated.message ?? "").trim() || null,
|
||||||
|
signal: terminated.signal ?? null,
|
||||||
|
},
|
||||||
|
phase,
|
||||||
|
podMissing: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the claude container's terminated state, retrying briefly when the pod
|
||||||
|
* exists in a terminal phase but kubelet has not yet propagated the
|
||||||
|
* containerStatuses[].state.terminated field. Without this retry, fast
|
||||||
|
* truncated-stream exits surface as "pod state unavailable" (FAR-107) and
|
||||||
|
* mask the real exit code / OOMKilled / SIGTERM cause.
|
||||||
|
*/
|
||||||
|
async function getPodLookupWithRetry(
|
||||||
|
namespace: string,
|
||||||
|
jobName: string,
|
||||||
|
kubeconfigPath?: string,
|
||||||
|
attempts = 4,
|
||||||
|
delayMs = 500,
|
||||||
|
): Promise<PodLookupResult> {
|
||||||
|
let last: PodLookupResult = { state: null, phase: null, podMissing: true };
|
||||||
|
for (let i = 0; i < attempts; i++) {
|
||||||
|
last = await lookupPodState(namespace, jobName, kubeconfigPath);
|
||||||
|
if (last.state) return last;
|
||||||
|
if (last.podMissing) return last;
|
||||||
|
// Pod exists but no terminated state. If it is in a terminal phase the
|
||||||
|
// containerStatuses update is in flight — wait briefly and retry. If it
|
||||||
|
// is still Running/Pending, retrying is unlikely to help, so bail.
|
||||||
|
if (last.phase !== "Succeeded" && last.phase !== "Failed") return last;
|
||||||
|
if (i < attempts - 1) await new Promise((r) => setTimeout(r, delayMs));
|
||||||
|
}
|
||||||
|
return last;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function getPodTerminatedState(
|
||||||
|
namespace: string,
|
||||||
|
jobName: string,
|
||||||
|
kubeconfigPath?: string,
|
||||||
|
): Promise<PodTerminatedState | null> {
|
||||||
|
return (await lookupPodState(namespace, jobName, kubeconfigPath)).state;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Format a human-readable explanation for a truncated run, including the
|
* Format a human-readable explanation for a truncated run, including the
|
||||||
* pod's claude-container terminated state when available. Exit code 137
|
* pod's claude-container terminated state when available. Exit code 137
|
||||||
@@ -682,9 +734,17 @@ async function getPodTerminatedState(
|
|||||||
*/
|
*/
|
||||||
export function describeTruncationCause(
|
export function describeTruncationCause(
|
||||||
state: PodTerminatedState | null,
|
state: PodTerminatedState | null,
|
||||||
|
lookup?: PodLookupResult,
|
||||||
): string {
|
): string {
|
||||||
if (!state) {
|
if (!state) {
|
||||||
return "pod state unavailable — likely deleted before exit could be read";
|
if (lookup?.podMissing) {
|
||||||
|
return "pod is gone — Job pod was removed (eviction, preemption, or external delete) before exit could be read";
|
||||||
|
}
|
||||||
|
if (lookup && !lookup.podMissing) {
|
||||||
|
const phaseHint = lookup.phase ? `pod phase=${lookup.phase}` : "pod present";
|
||||||
|
return `container terminated state not yet observable (${phaseHint}) — kubelet status update did not land within retry window; exit cause unknown`;
|
||||||
|
}
|
||||||
|
return "pod state unavailable — exit cause unknown";
|
||||||
}
|
}
|
||||||
const parts: string[] = [];
|
const parts: string[] = [];
|
||||||
if (state.exitCode !== null) {
|
if (state.exitCode !== null) {
|
||||||
@@ -1554,7 +1614,23 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (parsedStream.truncatedMidStream) {
|
if (parsedStream.truncatedMidStream) {
|
||||||
const cause = describeTruncationCause(podTerminatedState);
|
// Re-query pod state with retry — the initial single-shot read can lose
|
||||||
|
// to kubelet propagation lag and surface a useless "pod state unavailable"
|
||||||
|
// message that hides the real exit cause (OOMKilled, SIGTERM, etc). The
|
||||||
|
// retry distinguishes pod-genuinely-gone from terminated-state-lag and
|
||||||
|
// gives the operator the actual exit code/reason where possible (FAR-107).
|
||||||
|
let lookup: PodLookupResult | undefined;
|
||||||
|
let refreshedState = podTerminatedState;
|
||||||
|
try {
|
||||||
|
lookup = await getPodLookupWithRetry(namespace, jobName, kubeconfigPath);
|
||||||
|
refreshedState = lookup.state;
|
||||||
|
if (refreshedState && refreshedState.exitCode !== null) {
|
||||||
|
exitCode = refreshedState.exitCode;
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
await onLog("stderr", `[paperclip] truncation diagnostic: pod re-query failed (${err instanceof Error ? err.message : String(err)})\n`).catch(() => {});
|
||||||
|
}
|
||||||
|
const cause = describeTruncationCause(refreshedState, lookup);
|
||||||
const modelHint = parsedStream.model ? ` (model: ${parsedStream.model})` : "";
|
const modelHint = parsedStream.model ? ` (model: ${parsedStream.model})` : "";
|
||||||
return {
|
return {
|
||||||
exitCode,
|
exitCode,
|
||||||
|
|||||||
Reference in New Issue
Block a user