Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d184a1732b | |||
| be84428226 | |||
| d9928030d6 | |||
| 76fc6fcdfc | |||
| 3169f49f23 | |||
| e0b35d230f |
Generated
+2
-2
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "paperclip-adapter-claude-k8s",
|
||||
"version": "0.1.46",
|
||||
"version": "0.1.49",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "paperclip-adapter-claude-k8s",
|
||||
"version": "0.1.46",
|
||||
"version": "0.1.49",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@kubernetes/client-node": "^1.0.0",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "paperclip-adapter-claude-k8s",
|
||||
"version": "0.1.46",
|
||||
"version": "0.1.49",
|
||||
"description": "Paperclip adapter plugin that runs Claude Code agents as Kubernetes Jobs",
|
||||
"license": "MIT",
|
||||
"repository": {
|
||||
|
||||
@@ -150,10 +150,10 @@ describe("buildPartialRunError", () => {
|
||||
expect(buildPartialRunError(null, "", "")).toBe("Claude exited with code -1");
|
||||
});
|
||||
|
||||
it("skips system/init events and returns generic message when only init captured", () => {
|
||||
it("returns init-only message when stdout is init-only with non-zero exit code (FAR-101)", () => {
|
||||
const msg = buildPartialRunError(1, "claude-sonnet-4-6", initLine);
|
||||
expect(msg).toBe(
|
||||
"Claude started but did not produce a result (model: claude-sonnet-4-6) — check API credentials, model support, and adapter config",
|
||||
"Claude exited immediately after init (model: claude-sonnet-4-6) (exit code 1) — the model may be unsupported or the session may have been rejected before producing output",
|
||||
);
|
||||
});
|
||||
|
||||
@@ -170,15 +170,15 @@ describe("buildPartialRunError", () => {
|
||||
expect(msg).toBe("Claude exited with code 1: Error: no API key configured");
|
||||
});
|
||||
|
||||
it("skips result events (structured protocol artefact — not surfaced verbatim)", () => {
|
||||
it("returns init-only message when stdout has init + result event but no plain content (structured artefact, not surfaced verbatim)", () => {
|
||||
// In production, buildPartialRunError is only called when parseClaudeStreamJson
|
||||
// returns null (no result event). If somehow a result event appears here, the
|
||||
// raw JSON blob must not be shown — the "did not produce a result" message is
|
||||
// cleaner and avoids leaking protocol internals to the UI.
|
||||
// raw JSON blob must not be shown — the init-only message is cleaner and avoids
|
||||
// leaking protocol internals to the UI.
|
||||
const resultLike = JSON.stringify({ type: "result", subtype: "error", result: "rate limit" });
|
||||
const stdout = [initLine, resultLike].join("\n");
|
||||
const msg = buildPartialRunError(2, "claude-sonnet-4-6", stdout);
|
||||
expect(msg).toContain("did not produce a result");
|
||||
expect(msg).toContain("Claude exited immediately after init");
|
||||
expect(msg).toContain("claude-sonnet-4-6");
|
||||
expect(msg).not.toMatch(/\{.*type.*result/);
|
||||
});
|
||||
@@ -245,6 +245,44 @@ describe("buildPartialRunError", () => {
|
||||
const msg = buildPartialRunError(1, "model-x", stdout);
|
||||
expect(msg).toBe("Claude exited with code 1: real error line");
|
||||
});
|
||||
|
||||
it("appends pod terminated reason/message when state is provided (FAR-100)", () => {
|
||||
const msg = buildPartialRunError(1, "claude-sonnet-4-6", initLine, {
|
||||
exitCode: 1,
|
||||
reason: "Error",
|
||||
message: "model not supported",
|
||||
signal: null,
|
||||
});
|
||||
expect(msg).toContain("Claude exited immediately after init");
|
||||
expect(msg).toContain("claude-sonnet-4-6");
|
||||
expect(msg).toContain("[pod: reason=Error, message=model not supported]");
|
||||
});
|
||||
|
||||
it("flags exit 137 as OOMKilled in pod cause", () => {
|
||||
const msg = buildPartialRunError(137, "claude-sonnet-4-6", initLine, {
|
||||
exitCode: 137,
|
||||
reason: "OOMKilled",
|
||||
message: null,
|
||||
signal: null,
|
||||
});
|
||||
expect(msg).toContain("[pod: reason=OOMKilled, SIGKILL (commonly OOMKilled)]");
|
||||
});
|
||||
|
||||
it("appends pod cause to content-line message", () => {
|
||||
const stdout = [initLine, "Error: bad request"].join("\n");
|
||||
const msg = buildPartialRunError(1, "claude-sonnet-4-6", stdout, {
|
||||
exitCode: 1,
|
||||
reason: "Error",
|
||||
message: null,
|
||||
signal: null,
|
||||
});
|
||||
expect(msg).toBe("Claude exited with code 1: Error: bad request [pod: reason=Error]");
|
||||
});
|
||||
|
||||
it("does not append anything when podState is null (back-compat)", () => {
|
||||
const msg = buildPartialRunError(1, "claude-sonnet-4-6", initLine, null);
|
||||
expect(msg).not.toContain("[pod:");
|
||||
});
|
||||
});
|
||||
|
||||
describe("classifyOrphan", () => {
|
||||
@@ -981,7 +1019,8 @@ describe("execute: happy path", () => {
|
||||
const result = await executePromise;
|
||||
|
||||
expect(result.errorCode).toBe("k8s_job_deleted_externally");
|
||||
expect(result.errorMessage).toBe("K8s Job was deleted externally before Claude could complete");
|
||||
expect(result.errorMessage).toMatch(/^K8s Job was deleted externally before Claude could complete \[/);
|
||||
expect(result.errorMessage).toContain("detected_via=");
|
||||
expect(result.exitCode).toBeNull();
|
||||
});
|
||||
|
||||
|
||||
+161
-31
@@ -110,24 +110,12 @@ export function shouldAbortForCancellation(runStatus: string | undefined): boole
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the error message when Claude's stdout contains no result event.
|
||||
* Skips system/init event lines so the UI doesn't display the raw init JSON.
|
||||
* Exported for unit tests.
|
||||
* Returns the first non-JSON/plain-text line in stdout, treating JSON objects
|
||||
* with a "type" field as protocol artefacts and skipping them.
|
||||
* Used by buildPartialRunError to detect init-only runs.
|
||||
*/
|
||||
export function buildPartialRunError(
|
||||
exitCode: number | null,
|
||||
model: string,
|
||||
stdout: string,
|
||||
): string {
|
||||
if (exitCode === 0) return "Failed to parse Claude JSON output";
|
||||
|
||||
// Walk stdout lines and skip every structured streaming event (any JSON
|
||||
// object that carries a non-empty "type" field: system, assistant, user,
|
||||
// rate_limit_event, result, …). All of these are protocol artefacts and
|
||||
// produce confusing raw-JSON blobs when surfaced verbatim as an error
|
||||
// message. Only plain-text lines (non-JSON, or JSON without a type field)
|
||||
// are treated as human-readable content worth including in the error.
|
||||
const firstContentLine = stdout.split(/\r?\n/)
|
||||
function firstContentLine(stdout: string): string {
|
||||
return stdout.split(/\r?\n/)
|
||||
.map((l) => l.trim())
|
||||
.find((l) => {
|
||||
if (!l) return false;
|
||||
@@ -142,19 +130,82 @@ export function buildPartialRunError(
|
||||
}
|
||||
return true;
|
||||
}) ?? "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when stdout contains only init/system/assistant events from the
|
||||
* given model with no human-readable content lines. Used to detect init-only
|
||||
* non-zero-exit runs that should be classified as claude_init_failed rather than
|
||||
* the generic "Claude exited with code N" message.
|
||||
*/
|
||||
function isInitOnlyRun(model: string, stdout: string): boolean {
|
||||
if (!stdout.trim() || !model) return false;
|
||||
const content = firstContentLine(stdout);
|
||||
if (content) return false;
|
||||
// Check that at least the init event for this model was seen
|
||||
const hasModelInit = stdout.includes(`"model":"${model}"`) || stdout.includes(`"model":"${model.replace(/-/g, "_")}"`);
|
||||
return hasModelInit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the pod's terminated-state detail (reason/message/signal) to a
|
||||
* partial-run error message when available. Exit code is already in the
|
||||
* caller-supplied message, so we only append fields that add new signal —
|
||||
* specifically reason (e.g. OOMKilled, Error, ContainerCannotRun), message
|
||||
* (kubelet diagnostic text), and signal. Saves the operator a kubectl trip.
|
||||
*/
|
||||
function appendPodCause(message: string, state: PodTerminatedState | null): string {
|
||||
if (!state) return message;
|
||||
const parts: string[] = [];
|
||||
if (state.reason) parts.push(`reason=${state.reason}`);
|
||||
if (state.message) parts.push(`message=${state.message}`);
|
||||
if (state.signal !== null) parts.push(`signal=${state.signal}`);
|
||||
if (state.exitCode === 137) parts.push("SIGKILL (commonly OOMKilled)");
|
||||
if (parts.length === 0) return message;
|
||||
return `${message} [pod: ${parts.join(", ")}]`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the error message when Claude's stdout contains no result event.
|
||||
* Skips system/init event lines so the UI doesn't display the raw init JSON.
|
||||
* When `podState` is provided, appends the K8s container terminated reason/
|
||||
* message so failures self-explain without requiring `kubectl`.
|
||||
* Exported for unit tests.
|
||||
*/
|
||||
export function buildPartialRunError(
|
||||
exitCode: number | null,
|
||||
model: string,
|
||||
stdout: string,
|
||||
podState: PodTerminatedState | null = null,
|
||||
): string {
|
||||
if (exitCode === 0) return "Failed to parse Claude JSON output";
|
||||
|
||||
// If the stream contained only structured events with no plain-text output,
|
||||
// surface the model name so the operator can diagnose missing credentials
|
||||
// or unsupported/misconfigured model.
|
||||
const initOnlyOutput = stdout.trim() !== "" && model !== "" && !firstContentLine;
|
||||
if (initOnlyOutput) {
|
||||
const modelHint = model ? ` (model: ${model})` : "";
|
||||
return `Claude started but did not produce a result${modelHint} — check API credentials, model support, and adapter config`;
|
||||
const contentLine = firstContentLine(stdout);
|
||||
if (contentLine) {
|
||||
return appendPodCause(`Claude exited with code ${exitCode ?? -1}: ${contentLine}`, podState);
|
||||
}
|
||||
|
||||
return firstContentLine
|
||||
? `Claude exited with code ${exitCode ?? -1}: ${firstContentLine}`
|
||||
: `Claude exited with code ${exitCode ?? -1}`;
|
||||
if (isInitOnlyRun(model, stdout) && (exitCode ?? 0) !== 0) {
|
||||
const modelHint = model ? ` (model: ${model})` : "";
|
||||
return appendPodCause(
|
||||
`Claude exited immediately after init${modelHint} (exit code ${exitCode ?? -1}) — the model may be unsupported or the session may have been rejected before producing output`,
|
||||
podState,
|
||||
);
|
||||
}
|
||||
|
||||
const initOnlyOutput = stdout.trim() !== "" && model !== "";
|
||||
if (initOnlyOutput) {
|
||||
const modelHint = model ? ` (model: ${model})` : "";
|
||||
return appendPodCause(
|
||||
`Claude started but did not produce a result${modelHint} — check API credentials, model support, and adapter config`,
|
||||
podState,
|
||||
);
|
||||
}
|
||||
|
||||
return appendPodCause(`Claude exited with code ${exitCode ?? -1}`, podState);
|
||||
}
|
||||
|
||||
export type OrphanClassification =
|
||||
@@ -531,11 +582,14 @@ async function readPodLogs(
|
||||
* is treated as a soft terminal: succeeded=false, timedOut=false, jobGone=true.
|
||||
* The caller should log this and fall through to stdout parsing.
|
||||
*/
|
||||
type JobConditionSnapshot = { type?: string; status?: string; reason?: string; message?: string };
|
||||
|
||||
async function waitForJobCompletion(
|
||||
namespace: string,
|
||||
jobName: string,
|
||||
timeoutMs: number,
|
||||
kubeconfigPath?: string,
|
||||
observer?: { lastConditions: JobConditionSnapshot[] | null; pollCount: number },
|
||||
): Promise<{ succeeded: boolean; timedOut: boolean; jobGone?: boolean }> {
|
||||
const batchApi = getBatchApi(kubeconfigPath);
|
||||
const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : 0;
|
||||
@@ -554,6 +608,12 @@ async function waitForJobCompletion(
|
||||
throw err;
|
||||
}
|
||||
const conditions = job.status?.conditions ?? [];
|
||||
if (observer) {
|
||||
observer.pollCount += 1;
|
||||
observer.lastConditions = conditions.map((c) => ({
|
||||
type: c.type, status: c.status, reason: c.reason, message: c.message,
|
||||
}));
|
||||
}
|
||||
|
||||
const complete = conditions.find((c) => c.type === "Complete" && c.status === "True");
|
||||
if (complete) return { succeeded: true, timedOut: false };
|
||||
@@ -1061,6 +1121,17 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
// Set when the job disappeared (404) or grace-timer fired before we saw a
|
||||
// terminal condition — used to emit a clearer error when stdout parsing fails.
|
||||
let jobDeletedExternally = false;
|
||||
// Forensics for k8s_job_deleted_externally — captures which of the three
|
||||
// detection paths observed the 404, the last successful Job-condition read
|
||||
// before deletion, and timing. Surfaced in the error message so the next
|
||||
// occurrence is self-diagnosing instead of opaque (FAR-107).
|
||||
let jobGoneDetectionPath: string | null = null;
|
||||
let jobGoneAt: number | null = null;
|
||||
const jobObserver: { lastConditions: JobConditionSnapshot[] | null; pollCount: number } = {
|
||||
lastConditions: null,
|
||||
pollCount: 0,
|
||||
};
|
||||
let podRunningAt: number | null = null;
|
||||
|
||||
const activeJobRef: ActiveJobRef = {
|
||||
namespace,
|
||||
@@ -1093,6 +1164,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
podName = await waitForPod(namespace, jobName, scheduleTimeoutMs, onLog, kubeconfigPath);
|
||||
await onLog("stdout", `[paperclip] Pod running: ${podName}\n`);
|
||||
}
|
||||
podRunningAt = Date.now();
|
||||
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
@@ -1228,7 +1300,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
// while streamPodLogs reconnects, holding execute() open for minutes.
|
||||
// logStopSignal.stopped is set on every settled path (fulfilled, rejected,
|
||||
// or grace) so streamPodLogs stops reconnecting promptly.
|
||||
type CompletionResult = { succeeded: boolean; timedOut: boolean; jobGone?: boolean };
|
||||
type CompletionResult = { succeeded: boolean; timedOut: boolean; jobGone?: boolean; gracePeriodFired?: boolean };
|
||||
let gracePoller: ReturnType<typeof setInterval> | null = null;
|
||||
const completionWithGrace = new Promise<CompletionResult>((resolve, reject) => {
|
||||
let settled = false;
|
||||
@@ -1246,11 +1318,37 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
logStopSignal.stopped = true;
|
||||
reject(err);
|
||||
};
|
||||
waitForJobCompletion(namespace, jobName, completionTimeoutMs, kubeconfigPath).then(settleOk).catch(settleErr);
|
||||
waitForJobCompletion(namespace, jobName, completionTimeoutMs, kubeconfigPath, jobObserver).then(settleOk).catch(settleErr);
|
||||
gracePoller = setInterval(() => {
|
||||
if (logExitTime !== null && Date.now() - logExitTime >= LOG_EXIT_COMPLETION_GRACE_MS) {
|
||||
void onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago without K8s Job condition update — proceeding with captured output (FAR-23)\n`).catch(() => {});
|
||||
settleOk({ succeeded: false, timedOut: false, jobGone: true });
|
||||
// Stop the grace poller immediately so we don't double-fire while the
|
||||
// verification read below is in flight.
|
||||
if (gracePoller) { clearInterval(gracePoller); gracePoller = null; }
|
||||
// The log stream exiting only means the container stopped producing
|
||||
// output — it does NOT prove the Job was deleted. Verify Job
|
||||
// presence with a one-shot read so we can distinguish:
|
||||
// (a) Job 404 → truly gone (TTL or external deletion)
|
||||
// (b) Job still present → K8s condition propagation lag (FAR-23)
|
||||
// Without this check we mis-classify (b) as "deleted externally" and
|
||||
// emit a false-positive k8s_job_deleted_externally error (FAR-107).
|
||||
void (async () => {
|
||||
try {
|
||||
await getBatchApi(kubeconfigPath).readNamespacedJob({ name: jobName, namespace });
|
||||
await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago without K8s Job condition update; Job ${jobName} still present — proceeding with captured output (FAR-23)\n`).catch(() => {});
|
||||
settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true });
|
||||
} catch (err: unknown) {
|
||||
if (isK8s404(err)) {
|
||||
jobGoneDetectionPath = "grace-period-verify-404";
|
||||
jobGoneAt = Date.now();
|
||||
await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago and Job ${jobName} is gone (TTL or external deletion) — proceeding with captured output (FAR-23)\n`).catch(() => {});
|
||||
settleOk({ succeeded: false, timedOut: false, jobGone: true });
|
||||
} else {
|
||||
// K8s API hiccup — bail out without claiming external deletion.
|
||||
await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago; Job state unverifiable (${err instanceof Error ? err.message : String(err)}) — proceeding with captured output (FAR-23)\n`).catch(() => {});
|
||||
settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true });
|
||||
}
|
||||
}
|
||||
})();
|
||||
}
|
||||
}, 1_000);
|
||||
});
|
||||
@@ -1318,6 +1416,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
// completion), so log streaming has captured the full output — continue
|
||||
// to stdout parsing rather than returning an error.
|
||||
jobDeletedExternally = true;
|
||||
if (!jobGoneDetectionPath) {
|
||||
jobGoneDetectionPath = "completion-poll-404";
|
||||
jobGoneAt = Date.now();
|
||||
}
|
||||
await onLog("stdout", `[paperclip] Job ${jobName} was deleted before terminal condition was observed (TTL or external deletion) — proceeding with captured output.\n`);
|
||||
}
|
||||
} else {
|
||||
@@ -1326,7 +1428,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
// (60s) so we don't hang the heartbeat indefinitely if the K8s API is degraded.
|
||||
jobTimedOut = false;
|
||||
const RECHECK_TIMEOUT_MS = 60_000;
|
||||
const actualState = await waitForJobCompletion(namespace, jobName, RECHECK_TIMEOUT_MS, kubeconfigPath);
|
||||
const actualState = await waitForJobCompletion(namespace, jobName, RECHECK_TIMEOUT_MS, kubeconfigPath, jobObserver);
|
||||
if (actualState.timedOut) {
|
||||
// Re-check itself timed out — the job may still be running.
|
||||
// Return an error so the UI knows the run is not done.
|
||||
@@ -1335,6 +1437,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
// Job was deleted before we could confirm terminal state — same as the
|
||||
// fulfilled+jobGone case above: proceed with captured output.
|
||||
jobDeletedExternally = true;
|
||||
if (!jobGoneDetectionPath) {
|
||||
jobGoneDetectionPath = "recheck-poll-404";
|
||||
jobGoneAt = Date.now();
|
||||
}
|
||||
await onLog("stdout", `[paperclip] Job ${jobName} was deleted before terminal condition was observed (TTL or external deletion) — proceeding with captured output.\n`);
|
||||
} else if (!actualState.succeeded) {
|
||||
// Job still not terminal — the completion error was likely transient.
|
||||
@@ -1404,11 +1510,35 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
|
||||
if (!parsed) {
|
||||
if (jobDeletedExternally && exitCode === null) {
|
||||
// Forensic context (FAR-107): users sometimes see this error when nothing
|
||||
// actually deleted the Job manually. Surface enough state in the message
|
||||
// to distinguish self-delete (SIGTERM/cancel), TTL-after-completion, and
|
||||
// genuine external deletion without needing cluster shell access.
|
||||
const detailParts: string[] = [];
|
||||
if (jobGoneDetectionPath) detailParts.push(`detected_via=${jobGoneDetectionPath}`);
|
||||
detailParts.push(`job=${jobName}`);
|
||||
detailParts.push(`ns=${namespace}`);
|
||||
if (podRunningAt !== null && jobGoneAt !== null) {
|
||||
detailParts.push(`elapsed_since_pod_running=${Math.round((jobGoneAt - podRunningAt) / 1000)}s`);
|
||||
}
|
||||
detailParts.push(`completion_polls=${jobObserver.pollCount}`);
|
||||
const lastConds = jobObserver.lastConditions;
|
||||
if (lastConds && lastConds.length > 0) {
|
||||
const summary = lastConds
|
||||
.map((c) => `${c.type}=${c.status}${c.reason ? `(${c.reason})` : ""}`)
|
||||
.join(",");
|
||||
detailParts.push(`last_job_conditions=[${summary}]`);
|
||||
} else {
|
||||
detailParts.push("last_job_conditions=none_observed");
|
||||
}
|
||||
detailParts.push(`stdout_bytes=${stdout.length}`);
|
||||
const stdoutLines = stdout.split("\n").filter((l) => l.trim()).length;
|
||||
detailParts.push(`stdout_nonempty_lines=${stdoutLines}`);
|
||||
return {
|
||||
exitCode,
|
||||
signal: null,
|
||||
timedOut: false,
|
||||
errorMessage: "K8s Job was deleted externally before Claude could complete",
|
||||
errorMessage: `K8s Job was deleted externally before Claude could complete [${detailParts.join(", ")}]`,
|
||||
errorCode: "k8s_job_deleted_externally",
|
||||
resultJson: { stdout },
|
||||
};
|
||||
@@ -1439,7 +1569,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
||||
exitCode,
|
||||
signal: null,
|
||||
timedOut: false,
|
||||
errorMessage: buildPartialRunError(exitCode, parsedStream.model, stdout),
|
||||
errorMessage: buildPartialRunError(exitCode, parsedStream.model, stdout, podTerminatedState),
|
||||
resultJson: { stdout },
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user