fix: re-check job state when completion waiter throws to prevent UI staleness

When waitForJobCompletion threw a transient error (API disconnect, etc.),
the code fell through with jobTimedOut=true and returned a result even
though the job was still running. This caused the UI to think the run
was complete while the job kept running, which led to concurrency errors
on retry.

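Now, when waitForJobCompletion throws, we re-check the job's actual state
with a second waitForJobCompletion call. If the re-check reports a
timeout, the run is treated as timed out. Otherwise, if the job has not
succeeded, we return a k8s_job_state_mismatch error so the UI knows the
run is not done instead of parsing potentially incomplete stdout.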

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 07:26:10 -04:00
parent 710cf37f5e
commit efbbfbc299
6 changed files with 67 additions and 7 deletions
+1 -1
View File
@@ -1 +1 @@
{"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../../src/server/execute.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,uBAAuB,EAAE,sBAAsB,EAAE,MAAM,4BAA4B,CAAC;AA2SlG,wBAAsB,OAAO,CAAC,GAAG,EAAE,uBAAuB,GAAG,OAAO,CAAC,sBAAsB,CAAC,CAkQ3F"}
{"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../../src/server/execute.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,uBAAuB,EAAE,sBAAsB,EAAE,MAAM,4BAA4B,CAAC;AA2SlG,wBAAsB,OAAO,CAAC,GAAG,EAAE,uBAAuB,GAAG,OAAO,CAAC,sBAAsB,CAAC,CAgS3F"}
+31 -1
View File
@@ -364,11 +364,41 @@ export async function execute(ctx) {
await onLog("stdout", stdout);
}
}
// If the follow stream missed output (container exited quickly), do a
// one-shot log read as fallback before the pod is cleaned up.
if (!stdout.trim()) {
await onLog("stdout", `[paperclip] Log stream returned empty — reading pod logs directly...\n`);
stdout = await readPodLogs(namespace, podName, kubeconfigPath);
if (stdout.trim()) {
await onLog("stdout", stdout);
}
}
if (completionResult.status === "fulfilled") {
jobTimedOut = completionResult.value.timedOut;
}
else {
jobTimedOut = true;
// waitForJobCompletion threw — re-check job state to avoid returning
// while the job is still running (which would cause UI staleness and
// concurrency errors on retry).
jobTimedOut = false;
const actualState = await waitForJobCompletion(namespace, jobName, 0, kubeconfigPath);
if (actualState.timedOut) {
// Truly a timeout after re-check — treat as timed out.
jobTimedOut = true;
}
else if (!actualState.succeeded) {
// Job still not terminal — the completion error was likely transient.
// Return an error so the UI knows the run is not done, rather than
// returning with parsed (potentially incomplete) stdout.
await onLog("stderr", `[paperclip] Job ${jobName} still not terminal after log/completion mismatch — returning error to keep UI in sync.\n`);
return {
exitCode,
signal: null,
timedOut: false,
errorMessage: `Job ${jobName} did not complete cleanly (log stream ended before job reached terminal state)`,
errorCode: "k8s_job_state_mismatch",
};
}
}
exitCode = await getPodExitCode(namespace, jobName, kubeconfigPath);
}
+1 -1
View File
File diff suppressed because one or more lines are too long
+2 -2
View File
@@ -1,12 +1,12 @@
{
"name": "@farhoodliquor/paperclip-adapter-claude-k8s",
"version": "0.1.14",
"version": "0.1.15",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@farhoodliquor/paperclip-adapter-claude-k8s",
"version": "0.1.14",
"version": "0.1.15",
"license": "MIT",
"dependencies": {
"@kubernetes/client-node": "^1.0.0",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@farhoodliquor/paperclip-adapter-claude-k8s",
"version": "0.1.14",
"version": "0.1.15",
"description": "Paperclip adapter plugin that runs Claude Code agents as Kubernetes Jobs",
"license": "MIT",
"repository": {
+31 -1
View File
@@ -444,10 +444,40 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
}
}
// If the follow stream missed output (container exited quickly), do a
// one-shot log read as fallback before the pod is cleaned up.
if (!stdout.trim()) {
await onLog("stdout", `[paperclip] Log stream returned empty — reading pod logs directly...\n`);
stdout = await readPodLogs(namespace, podName, kubeconfigPath);
if (stdout.trim()) {
await onLog("stdout", stdout);
}
}
if (completionResult.status === "fulfilled") {
jobTimedOut = completionResult.value.timedOut;
} else {
jobTimedOut = true;
// waitForJobCompletion threw — re-check job state to avoid returning
// while the job is still running (which would cause UI staleness and
// concurrency errors on retry).
jobTimedOut = false;
const actualState = await waitForJobCompletion(namespace, jobName, 0, kubeconfigPath);
if (actualState.timedOut) {
// Truly a timeout after re-check — treat as timed out.
jobTimedOut = true;
} else if (!actualState.succeeded) {
// Job still not terminal — the completion error was likely transient.
// Return an error so the UI knows the run is not done, rather than
// returning with parsed (potentially incomplete) stdout.
await onLog("stderr", `[paperclip] Job ${jobName} still not terminal after log/completion mismatch — returning error to keep UI in sync.\n`);
return {
exitCode,
signal: null,
timedOut: false,
errorMessage: `Job ${jobName} did not complete cleanly (log stream ended before job reached terminal state)`,
errorCode: "k8s_job_state_mismatch",
};
}
}
exitCode = await getPodExitCode(namespace, jobName, kubeconfigPath);