From c0dba8e9045354d49503f3352545f83b4b273abd Mon Sep 17 00:00:00 2001 From: Chris Farhood Date: Thu, 23 Apr 2026 23:58:51 +0000 Subject: [PATCH] fix: never auto-delete live K8s orphans; block on mismatch (#8) Co-Authored-By: Claude Sonnet Co-Authored-By: Paperclip --- src/server/config-schema.ts | 2 +- src/server/execute.ts | 92 +++++++++++++++++++++++++------------ 2 files changed, 64 insertions(+), 30 deletions(-) diff --git a/src/server/config-schema.ts b/src/server/config-schema.ts index 5f8caa1..6a7617d 100644 --- a/src/server/config-schema.ts +++ b/src/server/config-schema.ts @@ -93,7 +93,7 @@ export function getConfigSchema(): AdapterConfigSchema { type: "toggle", key: "reattachOrphanedJobs", label: "Reattach to Orphaned Jobs", - hint: "If a prior K8s Job for the same agent/task/session is still running (e.g. Paperclip restarted mid-run), attach to it and stream its output instead of deleting it and starting a new pod. Default: on.", + hint: "If a prior K8s Job for the same agent/task/session is still running (e.g. Paperclip restarted mid-run), attach to it and stream its output instead of blocking the new run. When false, any non-terminal orphan blocks the new run. Default: on.", default: true, }, // Resource Limits diff --git a/src/server/execute.ts b/src/server/execute.ts index 728887c..5e52541 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -569,38 +569,72 @@ export async function execute(ctx: AdapterExecutionContext): Promise (j.metadata?.labels?.["paperclip.io/run-id"] ?? "") === runId, ); - // Pick the most recent reattachable orphan — same task + session, not - // terminal. Only one target is chosen; any other orphans get cleaned up. - if (reattachOrphanedJobs && orphaned.length > 0) { - const candidates = orphaned - .filter((j) => classifyOrphan(j, { taskId: currentTaskLabel, sessionId: currentSessionLabel }) === "reattach") - .sort((a, b) => { - const at = new Date(a.metadata?.creationTimestamp ?? 0).getTime(); - const bt = new Date(b.metadata?.creationTimestamp ?? 0).getTime(); - return bt - at; - }); - const chosen = candidates[0]; - const chosenName = chosen?.metadata?.name; - if (chosen && chosenName) { - reattachTarget = { - jobName: chosenName, - namespace: chosen.metadata?.namespace ?? guardNamespace, - priorRunId: chosen.metadata?.labels?.["paperclip.io/run-id"] ?? "", - image: chosen.spec?.template?.spec?.containers?.[0]?.image ?? "unknown", + if (orphaned.length > 0) { + if (!reattachOrphanedJobs) { + // When reattach is disabled, block on any non-terminal orphan. + const names = orphaned.map((j) => j.metadata?.name).join(", "); + await onLog("stderr", `[paperclip] Concurrent run blocked: orphaned Job(s) running and reattach disabled: ${names}\n`); + return { + exitCode: null, + signal: null, + timedOut: false, + errorMessage: `Concurrent run blocked: orphaned Job(s) still running for this agent (reattach disabled)`, + errorCode: "k8s_concurrent_run_blocked", }; } - } - const toDelete = orphaned.filter( - (j) => !reattachTarget || j.metadata?.name !== reattachTarget.jobName, - ); - if (toDelete.length > 0) { - const orphanNames = toDelete.map((j) => j.metadata?.name).join(", "); - await onLog("stdout", `[paperclip] Cleaning up ${toDelete.length} orphaned K8s Job(s) from previous run(s): ${orphanNames}\n`); - for (const j of toDelete) { - const name = j.metadata?.name; - if (name) { - await cleanupJob(guardNamespace, name, onLog, kubeconfigPath); + // Apply the decision matrix to each orphan, newest-first. The first + // reattachable orphan becomes the target; any block classification + // stops the new run immediately. Orphans are never deleted here — + // terminal ones are cleaned up by TTL; live mismatches should not be + // killed because they may still be doing real work. + const sortedOrphans = [...orphaned].sort((a, b) => { + const at = new Date(a.metadata?.creationTimestamp ?? 0).getTime(); + const bt = new Date(b.metadata?.creationTimestamp ?? 0).getTime(); + return bt - at; + }); + for (const orphan of sortedOrphans) { + const classification = classifyOrphan(orphan, { + taskId: currentTaskLabel, + sessionId: currentSessionLabel, + }); + const orphanName = orphan.metadata?.name ?? "unknown"; + if (classification === "reattach") { + if (!reattachTarget) { + reattachTarget = { + jobName: orphanName, + namespace: orphan.metadata?.namespace ?? guardNamespace, + priorRunId: orphan.metadata?.labels?.["paperclip.io/run-id"] ?? "", + image: orphan.spec?.template?.spec?.containers?.[0]?.image ?? "unknown", + }; + } + } else if (classification === "block_task_unknown") { + await onLog("stderr", `[paperclip] Blocked: orphaned Job ${orphanName} has missing task label — cannot safely reattach\n`); + return { + exitCode: null, + signal: null, + timedOut: false, + errorMessage: `Concurrent run blocked: orphaned Job ${orphanName} has unknown task context`, + errorCode: "k8s_orphan_task_unknown", + }; + } else if (classification === "block_task_mismatch") { + await onLog("stderr", `[paperclip] Blocked: orphaned Job ${orphanName} belongs to a different task\n`); + return { + exitCode: null, + signal: null, + timedOut: false, + errorMessage: `Concurrent run blocked: orphaned Job ${orphanName} is running a different task`, + errorCode: "k8s_concurrent_run_blocked", + }; + } else if (classification === "block_session_mismatch") { + await onLog("stderr", `[paperclip] Blocked: orphaned Job ${orphanName} has a different session\n`); + return { + exitCode: null, + signal: null, + timedOut: false, + errorMessage: `Concurrent run blocked: orphaned Job ${orphanName} has a mismatched session`, + errorCode: "k8s_orphan_session_mismatch", + }; } } }