07ef106c66
The 30s grace timer that bounds K8s Job condition propagation lag was armed by streamPodLogs's onFirstStreamExit callback the moment streamPodLogsOnce returned for the first time. A transient K8s log-API disconnect mid-run also returns from streamPodLogsOnce — so the grace timer fired 30s later regardless of whether streamPodLogs had already reconnected and the container was still producing output. Nancy / Privileged Escalation reproduced this on long Opus-4-6 runs: the prod paperclip pod was stable, the cancel-poll guard was already narrowed in 0.1.51, but every long run truncated with claude_truncated + "container terminated state not yet observable (pod phase=Running)" because the run was being abandoned mid-output. Replace the boolean onFirstStreamExit signal with a streamActivity ref carrying lastActiveAt + streamHasExited. streamPodLogs refreshes lastActiveAt every time a streamPodLogsOnce attempt returns non-empty output, so reconnects that resume real output keep the grace clock reset. The grace timer fires only once the stream has exited at least once AND no chunk has arrived for the full grace window — which preserves the original FAR-23 behaviour (container truly exited but Job condition lags) while ending the false-truncation of healthy streams. Adds a regression test that asserts a stream drop + reconnect + deferred Job completion does not surface as truncated. Co-Authored-By: Paperclip <noreply@paperclip.ing>
1723 lines
73 KiB
TypeScript
1723 lines
73 KiB
TypeScript
import type { AdapterExecutionContext, AdapterExecutionResult } from "@paperclipai/adapter-utils";
|
|
import {
|
|
asString,
|
|
asNumber,
|
|
asBoolean,
|
|
parseObject,
|
|
readPaperclipRuntimeSkillEntries,
|
|
resolvePaperclipDesiredSkillNames,
|
|
} from "@paperclipai/adapter-utils/server-utils";
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { prepareClaudePromptBundle } from "./prompt-cache.js";
|
|
import {
|
|
parseClaudeStreamJson,
|
|
describeClaudeFailure,
|
|
isClaudeMaxTurnsResult,
|
|
isClaudeUnknownSessionError,
|
|
} from "./parse.js";
|
|
import { getSelfPodInfo, getBatchApi, getCoreApi, getLogApi } from "./k8s-client.js";
|
|
import { buildJobManifest, sanitizeLabelValue } from "./job-manifest.js";
|
|
import { LogLineDedupFilter } from "./log-dedup.js";
|
|
import type * as k8s from "@kubernetes/client-node";
|
|
import { Writable } from "node:stream";
|
|
|
|
const POLL_INTERVAL_MS = 2000;
|
|
const KEEPALIVE_INTERVAL_MS = 15_000;
|
|
const LOG_STREAM_RECONNECT_DELAY_MS = 3_000;
|
|
const MAX_LOG_RECONNECT_ATTEMPTS = 50;
|
|
// Upper bound on how long streamPodLogsOnce will wait after stopSignal fires
|
|
// before force-returning, even if logApi.log has not yet resolved. Defensive
|
|
// against the K8s client library not propagating writable.destroy() into an
|
|
// abort of the underlying HTTP request.
|
|
const LOG_STREAM_BAIL_TIMEOUT_MS = 3_000;
|
|
// After the log stream exits (container stopped producing output), wait this
|
|
// long for the K8s Job condition to be confirmed before treating the job as
|
|
// done. K8s Job conditions can lag pod exit by several seconds or more under
|
|
// cluster load. Without this bound, waitForJobCompletion keeps polling while
|
|
// streamPodLogs keeps reconnecting — together they can hold execute() open for
|
|
// minutes, causing stale "running" status in the UI (FAR-23).
|
|
const LOG_EXIT_COMPLETION_GRACE_MS = 30_000;
|
|
|
|
// Module-level tracking of active Jobs for SIGTERM best-effort cleanup.
|
|
interface ActiveJobRef {
|
|
namespace: string;
|
|
jobName: string;
|
|
promptSecretName?: string;
|
|
promptSecretNamespace?: string;
|
|
kubeconfigPath?: string;
|
|
}
|
|
const activeJobs = new Set<ActiveJobRef>();
|
|
// Per-agent serialization lock: prevents the TOCTOU race (FAR-29) where two
|
|
// concurrent execute() calls for the same agent both pass the list-then-create
|
|
// guard and create K8s Jobs simultaneously on the shared PVC.
|
|
const agentCreationMutex = new Map<string, Promise<void>>();
|
|
let sigtermHandlerRegistered = false;
|
|
|
|
function ensureSigtermHandler(): void {
|
|
if (sigtermHandlerRegistered) return;
|
|
sigtermHandlerRegistered = true;
|
|
process.once("SIGTERM", () => {
|
|
// Do NOT delete active K8s Jobs on SIGTERM (FAR-107). Paperclip itself
|
|
// receives SIGTERM during rolling deploys, evictions, scale-down, etc.
|
|
// Deleting the Jobs we own there causes the in-flight heartbeat to surface
|
|
// a false-positive `k8s_job_deleted_externally` error and tears down work
|
|
// the user expected to keep running.
|
|
//
|
|
// The correct behaviour with `reattachOrphanedJobs=true` (default) is to
|
|
// leave the Jobs alive: the next paperclip process discovers them via the
|
|
// orphan-classification path and reattaches their log streams. When
|
|
// `reattachOrphanedJobs=false` the operator explicitly opted into manual
|
|
// cleanup and should not have us auto-deleting either. The owning Job's
|
|
// ownerReference (FAR-15) keeps the prompt Secret tied to the Job, so
|
|
// both survive together and TTL cleans them up after natural completion.
|
|
process.kill(process.pid, "SIGTERM");
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Detect a Kubernetes 404 (Not Found) error from @kubernetes/client-node.
|
|
* Works for both v0.x (response.statusCode) and v1.0+ (response.status, message).
|
|
* Exported for unit tests.
|
|
*/
|
|
export function isK8s404(err: unknown): boolean {
|
|
if (!(err instanceof Error)) return false;
|
|
const e = err as unknown as Record<string, unknown>;
|
|
const resp = e.response as Record<string, unknown> | undefined;
|
|
if (resp?.statusCode === 404 || resp?.status === 404) return true;
|
|
if (e.statusCode === 404) return true;
|
|
return /HTTP-Code:\s*404\b/.test(err.message);
|
|
}
|
|
|
|
/**
|
|
* Returns true when the heartbeat-run status indicates the run was explicitly
|
|
* cancelled and the K8s Job must be torn down.
|
|
*
|
|
* Only `cancelled` / `cancelling` qualify. Treating any non-`running` status
|
|
* as cancellation (the previous behaviour) produced spurious
|
|
* k8s_job_deleted_externally errors for in-flight runs whenever the API
|
|
* briefly reported a transient or stale status — Nancy's runs at
|
|
* Privileged Escalation hit this without anyone actually cancelling them
|
|
* (FAR-107). Other terminal statuses (`succeeded`/`failed`/`completed`)
|
|
* are unreachable in practice while the adapter is still executing
|
|
* (the adapter's own return is what flips them) and even if observed,
|
|
* they do not warrant our deleting a Job that may still be doing work.
|
|
* Exported for unit tests.
|
|
*/
|
|
export function shouldAbortForCancellation(runStatus: string | undefined): boolean {
|
|
if (!runStatus) return false;
|
|
return runStatus === "cancelled" || runStatus === "cancelling";
|
|
}
|
|
|
|
/**
|
|
* Returns the first non-JSON/plain-text line in stdout, treating JSON objects
|
|
* with a "type" field as protocol artefacts and skipping them.
|
|
* Used by buildPartialRunError to detect init-only runs.
|
|
*/
|
|
function firstContentLine(stdout: string): string {
|
|
return stdout.split(/\r?\n/)
|
|
.map((l) => l.trim())
|
|
.find((l) => {
|
|
if (!l) return false;
|
|
try {
|
|
const obj = JSON.parse(l);
|
|
if (typeof obj === "object" && obj !== null) {
|
|
const t = (obj as Record<string, unknown>).type;
|
|
if (typeof t === "string" && t) return false;
|
|
}
|
|
} catch {
|
|
// not JSON — treat as content
|
|
}
|
|
return true;
|
|
}) ?? "";
|
|
}
|
|
|
|
/**
|
|
* Returns true when stdout contains only init/system/assistant events from the
|
|
* given model with no human-readable content lines. Used to detect init-only
|
|
* non-zero-exit runs that should be classified as claude_init_failed rather than
|
|
* the generic "Claude exited with code N" message.
|
|
*/
|
|
function isInitOnlyRun(model: string, stdout: string): boolean {
|
|
if (!stdout.trim() || !model) return false;
|
|
const content = firstContentLine(stdout);
|
|
if (content) return false;
|
|
// Check that at least the init event for this model was seen
|
|
const hasModelInit = stdout.includes(`"model":"${model}"`) || stdout.includes(`"model":"${model.replace(/-/g, "_")}"`);
|
|
return hasModelInit;
|
|
}
|
|
|
|
/**
|
|
* Append the pod's terminated-state detail (reason/message/signal) to a
|
|
* partial-run error message when available. Exit code is already in the
|
|
* caller-supplied message, so we only append fields that add new signal —
|
|
* specifically reason (e.g. OOMKilled, Error, ContainerCannotRun), message
|
|
* (kubelet diagnostic text), and signal. Saves the operator a kubectl trip.
|
|
*/
|
|
function appendPodCause(message: string, state: PodTerminatedState | null): string {
|
|
if (!state) return message;
|
|
const parts: string[] = [];
|
|
if (state.reason) parts.push(`reason=${state.reason}`);
|
|
if (state.message) parts.push(`message=${state.message}`);
|
|
if (state.signal !== null) parts.push(`signal=${state.signal}`);
|
|
if (state.exitCode === 137) parts.push("SIGKILL (commonly OOMKilled)");
|
|
if (parts.length === 0) return message;
|
|
return `${message} [pod: ${parts.join(", ")}]`;
|
|
}
|
|
|
|
/**
|
|
* Build the error message when Claude's stdout contains no result event.
|
|
* Skips system/init event lines so the UI doesn't display the raw init JSON.
|
|
* When `podState` is provided, appends the K8s container terminated reason/
|
|
* message so failures self-explain without requiring `kubectl`.
|
|
* Exported for unit tests.
|
|
*/
|
|
export function buildPartialRunError(
|
|
exitCode: number | null,
|
|
model: string,
|
|
stdout: string,
|
|
podState: PodTerminatedState | null = null,
|
|
): string {
|
|
if (exitCode === 0) return "Failed to parse Claude JSON output";
|
|
|
|
// If the stream contained only structured events with no plain-text output,
|
|
// surface the model name so the operator can diagnose missing credentials
|
|
// or unsupported/misconfigured model.
|
|
const contentLine = firstContentLine(stdout);
|
|
if (contentLine) {
|
|
return appendPodCause(`Claude exited with code ${exitCode ?? -1}: ${contentLine}`, podState);
|
|
}
|
|
|
|
if (isInitOnlyRun(model, stdout) && (exitCode ?? 0) !== 0) {
|
|
const modelHint = model ? ` (model: ${model})` : "";
|
|
return appendPodCause(
|
|
`Claude exited immediately after init${modelHint} (exit code ${exitCode ?? -1}) — the model may be unsupported or the session may have been rejected before producing output`,
|
|
podState,
|
|
);
|
|
}
|
|
|
|
const initOnlyOutput = stdout.trim() !== "" && model !== "";
|
|
if (initOnlyOutput) {
|
|
const modelHint = model ? ` (model: ${model})` : "";
|
|
return appendPodCause(
|
|
`Claude started but did not produce a result${modelHint} — check API credentials, model support, and adapter config`,
|
|
podState,
|
|
);
|
|
}
|
|
|
|
return appendPodCause(`Claude exited with code ${exitCode ?? -1}`, podState);
|
|
}
|
|
|
|
export type OrphanClassification =
|
|
| "reattach"
|
|
| "block_session_mismatch"
|
|
| "block_task_mismatch"
|
|
| "block_task_unknown";
|
|
|
|
/**
|
|
* Classify a non-terminal orphaned K8s Job (one whose `paperclip.io/run-id`
|
|
* label does not match the current runId but does belong to this agent) as a
|
|
* reattach candidate or a block reason.
|
|
*
|
|
* Decision matrix:
|
|
* - taskId mismatch (both present, different values) → block_task_mismatch
|
|
* - taskId missing on either side → block_task_unknown
|
|
* - taskId match + both have sessionId + sessionIds differ → block_session_mismatch
|
|
* - taskId match + one or both sides missing sessionId → reattach (reconcile)
|
|
* - taskId match + both have sessionId + sessionIds match → reattach (happy path)
|
|
*
|
|
* Exported for unit tests.
|
|
*/
|
|
export function classifyOrphan(
|
|
job: k8s.V1Job,
|
|
expected: { taskId: string | null; sessionId: string | null },
|
|
): OrphanClassification {
|
|
const labels = job.metadata?.labels ?? {};
|
|
const jobTaskId = labels["paperclip.io/task-id"] ?? null;
|
|
const jobSessionId = labels["paperclip.io/session-id"] ?? null;
|
|
|
|
// taskId missing on either side
|
|
if (!expected.taskId || !jobTaskId) return "block_task_unknown";
|
|
|
|
// taskId mismatch
|
|
if (expected.taskId !== jobTaskId) return "block_task_mismatch";
|
|
|
|
// taskId matches — check sessionId
|
|
if (expected.sessionId && jobSessionId && expected.sessionId !== jobSessionId) {
|
|
return "block_session_mismatch";
|
|
}
|
|
|
|
return "reattach";
|
|
}
|
|
|
|
/**
|
|
* Build an error message for a pod that reached phase=Failed before or
|
|
* instead of streaming logs. Includes the claude container's terminated exit
|
|
* code and reason when available so operators can diagnose crashes without
|
|
* needing kubectl. Exported for unit tests.
|
|
*/
|
|
export function describePodTerminatedError(
|
|
podName: string,
|
|
phase: string,
|
|
containerStatuses: k8s.V1ContainerStatus[],
|
|
): string {
|
|
const mainCs = containerStatuses.find((cs) => cs.name === "claude");
|
|
const terminated = mainCs?.state?.terminated;
|
|
if (terminated) {
|
|
const code = terminated.exitCode ?? "unknown";
|
|
const reason = terminated.reason ?? terminated.message ?? "no reason";
|
|
return `Pod ${podName} reached phase=${phase}: claude exited ${code} (${reason})`;
|
|
}
|
|
return `Pod ${podName} reached phase=${phase}`;
|
|
}
|
|
|
|
/**
|
|
* Wait for the Job's pod to reach a terminal or running state.
|
|
* Returns the pod name once logs can be streamed, or throws on failure.
|
|
*/
|
|
async function waitForPod(
|
|
namespace: string,
|
|
jobName: string,
|
|
timeoutMs: number,
|
|
onLog: AdapterExecutionContext["onLog"],
|
|
kubeconfigPath?: string,
|
|
): Promise<string> {
|
|
const coreApi = getCoreApi(kubeconfigPath);
|
|
const deadline = Date.now() + timeoutMs;
|
|
const labelSelector = `job-name=${jobName}`;
|
|
|
|
await onLog("stdout", `[paperclip] Waiting for pod to be scheduled (job: ${jobName})...\n`);
|
|
|
|
let lastStatus = "";
|
|
while (Date.now() < deadline) {
|
|
const podList = await coreApi.listNamespacedPod({
|
|
namespace,
|
|
labelSelector,
|
|
});
|
|
const pod = podList.items[0];
|
|
|
|
if (!pod) {
|
|
if (lastStatus !== "no-pod") {
|
|
await onLog("stdout", `[paperclip] Waiting for Job controller to create pod...\n`);
|
|
lastStatus = "no-pod";
|
|
}
|
|
await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
|
|
continue;
|
|
}
|
|
|
|
const podName = pod.metadata?.name ?? "unknown";
|
|
const phase = pod.status?.phase ?? "Unknown";
|
|
const initStatuses = pod.status?.initContainerStatuses ?? [];
|
|
const containerStatuses = pod.status?.containerStatuses ?? [];
|
|
|
|
// Log phase transitions
|
|
const statusKey = `${phase}:${initStatuses.map((s) => s.state?.waiting?.reason ?? s.state?.terminated?.reason ?? "ok").join(",")}:${containerStatuses.map((s) => s.state?.waiting?.reason ?? (s.state?.running ? "running" : "waiting")).join(",")}`;
|
|
if (statusKey !== lastStatus) {
|
|
const details: string[] = [`phase=${phase}`];
|
|
for (const init of initStatuses) {
|
|
if (init.state?.waiting) details.push(`init/${init.name}: waiting (${init.state.waiting.reason ?? "unknown"})`);
|
|
else if (init.state?.running) details.push(`init/${init.name}: running`);
|
|
else if (init.state?.terminated) details.push(`init/${init.name}: done (exit ${init.state.terminated.exitCode})`);
|
|
}
|
|
for (const cs of containerStatuses) {
|
|
if (cs.state?.waiting) details.push(`${cs.name}: waiting (${cs.state.waiting.reason ?? "unknown"})`);
|
|
else if (cs.state?.running) details.push(`${cs.name}: running`);
|
|
else if (cs.state?.terminated) details.push(`${cs.name}: terminated (exit ${cs.state.terminated.exitCode ?? "?"}, ${cs.state.terminated.reason ?? "no reason"})`);
|
|
}
|
|
await onLog("stdout", `[paperclip] Pod ${podName}: ${details.join(", ")}\n`);
|
|
lastStatus = statusKey;
|
|
}
|
|
|
|
// Ready to stream logs
|
|
if (phase === "Running" || phase === "Succeeded") {
|
|
return podName;
|
|
}
|
|
// phase=Failed means the pod crashed before we could stream logs.
|
|
// Throwing here routes the caller into the error path with a structured
|
|
// message instead of entering the log-streaming path with a dead pod.
|
|
if (phase === "Failed") {
|
|
throw new Error(describePodTerminatedError(podName, phase, containerStatuses));
|
|
}
|
|
|
|
// Init containers done + main running (phase may still say Pending briefly)
|
|
const allInitsDone = initStatuses.length > 0 && initStatuses.every(
|
|
(s) => s.state?.terminated?.exitCode === 0,
|
|
);
|
|
const mainRunning = containerStatuses.some((s) => s.state?.running);
|
|
if (allInitsDone && mainRunning) {
|
|
return podName;
|
|
}
|
|
|
|
// Check for init container failures
|
|
for (const init of initStatuses) {
|
|
const terminated = init.state?.terminated;
|
|
if (terminated && (terminated.exitCode ?? 0) !== 0) {
|
|
throw new Error(`Init container "${init.name}" failed with exit code ${terminated.exitCode}: ${terminated.reason ?? terminated.message ?? "unknown"}`);
|
|
}
|
|
const waiting = init.state?.waiting;
|
|
if (waiting?.reason === "ErrImagePull" || waiting?.reason === "ImagePullBackOff") {
|
|
throw new Error(`Init container "${init.name}" image pull failed: ${waiting.message ?? waiting.reason}`);
|
|
}
|
|
if (waiting?.reason === "CrashLoopBackOff") {
|
|
throw new Error(`Init container "${init.name}" crash loop: ${waiting.message ?? waiting.reason}`);
|
|
}
|
|
}
|
|
|
|
// Check for unrecoverable scheduling failures
|
|
const conditions = pod.status?.conditions ?? [];
|
|
const unschedulable = conditions.find(
|
|
(c) => c.type === "PodScheduled" && c.status === "False" && c.reason === "Unschedulable",
|
|
);
|
|
if (unschedulable) {
|
|
throw new Error(`Pod unschedulable: ${unschedulable.message ?? "insufficient resources"}`);
|
|
}
|
|
|
|
// Check for main container image pull errors
|
|
for (const cs of containerStatuses) {
|
|
const waiting = cs.state?.waiting;
|
|
if (waiting?.reason === "ErrImagePull" || waiting?.reason === "ImagePullBackOff") {
|
|
throw new Error(`Image pull failed for "${cs.name}": ${waiting.message ?? waiting.reason}`);
|
|
}
|
|
if (waiting?.reason === "CrashLoopBackOff") {
|
|
throw new Error(`Container "${cs.name}" crash loop: ${waiting.message ?? waiting.reason}`);
|
|
}
|
|
}
|
|
|
|
await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
|
|
}
|
|
|
|
throw new Error(`Timed out waiting for pod to be scheduled (${Math.round(timeoutMs / 1000)}s)`);
|
|
}
|
|
|
|
/**
|
|
* Stream pod logs once via follow. Returns accumulated stdout when the
|
|
* stream ends (container exit, API disconnect, or abort signal).
|
|
*/
|
|
export async function streamPodLogsOnce(
|
|
namespace: string,
|
|
podName: string,
|
|
onLog: AdapterExecutionContext["onLog"],
|
|
kubeconfigPath?: string,
|
|
sinceSeconds?: number,
|
|
dedup?: LogLineDedupFilter,
|
|
stopSignal?: { stopped: boolean },
|
|
): Promise<string> {
|
|
const logApi = getLogApi(kubeconfigPath);
|
|
const chunks: string[] = [];
|
|
|
|
const writable = new Writable({
|
|
write(chunk: Buffer, _encoding, callback) {
|
|
const text = chunk.toString("utf-8");
|
|
chunks.push(text);
|
|
const emitted = dedup ? dedup.filter(text) : text;
|
|
if (!emitted) {
|
|
callback();
|
|
return;
|
|
}
|
|
// Forward raw stream-json lines unchanged. The Paperclip UI uses the
|
|
// adapter's ui-parser export (src/ui-parser.ts) to render structured
|
|
// transcript entries — pre-formatting here would strip that structure
|
|
// and produce flat plain text that looks nothing like claude_local.
|
|
void onLog("stdout", emitted).then(() => callback(), callback);
|
|
},
|
|
});
|
|
|
|
// When the job completion signal fires, destroy the writable to abort the
|
|
// in-flight follow stream. Without this, logApi.log can hang indefinitely
|
|
// when the pod terminates without closing the HTTP connection cleanly.
|
|
let stopPoller: ReturnType<typeof setInterval> | null = null;
|
|
let bailTimer: ReturnType<typeof setTimeout> | null = null;
|
|
let bailResolve: (() => void) | null = null;
|
|
// Bail promise resolves LOG_STREAM_BAIL_TIMEOUT_MS after stopSignal fires,
|
|
// even if logApi.log has not resolved by then. This is a safety net for the
|
|
// case where writable.destroy() fails to propagate to an abort of the HTTP
|
|
// request (e.g. the K8s client is awaiting a response that never comes).
|
|
const bailPromise = new Promise<void>((resolve) => {
|
|
bailResolve = resolve;
|
|
});
|
|
if (stopSignal) {
|
|
stopPoller = setInterval(() => {
|
|
if (stopSignal.stopped) {
|
|
if (!writable.destroyed) writable.destroy();
|
|
if (!bailTimer && bailResolve) {
|
|
bailTimer = setTimeout(() => {
|
|
onLog("stderr", "[paperclip] Log stream bail timer fired — forcing return\n").catch(() => {});
|
|
bailResolve!();
|
|
}, LOG_STREAM_BAIL_TIMEOUT_MS);
|
|
}
|
|
}
|
|
}, 200);
|
|
}
|
|
|
|
const logPromise = logApi.log(namespace, podName, "claude", writable, {
|
|
follow: true,
|
|
pretty: false,
|
|
...(sinceSeconds ? { sinceSeconds } : {}),
|
|
}).catch(() => {
|
|
// follow may fail if the container already exited, the API connection
|
|
// dropped, or we aborted via writable.destroy() — not fatal.
|
|
});
|
|
|
|
try {
|
|
if (stopSignal) {
|
|
await Promise.race([logPromise, bailPromise]);
|
|
} else {
|
|
await logPromise;
|
|
}
|
|
} finally {
|
|
if (stopPoller) clearInterval(stopPoller);
|
|
if (bailTimer) clearTimeout(bailTimer);
|
|
}
|
|
|
|
return chunks.join("");
|
|
}
|
|
|
|
/**
|
|
* Stream pod logs with automatic reconnection. Keeps retrying the log
|
|
* stream until the stop signal fires (job completed) or the container
|
|
* exits normally. This handles silent K8s API connection drops that
|
|
* would otherwise cause the UI to stop receiving real output.
|
|
*
|
|
* Capped at MAX_LOG_RECONNECT_ATTEMPTS to prevent infinite reconnect
|
|
* loops during sustained API partitions.
|
|
*
|
|
* `activity` tracks stream liveness so execute()'s grace timer can
|
|
* distinguish a transient K8s log-API reconnect from a real container
|
|
* exit (FAR-107). Two signals:
|
|
* - `streamHasExited` becomes true on the first return from
|
|
* streamPodLogsOnce. Until then we are still in the warm-up window
|
|
* and waitForJobCompletion is the authoritative signal — grace must
|
|
* not fire.
|
|
* - `lastActiveAt` advances every time a streamPodLogsOnce attempt
|
|
* returns non-empty output (the container is still producing).
|
|
* The grace timer fires only once GRACE_MS have passed since the
|
|
* last chunk, so output that resumes after a transient drop keeps
|
|
* the run alive.
|
|
*/
|
|
async function streamPodLogs(
|
|
namespace: string,
|
|
podName: string,
|
|
onLog: AdapterExecutionContext["onLog"],
|
|
kubeconfigPath?: string,
|
|
stopSignal?: { stopped: boolean },
|
|
dedup?: LogLineDedupFilter,
|
|
activity?: { lastActiveAt: number; streamHasExited: boolean },
|
|
): Promise<string> {
|
|
const allChunks: string[] = [];
|
|
let attempt = 0;
|
|
// Track the timestamp of the last successfully received log line so
|
|
// reconnects use a tight window instead of an ever-growing one anchored
|
|
// at stream start. This is the primary fix for FAR-105 duplicative logs.
|
|
let lastLogReceivedAt = Math.floor(Date.now() / 1000);
|
|
// Shared across reconnects so replayed lines inside the `sinceSeconds`
|
|
// overlap window are dropped before they reach the streaming UI (FAR-123).
|
|
if (!dedup) dedup = new LogLineDedupFilter();
|
|
|
|
while (!stopSignal?.stopped) {
|
|
if (attempt >= MAX_LOG_RECONNECT_ATTEMPTS) {
|
|
await onLog("stderr", `[paperclip] Log stream: max reconnect attempts (${MAX_LOG_RECONNECT_ATTEMPTS}) reached — giving up.\n`);
|
|
break;
|
|
}
|
|
|
|
// On reconnect, ask for logs since the last received line (+5s buffer)
|
|
// instead of since stream start. This keeps the window tight and
|
|
// avoids ever-growing duplicate output.
|
|
const sinceSeconds = attempt > 0
|
|
? Math.max(1, Math.floor(Date.now() / 1000) - lastLogReceivedAt + 5)
|
|
: undefined;
|
|
|
|
if (attempt > 0) {
|
|
await onLog("stdout", `[paperclip] Log stream disconnected — reconnecting (attempt ${attempt}/${MAX_LOG_RECONNECT_ATTEMPTS})...\n`);
|
|
}
|
|
|
|
const preStreamTs = Math.floor(Date.now() / 1000);
|
|
const result = await streamPodLogsOnce(namespace, podName, onLog, kubeconfigPath, sinceSeconds, dedup, stopSignal);
|
|
if (activity) activity.streamHasExited = true;
|
|
if (result) {
|
|
allChunks.push(result);
|
|
// Update last-received timestamp to now (the stream just ended,
|
|
// so any log lines in `result` were received up to this moment).
|
|
lastLogReceivedAt = Math.floor(Date.now() / 1000);
|
|
// Refresh stream liveness so the grace timer in execute() does not
|
|
// fire while output is still flowing through reconnects (FAR-107).
|
|
if (activity) activity.lastActiveAt = Date.now();
|
|
} else if (attempt === 0) {
|
|
// First attempt returned nothing — update timestamp so reconnect
|
|
// window stays reasonable.
|
|
lastLogReceivedAt = preStreamTs;
|
|
}
|
|
attempt++;
|
|
|
|
// If the job is done or the container exited, no need to reconnect.
|
|
if (stopSignal?.stopped) break;
|
|
|
|
// Brief pause before reconnecting to avoid tight loops.
|
|
await new Promise((resolve) => setTimeout(resolve, LOG_STREAM_RECONNECT_DELAY_MS));
|
|
}
|
|
|
|
// Flush any buffered partial line so the final assistant/result chunk
|
|
// isn't dropped when the stream ends mid-line.
|
|
const tail = dedup.flush();
|
|
if (tail) await onLog("stdout", tail);
|
|
|
|
return allChunks.join("");
|
|
}
|
|
|
|
/**
|
|
* One-shot read of pod logs (no follow). Used as fallback when the
|
|
* follow stream missed output because the container exited quickly.
|
|
*/
|
|
async function readPodLogs(
|
|
namespace: string,
|
|
podName: string,
|
|
kubeconfigPath?: string,
|
|
): Promise<string> {
|
|
const coreApi = getCoreApi(kubeconfigPath);
|
|
try {
|
|
const log = await coreApi.readNamespacedPodLog({
|
|
name: podName,
|
|
namespace,
|
|
container: "claude",
|
|
});
|
|
return typeof log === "string" ? log : "";
|
|
} catch {
|
|
return "";
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Wait for the Job to reach a terminal state (Complete or Failed).
|
|
* Returns the Job's final status. A 404 (job deleted by TTL or externally)
|
|
* is treated as a soft terminal: succeeded=false, timedOut=false, jobGone=true.
|
|
* The caller should log this and fall through to stdout parsing.
|
|
*/
|
|
type JobConditionSnapshot = { type?: string; status?: string; reason?: string; message?: string };
|
|
|
|
async function waitForJobCompletion(
|
|
namespace: string,
|
|
jobName: string,
|
|
timeoutMs: number,
|
|
kubeconfigPath?: string,
|
|
observer?: { lastConditions: JobConditionSnapshot[] | null; pollCount: number },
|
|
): Promise<{ succeeded: boolean; timedOut: boolean; jobGone?: boolean }> {
|
|
const batchApi = getBatchApi(kubeconfigPath);
|
|
const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : 0;
|
|
|
|
while (deadline === 0 || Date.now() < deadline) {
|
|
let job;
|
|
try {
|
|
job = await batchApi.readNamespacedJob({ name: jobName, namespace });
|
|
} catch (err: unknown) {
|
|
if (isK8s404(err)) {
|
|
// Job was deleted (TTL garbage collection or external deletion) before
|
|
// we detected its terminal condition. The container must have already
|
|
// exited for TTL to fire, so log streaming will have captured the output.
|
|
return { succeeded: false, timedOut: false, jobGone: true };
|
|
}
|
|
throw err;
|
|
}
|
|
const conditions = job.status?.conditions ?? [];
|
|
if (observer) {
|
|
observer.pollCount += 1;
|
|
observer.lastConditions = conditions.map((c) => ({
|
|
type: c.type, status: c.status, reason: c.reason, message: c.message,
|
|
}));
|
|
}
|
|
|
|
const complete = conditions.find((c) => c.type === "Complete" && c.status === "True");
|
|
if (complete) return { succeeded: true, timedOut: false };
|
|
|
|
const failed = conditions.find((c) => c.type === "Failed" && c.status === "True");
|
|
if (failed) {
|
|
const isDeadlineExceeded = failed.reason === "DeadlineExceeded";
|
|
return { succeeded: false, timedOut: isDeadlineExceeded };
|
|
}
|
|
|
|
await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
|
|
}
|
|
|
|
return { succeeded: false, timedOut: true };
|
|
}
|
|
|
|
/**
|
|
* Get the exit code from the Job's pod.
|
|
*/
|
|
async function getPodExitCode(namespace: string, jobName: string, kubeconfigPath?: string): Promise<number | null> {
|
|
const state = await getPodTerminatedState(namespace, jobName, kubeconfigPath);
|
|
return state?.exitCode ?? null;
|
|
}
|
|
|
|
/**
|
|
* Get the claude container's terminated state (exit code, reason, message,
|
|
* signal) from the Job's pod. Returns null if the pod or container is gone.
|
|
* Used by the no-result error path to explain *why* a run was truncated.
|
|
*/
|
|
export interface PodTerminatedState {
|
|
exitCode: number | null;
|
|
reason: string | null;
|
|
message: string | null;
|
|
signal: number | null;
|
|
}
|
|
|
|
/**
|
|
* Result of a pod-state lookup. `state` is the terminated state when available;
|
|
* `phase` and `podMissing` give the caller enough context to render an honest
|
|
* truncation-cause message instead of guessing "likely deleted" (FAR-107).
|
|
*/
|
|
export interface PodLookupResult {
|
|
state: PodTerminatedState | null;
|
|
phase: string | null;
|
|
podMissing: boolean;
|
|
}
|
|
|
|
async function lookupPodState(
|
|
namespace: string,
|
|
jobName: string,
|
|
kubeconfigPath?: string,
|
|
): Promise<PodLookupResult> {
|
|
const coreApi = getCoreApi(kubeconfigPath);
|
|
const podList = await coreApi.listNamespacedPod({
|
|
namespace,
|
|
labelSelector: `job-name=${jobName}`,
|
|
});
|
|
const pod = podList.items[0];
|
|
if (!pod) return { state: null, phase: null, podMissing: true };
|
|
|
|
const phase = pod.status?.phase ?? null;
|
|
const containerStatus = pod.status?.containerStatuses?.find((s) => s.name === "claude");
|
|
const terminated = containerStatus?.state?.terminated;
|
|
if (!terminated) return { state: null, phase, podMissing: false };
|
|
return {
|
|
state: {
|
|
exitCode: terminated.exitCode ?? null,
|
|
reason: terminated.reason ?? null,
|
|
message: (terminated.message ?? "").trim() || null,
|
|
signal: terminated.signal ?? null,
|
|
},
|
|
phase,
|
|
podMissing: false,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Read the claude container's terminated state, retrying briefly when the pod
|
|
* exists in a terminal phase but kubelet has not yet propagated the
|
|
* containerStatuses[].state.terminated field. Without this retry, fast
|
|
* truncated-stream exits surface as "pod state unavailable" (FAR-107) and
|
|
* mask the real exit code / OOMKilled / SIGTERM cause.
|
|
*/
|
|
async function getPodLookupWithRetry(
|
|
namespace: string,
|
|
jobName: string,
|
|
kubeconfigPath?: string,
|
|
attempts = 4,
|
|
delayMs = 500,
|
|
): Promise<PodLookupResult> {
|
|
let last: PodLookupResult = { state: null, phase: null, podMissing: true };
|
|
for (let i = 0; i < attempts; i++) {
|
|
last = await lookupPodState(namespace, jobName, kubeconfigPath);
|
|
if (last.state) return last;
|
|
if (last.podMissing) return last;
|
|
// Pod exists but no terminated state. If it is in a terminal phase the
|
|
// containerStatuses update is in flight — wait briefly and retry. If it
|
|
// is still Running/Pending, retrying is unlikely to help, so bail.
|
|
if (last.phase !== "Succeeded" && last.phase !== "Failed") return last;
|
|
if (i < attempts - 1) await new Promise((r) => setTimeout(r, delayMs));
|
|
}
|
|
return last;
|
|
}
|
|
|
|
async function getPodTerminatedState(
|
|
namespace: string,
|
|
jobName: string,
|
|
kubeconfigPath?: string,
|
|
): Promise<PodTerminatedState | null> {
|
|
return (await lookupPodState(namespace, jobName, kubeconfigPath)).state;
|
|
}
|
|
|
|
/**
|
|
* Format a human-readable explanation for a truncated run, including the
|
|
* pod's claude-container terminated state when available. Exit code 137
|
|
* is annotated as SIGKILL/OOM since that is the most common cause.
|
|
* Exported for unit tests.
|
|
*/
|
|
export function describeTruncationCause(
|
|
state: PodTerminatedState | null,
|
|
lookup?: PodLookupResult,
|
|
): string {
|
|
if (!state) {
|
|
if (lookup?.podMissing) {
|
|
return "pod is gone — Job pod was removed (eviction, preemption, or external delete) before exit could be read";
|
|
}
|
|
if (lookup && !lookup.podMissing) {
|
|
const phaseHint = lookup.phase ? `pod phase=${lookup.phase}` : "pod present";
|
|
return `container terminated state not yet observable (${phaseHint}) — kubelet status update did not land within retry window; exit cause unknown`;
|
|
}
|
|
return "pod state unavailable — exit cause unknown";
|
|
}
|
|
const parts: string[] = [];
|
|
if (state.exitCode !== null) {
|
|
parts.push(`exit code ${state.exitCode}`);
|
|
if (state.exitCode === 137) parts.push("SIGKILL (commonly OOMKilled)");
|
|
else if (state.exitCode === 143) parts.push("SIGTERM");
|
|
} else {
|
|
parts.push("no exit code");
|
|
}
|
|
if (state.signal !== null) parts.push(`signal ${state.signal}`);
|
|
if (state.reason) parts.push(`reason=${state.reason}`);
|
|
if (state.message) parts.push(`message=${state.message}`);
|
|
return parts.join(", ");
|
|
}
|
|
|
|
/**
|
|
* Delete Job and its pods. Best-effort — failures are logged but not thrown.
|
|
*/
|
|
async function cleanupJob(
|
|
namespace: string,
|
|
jobName: string,
|
|
onLog: AdapterExecutionContext["onLog"],
|
|
kubeconfigPath?: string,
|
|
): Promise<void> {
|
|
try {
|
|
const batchApi = getBatchApi(kubeconfigPath);
|
|
await batchApi.deleteNamespacedJob({
|
|
name: jobName,
|
|
namespace,
|
|
body: { propagationPolicy: "Background" },
|
|
});
|
|
} catch (err) {
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
await onLog("stderr", `[paperclip] Warning: failed to cleanup job ${jobName}: ${msg}\n`);
|
|
}
|
|
}
|
|
|
|
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
|
|
const { runId, runtime, config: rawConfig, onLog, onMeta } = ctx;
|
|
const config = parseObject(rawConfig);
|
|
const timeoutSec = asNumber(config.timeoutSec, 0);
|
|
const graceSec = asNumber(config.graceSec, 60);
|
|
const retainJobs = asBoolean(config.retainJobs, false);
|
|
const kubeconfigPath = asString(config.kubeconfig, "") || undefined;
|
|
const paperclipApiUrl = process.env.PAPERCLIP_API_URL ?? "";
|
|
if (!paperclipApiUrl) {
|
|
await onLog("stderr", "[paperclip] Warning: PAPERCLIP_API_URL not set — cancel polling disabled\n");
|
|
}
|
|
|
|
// Guard: claude_k8s must not run concurrently for the same agent (shared PVC/session).
|
|
// After a server restart, orphaned K8s Jobs from previous (now-failed) runs may
|
|
// still be running. We detect those by comparing the Job's run-id label against
|
|
// the current runId. When reattachOrphanedJobs is enabled and the orphan matches
|
|
// the current agent+task+session, we attach to it instead of deleting it (FAR-124).
|
|
const agentId = ctx.agent.id;
|
|
const sanitizedAgentId = sanitizeLabelValue(agentId);
|
|
if (!sanitizedAgentId) {
|
|
await onLog("stderr", `[paperclip] Cannot create K8s Job: agent.id "${agentId}" produces no valid RFC 1123 label characters\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Agent ID "${agentId}" cannot be sanitized to a valid Kubernetes label`,
|
|
errorCode: "k8s_agent_id_invalid",
|
|
};
|
|
}
|
|
// FAR-29: serialize guard+create per agent within this process to prevent the
|
|
// TOCTOU race where two concurrent execute() calls both pass the list-then-create
|
|
// guard and create K8s Jobs simultaneously on the shared PVC.
|
|
const _prevCreation = agentCreationMutex.get(agentId) ?? Promise.resolve();
|
|
let _releaseMutex: () => void = () => {};
|
|
const _mutexSlot = new Promise<void>((resolve) => { _releaseMutex = resolve; });
|
|
// Chain: next caller for this agent waits on _mutexSlot, which resolves in finally.
|
|
agentCreationMutex.set(agentId, _prevCreation.then(() => _mutexSlot, () => _mutexSlot));
|
|
// Wait for any prior execute() call to finish its guard+create phase.
|
|
await _prevCreation.catch(() => {});
|
|
|
|
// Hoist declarations used in both the guard+create phase and the log-streaming
|
|
// section so the mutex try/finally can be added without a large re-indent.
|
|
let reattachTarget: { jobName: string; namespace: string; priorRunId: string; image: string } | null = null;
|
|
// eslint-disable-next-line prefer-const
|
|
let jobName!: string;
|
|
// eslint-disable-next-line prefer-const
|
|
let namespace!: string;
|
|
let promptSecret: { name: string; namespace: string; data: Record<string, string> } | null = null;
|
|
// runtimeSessionParams and currentSessionIdRaw are also used after the
|
|
// try block (in the result-parsing section) so hoist them here.
|
|
const runtimeSessionParams = parseObject(runtime.sessionParams);
|
|
const currentSessionIdRaw = asString(runtimeSessionParams.sessionId, runtime.sessionId ?? "");
|
|
const coreApi = getCoreApi(kubeconfigPath);
|
|
const batchApi = getBatchApi(kubeconfigPath);
|
|
|
|
try {
|
|
const selfPod = await getSelfPodInfo(kubeconfigPath);
|
|
const guardNamespace = asString(config.namespace, "") || selfPod.namespace;
|
|
const reattachOrphanedJobs = asBoolean(config.reattachOrphanedJobs, true);
|
|
const currentSessionLabel = currentSessionIdRaw ? sanitizeLabelValue(currentSessionIdRaw) : null;
|
|
const currentTaskIdRaw = asString(ctx.context.taskId, "") || asString(ctx.context.issueId, "");
|
|
const currentTaskLabel = currentTaskIdRaw ? sanitizeLabelValue(currentTaskIdRaw) : null;
|
|
try {
|
|
const existing = await batchApi.listNamespacedJob({
|
|
namespace: guardNamespace,
|
|
labelSelector: `paperclip.io/agent-id=${sanitizedAgentId},paperclip.io/adapter-type=claude_k8s`,
|
|
});
|
|
const running = existing.items.filter(
|
|
(j) =>
|
|
!j.metadata?.deletionTimestamp &&
|
|
!j.status?.conditions?.some((c) => (c.type === "Complete" || c.type === "Failed") && c.status === "True"),
|
|
);
|
|
if (running.length > 0) {
|
|
// Separate orphaned jobs (from a previous server-side run) from truly
|
|
// concurrent jobs (same runId — shouldn't happen but guard defensively).
|
|
const orphaned = running.filter(
|
|
(j) => (j.metadata?.labels?.["paperclip.io/run-id"] ?? "") !== runId,
|
|
);
|
|
const samRun = running.filter(
|
|
(j) => (j.metadata?.labels?.["paperclip.io/run-id"] ?? "") === runId,
|
|
);
|
|
|
|
if (orphaned.length > 0) {
|
|
if (!reattachOrphanedJobs) {
|
|
// When reattach is disabled, block on any non-terminal orphan.
|
|
const names = orphaned.map((j) => j.metadata?.name).join(", ");
|
|
await onLog("stderr", `[paperclip] Concurrent run blocked: orphaned Job(s) running and reattach disabled: ${names}\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Concurrent run blocked: orphaned Job(s) still running for this agent (reattach disabled)`,
|
|
errorCode: "k8s_concurrent_run_blocked",
|
|
};
|
|
}
|
|
|
|
// Apply the decision matrix to each orphan, newest-first. The first
|
|
// reattachable orphan becomes the target; any block classification
|
|
// stops the new run immediately. Orphans are never deleted here —
|
|
// terminal ones are cleaned up by TTL; live mismatches should not be
|
|
// killed because they may still be doing real work.
|
|
const sortedOrphans = [...orphaned].sort((a, b) => {
|
|
const at = new Date(a.metadata?.creationTimestamp ?? 0).getTime();
|
|
const bt = new Date(b.metadata?.creationTimestamp ?? 0).getTime();
|
|
return bt - at;
|
|
});
|
|
for (const orphan of sortedOrphans) {
|
|
const classification = classifyOrphan(orphan, {
|
|
taskId: currentTaskLabel,
|
|
sessionId: currentSessionLabel,
|
|
});
|
|
const orphanName = orphan.metadata?.name ?? "unknown";
|
|
if (classification === "reattach") {
|
|
if (!reattachTarget) {
|
|
reattachTarget = {
|
|
jobName: orphanName,
|
|
namespace: orphan.metadata?.namespace ?? guardNamespace,
|
|
priorRunId: orphan.metadata?.labels?.["paperclip.io/run-id"] ?? "",
|
|
image: orphan.spec?.template?.spec?.containers?.[0]?.image ?? "unknown",
|
|
};
|
|
}
|
|
} else if (classification === "block_task_unknown") {
|
|
await onLog("stderr", `[paperclip] Blocked: orphaned Job ${orphanName} has missing task label — cannot safely reattach\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Concurrent run blocked: orphaned Job ${orphanName} has unknown task context`,
|
|
errorCode: "k8s_orphan_task_unknown",
|
|
};
|
|
} else if (classification === "block_task_mismatch") {
|
|
await onLog("stderr", `[paperclip] Blocked: orphaned Job ${orphanName} belongs to a different task\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Concurrent run blocked: orphaned Job ${orphanName} is running a different task`,
|
|
errorCode: "k8s_concurrent_run_blocked",
|
|
};
|
|
} else if (classification === "block_session_mismatch") {
|
|
await onLog("stderr", `[paperclip] Blocked: orphaned Job ${orphanName} has a different session\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Concurrent run blocked: orphaned Job ${orphanName} has a mismatched session`,
|
|
errorCode: "k8s_orphan_session_mismatch",
|
|
};
|
|
}
|
|
}
|
|
}
|
|
|
|
// If there are still running Jobs that belong to THIS run (shouldn't happen
|
|
// since we haven't created the Job yet), block execution.
|
|
if (samRun.length > 0) {
|
|
const names = samRun.map((j) => j.metadata?.name).join(", ");
|
|
await onLog("stderr", `[paperclip] Concurrent run blocked: existing Job(s) still running for this run: ${names}\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Concurrent run blocked: Job ${names} is still running for this agent`,
|
|
errorCode: "k8s_concurrent_run_blocked",
|
|
};
|
|
}
|
|
}
|
|
} catch (err: unknown) {
|
|
// If we can't list jobs, fail closed — the K8s concurrency guard is the
|
|
// only thing preventing zombie Jobs on a shared PVC from corrupting
|
|
// sessions. 404 (namespace not found) is treated as a hard failure;
|
|
// other errors (5xx, network) are also surfaced.
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
await onLog("stderr", `[paperclip] Concurrency guard failed: unable to list jobs: ${msg}\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Concurrency guard unreachable: ${msg}`,
|
|
errorCode: "k8s_concurrency_guard_unreachable",
|
|
};
|
|
}
|
|
|
|
// Prepare the prompt bundle (skills + instructions) on the server filesystem.
|
|
// The K8s Job pod mounts the same PVC at /paperclip, so bundle paths written
|
|
// here are accessible inside the pod at the identical absolute path.
|
|
const skillEntries = await readPaperclipRuntimeSkillEntries(config, import.meta.dirname ?? __dirname);
|
|
const desiredSkillNames = new Set(resolvePaperclipDesiredSkillNames(config, skillEntries));
|
|
const desiredSkills = skillEntries.filter((e) => desiredSkillNames.has(e.key));
|
|
const skillSummary = desiredSkills.length > 0 ? desiredSkills.map((s) => s.runtimeName ?? s.key).join(", ") : "none";
|
|
await onLog("stdout", `[paperclip] Skills bundled (${desiredSkills.length}): ${skillSummary}\n`);
|
|
const instructionsFilePath = asString(config.instructionsFilePath, "").trim();
|
|
const instructionsFileDir = instructionsFilePath ? `${path.dirname(instructionsFilePath)}/` : "";
|
|
let instructionsContents: string | null = null;
|
|
if (instructionsFilePath) {
|
|
try {
|
|
const raw = await fs.readFile(instructionsFilePath, "utf-8");
|
|
const pathDirective =
|
|
`\nThe above agent instructions were loaded from ${instructionsFilePath}. ` +
|
|
`Resolve any relative file references from ${instructionsFileDir}. ` +
|
|
`This base directory is authoritative for sibling instruction files such as ` +
|
|
`./HEARTBEAT.md, ./SOUL.md, and ./TOOLS.md; do not resolve those from the parent agent directory.`;
|
|
instructionsContents = raw + pathDirective;
|
|
} catch (err) {
|
|
await onLog(
|
|
"stderr",
|
|
`[paperclip] Warning: could not read agent instructions file "${instructionsFilePath}": ${err instanceof Error ? err.message : String(err)}\n`,
|
|
);
|
|
}
|
|
}
|
|
const promptBundle = await prepareClaudePromptBundle({
|
|
companyId: ctx.agent.companyId,
|
|
skills: desiredSkills,
|
|
instructionsContents,
|
|
onLog,
|
|
});
|
|
|
|
if (reattachTarget) {
|
|
jobName = reattachTarget.jobName;
|
|
namespace = reattachTarget.namespace;
|
|
|
|
// Announce reattach metadata. Prompt and args aren't known here — they
|
|
// belong to the prior run that created this pod and are already present
|
|
// on the running container.
|
|
if (onMeta) {
|
|
await onMeta({
|
|
adapterType: "claude_k8s",
|
|
command: `kubectl job/${jobName}`,
|
|
cwd: namespace,
|
|
commandArgs: [],
|
|
commandNotes: [
|
|
`Image: ${reattachTarget.image}`,
|
|
`Namespace: ${namespace}`,
|
|
`Reattached from prior run: ${reattachTarget.priorRunId || "unknown"}`,
|
|
`Timeout: ${timeoutSec}s`,
|
|
],
|
|
prompt: "",
|
|
context: ctx.context,
|
|
} as Parameters<typeof onMeta>[0]);
|
|
}
|
|
|
|
await onLog("stdout", `[paperclip] Reattaching to in-flight K8s Job ${jobName} in namespace ${namespace} (prior run ${reattachTarget.priorRunId || "unknown"})\n`);
|
|
|
|
// Relabel the reattached Job with the current run-id (and session-id if
|
|
// available) so the next concurrency guard sees it as owned by this run
|
|
// rather than an orphan from the prior run.
|
|
const labelPatch: Array<{ op: "add" | "replace"; path: string; value: string }> = [
|
|
{ op: "replace", path: "/metadata/labels/paperclip.io~1run-id", value: runId },
|
|
];
|
|
if (currentSessionLabel) {
|
|
labelPatch.push({ op: "replace", path: "/metadata/labels/paperclip.io~1session-id", value: currentSessionLabel });
|
|
}
|
|
try {
|
|
await batchApi.patchNamespacedJob({
|
|
name: jobName,
|
|
namespace,
|
|
body: labelPatch,
|
|
});
|
|
} catch (err) {
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
await onLog("stderr", `[paperclip] Warning: failed to relabel reattached Job ${jobName}: ${msg}\n`);
|
|
}
|
|
} else {
|
|
// Build Job manifest
|
|
const built = buildJobManifest({ ctx, selfPod, promptBundle });
|
|
const job = built.job;
|
|
jobName = built.jobName;
|
|
namespace = built.namespace;
|
|
const prompt = built.prompt;
|
|
const claudeArgs = built.claudeArgs;
|
|
const promptMetrics = built.promptMetrics;
|
|
promptSecret = built.promptSecret;
|
|
if (built.skippedLabels.length > 0) {
|
|
await onLog("stderr", `[paperclip] Warning: skipped ${built.skippedLabels.length} extra label(s) with reserved prefix: ${built.skippedLabels.join(", ")}\n`);
|
|
}
|
|
|
|
// Report invocation metadata
|
|
if (onMeta) {
|
|
await onMeta({
|
|
adapterType: "claude_k8s",
|
|
command: `kubectl job/${jobName}`,
|
|
cwd: namespace,
|
|
commandArgs: claudeArgs,
|
|
commandNotes: [
|
|
`Image: ${job.spec?.template.spec?.containers[0]?.image ?? "unknown"}`,
|
|
`Namespace: ${namespace}`,
|
|
`Timeout: ${timeoutSec}s`,
|
|
`Skills (${desiredSkills.length}): ${skillSummary}`,
|
|
],
|
|
prompt,
|
|
...(promptMetrics ? { promptMetrics } : {}),
|
|
context: ctx.context,
|
|
} as Parameters<typeof onMeta>[0]);
|
|
}
|
|
|
|
// If the prompt is large, create a Secret to hold it (avoids the ~1 MiB
|
|
// PodSpec limit). The Secret is cleaned up in the finally block.
|
|
if (promptSecret) {
|
|
try {
|
|
await coreApi.createNamespacedSecret({
|
|
namespace: promptSecret.namespace,
|
|
body: {
|
|
apiVersion: "v1",
|
|
kind: "Secret",
|
|
metadata: {
|
|
name: promptSecret.name,
|
|
namespace: promptSecret.namespace,
|
|
labels: {
|
|
"app.kubernetes.io/managed-by": "paperclip",
|
|
"paperclip.io/adapter-type": "claude_k8s",
|
|
"paperclip.io/run-id": runId,
|
|
},
|
|
},
|
|
stringData: promptSecret.data,
|
|
},
|
|
});
|
|
await onLog("stdout", `[paperclip] Created prompt Secret: ${promptSecret.name} (${Math.round(Buffer.byteLength(prompt, "utf-8") / 1024)} KiB)\n`);
|
|
} catch (err) {
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
await onLog("stderr", `[paperclip] Failed to create prompt Secret: ${msg}\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Failed to create prompt Secret: ${msg}`,
|
|
errorCode: "k8s_prompt_secret_create_failed",
|
|
};
|
|
}
|
|
}
|
|
|
|
// Create the Job
|
|
let createdJobUid: string | undefined;
|
|
try {
|
|
const created = await batchApi.createNamespacedJob({ namespace, body: job });
|
|
createdJobUid = created.metadata?.uid;
|
|
} catch (err) {
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
await onLog("stderr", `[paperclip] Failed to create K8s Job: ${msg}\n`);
|
|
if (promptSecret) {
|
|
try {
|
|
await coreApi.deleteNamespacedSecret({ name: promptSecret.name, namespace: promptSecret.namespace });
|
|
} catch { /* best-effort */ }
|
|
}
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Failed to create Kubernetes Job: ${msg}`,
|
|
errorCode: "k8s_job_create_failed",
|
|
};
|
|
}
|
|
|
|
// Attach ownerReference so K8s GC cleans up the Secret if the process
|
|
// crashes before the finally block runs.
|
|
if (promptSecret && createdJobUid) {
|
|
try {
|
|
await coreApi.patchNamespacedSecret({
|
|
name: promptSecret.name,
|
|
namespace: promptSecret.namespace,
|
|
body: [
|
|
{
|
|
op: "add",
|
|
path: "/metadata/ownerReferences",
|
|
value: [
|
|
{
|
|
apiVersion: "batch/v1",
|
|
kind: "Job",
|
|
name: jobName,
|
|
uid: createdJobUid,
|
|
blockOwnerDeletion: false,
|
|
},
|
|
],
|
|
},
|
|
],
|
|
});
|
|
} catch (err) {
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
await onLog("stderr", `[paperclip] Warning: failed to set ownerReference on prompt Secret: ${msg}\n`);
|
|
}
|
|
}
|
|
|
|
await onLog("stdout", `[paperclip] Created K8s Job: ${jobName} in namespace ${namespace} (deadline: ${timeoutSec > 0 ? `${timeoutSec}s` : "none"})\n`);
|
|
}
|
|
} finally {
|
|
// Release the per-agent creation mutex so the next queued execute() call
|
|
// can proceed with its guard+create phase (FAR-29).
|
|
_releaseMutex();
|
|
}
|
|
|
|
let stdout = "";
|
|
let exitCode: number | null = null;
|
|
let podTerminatedState: PodTerminatedState | null = null;
|
|
let jobTimedOut = false;
|
|
let keepaliveTimer: ReturnType<typeof setInterval> | null = null;
|
|
// Set when we return a mismatch error so the finally block knows not to
|
|
// delete a job that is still alive and the UI is waiting on.
|
|
let skipCleanup = false;
|
|
// Set when the job disappeared (404) or grace-timer fired before we saw a
|
|
// terminal condition — used to emit a clearer error when stdout parsing fails.
|
|
let jobDeletedExternally = false;
|
|
// Forensics for k8s_job_deleted_externally — captures which of the three
|
|
// detection paths observed the 404, the last successful Job-condition read
|
|
// before deletion, and timing. Surfaced in the error message so the next
|
|
// occurrence is self-diagnosing instead of opaque (FAR-107).
|
|
let jobGoneDetectionPath: string | null = null;
|
|
let jobGoneAt: number | null = null;
|
|
const jobObserver: { lastConditions: JobConditionSnapshot[] | null; pollCount: number } = {
|
|
lastConditions: null,
|
|
pollCount: 0,
|
|
};
|
|
let podRunningAt: number | null = null;
|
|
|
|
const activeJobRef: ActiveJobRef = {
|
|
namespace,
|
|
jobName,
|
|
...(promptSecret ? { promptSecretName: promptSecret.name, promptSecretNamespace: promptSecret.namespace } : {}),
|
|
kubeconfigPath,
|
|
};
|
|
activeJobs.add(activeJobRef);
|
|
ensureSigtermHandler();
|
|
|
|
try {
|
|
// Wait for pod to be ready for log streaming
|
|
const scheduleTimeoutMs = 120_000; // 2 minutes for scheduling
|
|
let podName: string;
|
|
try {
|
|
if (reattachTarget) {
|
|
// Pod is already running from the prior run — look it up directly.
|
|
const podList = await coreApi.listNamespacedPod({
|
|
namespace,
|
|
labelSelector: `job-name=${jobName}`,
|
|
});
|
|
const pod = podList.items[0];
|
|
const name = pod?.metadata?.name;
|
|
if (!name) {
|
|
throw new Error(`Reattach target Job ${jobName} has no pod`);
|
|
}
|
|
podName = name;
|
|
await onLog("stdout", `[paperclip] Reattached to pod ${podName}\n`);
|
|
} else {
|
|
podName = await waitForPod(namespace, jobName, scheduleTimeoutMs, onLog, kubeconfigPath);
|
|
await onLog("stdout", `[paperclip] Pod running: ${podName}\n`);
|
|
}
|
|
podRunningAt = Date.now();
|
|
|
|
} catch (err) {
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
const phase = reattachTarget ? "reattach" : "scheduling";
|
|
await onLog("stderr", `[paperclip] Pod ${phase} failed: ${msg}\n`);
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Pod ${phase} failed: ${msg}`,
|
|
errorCode: reattachTarget ? "k8s_pod_reattach_failed" : "k8s_pod_schedule_failed",
|
|
};
|
|
}
|
|
|
|
// Stream logs and wait for completion concurrently.
|
|
// The log stream will end when the container exits.
|
|
// We also poll the Job status to detect deadline exceeded.
|
|
// 0 = no timeout (run indefinitely, matching claude_local behavior)
|
|
const completionTimeoutMs = timeoutSec > 0 ? (timeoutSec + graceSec) * 1000 : 0;
|
|
|
|
// Keepalive: periodically send a status line via onLog so the
|
|
// Paperclip server knows the adapter is still alive even when the
|
|
// pod produces no output (e.g. Claude is in a long thinking phase).
|
|
let lastLogAt = Date.now();
|
|
let keepaliveJobTerminal = false;
|
|
let consecutiveTerminalReadings = 0;
|
|
// Shared signal: when job completion resolves, tell the log streamer to
|
|
// stop reconnecting. Declared before keepaliveTimer so the cancel path
|
|
// inside the timer can set it without temporal dead zone issues.
|
|
const logStopSignal = { stopped: false };
|
|
// Shared dedup filter: created here so the one-shot fallback can
|
|
// reuse it and avoid pushing already-sent lines to the UI (finding #6, FAR-15).
|
|
const logDedup = new LogLineDedupFilter();
|
|
// Set when the run is externally cancelled (cancel-poll path).
|
|
let cancelled = false;
|
|
|
|
keepaliveTimer = setInterval(() => {
|
|
// Fire-and-forget the async work; setInterval callbacks must be
|
|
// synchronous or the timer will drift.
|
|
void (async () => {
|
|
if (keepaliveJobTerminal || cancelled) return;
|
|
|
|
// Verify the Job is still alive before announcing or refreshing.
|
|
// Require two consecutive terminal readings before latching to
|
|
// guard against a stale K8s API cache returning a false terminal
|
|
// status on a single read (finding #5, FAR-15).
|
|
try {
|
|
const job = await batchApi.readNamespacedJob({ name: jobName, namespace });
|
|
const terminal = job.status?.conditions?.some(
|
|
(c) => (c.type === "Complete" || c.type === "Failed") && c.status === "True",
|
|
);
|
|
if (terminal) {
|
|
consecutiveTerminalReadings++;
|
|
if (consecutiveTerminalReadings >= 2) {
|
|
keepaliveJobTerminal = true;
|
|
}
|
|
return;
|
|
}
|
|
consecutiveTerminalReadings = 0;
|
|
} catch (err: unknown) {
|
|
// Only treat 404 (Job deleted) as terminal. Transient 5xx or
|
|
// connection resets should NOT permanently disable the keepalive —
|
|
// the next tick will re-check and the reaper uses the staleness
|
|
// window as a safety net.
|
|
if (isK8s404(err)) {
|
|
keepaliveJobTerminal = true;
|
|
return;
|
|
}
|
|
// Log transient errors but leave keepaliveJobTerminal false so
|
|
// the next tick retries.
|
|
const msg = err instanceof Error ? err.message : String(err);
|
|
void onLog("stderr", `[paperclip] keepalive: transient error checking job status: ${msg}\n`).catch(() => {});
|
|
return;
|
|
}
|
|
|
|
// Cancel-polling: check if the Paperclip run was cancelled externally.
|
|
// Skipped on the reattach path to avoid tearing down an adopted Job.
|
|
// HTTP non-2xx is treated as transient — never interpret a 5xx as cancel.
|
|
if (!reattachTarget && paperclipApiUrl && ctx.authToken) {
|
|
try {
|
|
const resp = await fetch(`${paperclipApiUrl}/api/heartbeat-runs/${runId}`, {
|
|
headers: { Authorization: `Bearer ${ctx.authToken}` },
|
|
});
|
|
if (resp.ok) {
|
|
const data = await resp.json() as { status?: string };
|
|
if (shouldAbortForCancellation(data.status)) {
|
|
void onLog("stdout", `[paperclip] Run cancelled externally — deleting Job ${jobName}\n`).catch(() => {});
|
|
cancelled = true;
|
|
logStopSignal.stopped = true;
|
|
try {
|
|
await batchApi.deleteNamespacedJob({
|
|
name: jobName,
|
|
namespace,
|
|
body: { propagationPolicy: "Background" },
|
|
});
|
|
} catch { /* best-effort — completion watcher will see 404 and settle */ }
|
|
return;
|
|
}
|
|
} else if (resp.status >= 500) {
|
|
void onLog("stderr", `[paperclip] keepalive: cancel poll returned HTTP ${resp.status} — transient, ignoring\n`).catch(() => {});
|
|
}
|
|
} catch {
|
|
// network error — transient, skip this tick
|
|
}
|
|
}
|
|
|
|
const silenceSec = Math.round((Date.now() - lastLogAt) / 1000);
|
|
void onLog("stdout", `[paperclip] keepalive — job ${jobName} running (${silenceSec}s since last output)\n`).catch(() => {});
|
|
})();
|
|
}, KEEPALIVE_INTERVAL_MS);
|
|
const wrappedOnLog: typeof onLog = async (stream, chunk) => {
|
|
lastLogAt = Date.now();
|
|
return onLog(stream, chunk);
|
|
};
|
|
|
|
// Track stream liveness so the grace timer below only fires when output
|
|
// has actually stopped — not on a transient K8s log-API reconnect that
|
|
// streamPodLogs heals on its own (FAR-107).
|
|
const streamActivity: { lastActiveAt: number; streamHasExited: boolean } = {
|
|
lastActiveAt: Date.now(),
|
|
streamHasExited: false,
|
|
};
|
|
const trackedLogStream = streamPodLogs(
|
|
namespace, podName, wrappedOnLog, kubeconfigPath, logStopSignal, logDedup,
|
|
streamActivity,
|
|
);
|
|
|
|
// completionWithGrace races waitForJobCompletion against a grace timer
|
|
// that fires LOG_EXIT_COMPLETION_GRACE_MS after the log stream exits.
|
|
// This bounds the stale-UI window when K8s Job conditions lag container
|
|
// exit (FAR-23): without it, waitForJobCompletion polls indefinitely
|
|
// while streamPodLogs reconnects, holding execute() open for minutes.
|
|
// logStopSignal.stopped is set on every settled path (fulfilled, rejected,
|
|
// or grace) so streamPodLogs stops reconnecting promptly.
|
|
type CompletionResult = { succeeded: boolean; timedOut: boolean; jobGone?: boolean; gracePeriodFired?: boolean };
|
|
let gracePoller: ReturnType<typeof setInterval> | null = null;
|
|
const completionWithGrace = new Promise<CompletionResult>((resolve, reject) => {
|
|
let settled = false;
|
|
const settleOk = (r: CompletionResult) => {
|
|
if (settled) return;
|
|
settled = true;
|
|
if (gracePoller) { clearInterval(gracePoller); gracePoller = null; }
|
|
logStopSignal.stopped = true;
|
|
resolve(r);
|
|
};
|
|
const settleErr = (err: unknown) => {
|
|
if (settled) return;
|
|
settled = true;
|
|
if (gracePoller) { clearInterval(gracePoller); gracePoller = null; }
|
|
logStopSignal.stopped = true;
|
|
reject(err);
|
|
};
|
|
waitForJobCompletion(namespace, jobName, completionTimeoutMs, kubeconfigPath, jobObserver).then(settleOk).catch(settleErr);
|
|
gracePoller = setInterval(() => {
|
|
// Only consider grace once the stream has exited at least once.
|
|
// Until then we are still in the warm-up window and
|
|
// waitForJobCompletion is the authoritative signal. Once the
|
|
// stream has exited, fire only after GRACE_MS of inactivity
|
|
// measured against the last received chunk — output that resumes
|
|
// through a reconnect resets the clock so transient drops do not
|
|
// truncate live runs (FAR-107).
|
|
if (
|
|
streamActivity.streamHasExited &&
|
|
Date.now() - streamActivity.lastActiveAt >= LOG_EXIT_COMPLETION_GRACE_MS
|
|
) {
|
|
// Stop the grace poller immediately so we don't double-fire while the
|
|
// verification read below is in flight.
|
|
if (gracePoller) { clearInterval(gracePoller); gracePoller = null; }
|
|
// The log stream exiting only means the container stopped producing
|
|
// output — it does NOT prove the Job was deleted. Verify Job
|
|
// presence with a one-shot read so we can distinguish:
|
|
// (a) Job 404 → truly gone (TTL or external deletion)
|
|
// (b) Job still present → K8s condition propagation lag (FAR-23)
|
|
// Without this check we mis-classify (b) as "deleted externally" and
|
|
// emit a false-positive k8s_job_deleted_externally error (FAR-107).
|
|
void (async () => {
|
|
try {
|
|
await getBatchApi(kubeconfigPath).readNamespacedJob({ name: jobName, namespace });
|
|
await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago without K8s Job condition update; Job ${jobName} still present — proceeding with captured output (FAR-23)\n`).catch(() => {});
|
|
settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true });
|
|
} catch (err: unknown) {
|
|
if (isK8s404(err)) {
|
|
jobGoneDetectionPath = "grace-period-verify-404";
|
|
jobGoneAt = Date.now();
|
|
await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago and Job ${jobName} is gone (TTL or external deletion) — proceeding with captured output (FAR-23)\n`).catch(() => {});
|
|
settleOk({ succeeded: false, timedOut: false, jobGone: true });
|
|
} else {
|
|
// K8s API hiccup — bail out without claiming external deletion.
|
|
await onLog("stdout", `[paperclip] Log stream exited ${LOG_EXIT_COMPLETION_GRACE_MS / 1000}s ago; Job state unverifiable (${err instanceof Error ? err.message : String(err)}) — proceeding with captured output (FAR-23)\n`).catch(() => {});
|
|
settleOk({ succeeded: false, timedOut: false, gracePeriodFired: true });
|
|
}
|
|
}
|
|
})();
|
|
}
|
|
}, 1_000);
|
|
});
|
|
|
|
const [logResult, completionResult] = await Promise.allSettled([
|
|
trackedLogStream,
|
|
completionWithGrace,
|
|
]);
|
|
|
|
// Stop the keepalive immediately once the job has reached a terminal
|
|
// state — do not wait for the finally block.
|
|
if (keepaliveTimer) {
|
|
clearInterval(keepaliveTimer);
|
|
keepaliveTimer = null;
|
|
}
|
|
|
|
// If the run was externally cancelled, return a clean cancelled result
|
|
// without processing stdout (the finally block still runs for cleanup).
|
|
if (cancelled) {
|
|
return {
|
|
exitCode: null,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorCode: "cancelled",
|
|
errorMessage: "Run cancelled",
|
|
};
|
|
}
|
|
|
|
if (logResult.status === "fulfilled") {
|
|
stdout = logResult.value;
|
|
}
|
|
|
|
// One-shot log fallback: handles two failure modes with a single read.
|
|
// Mode 1 — empty stream: the follow stream returned nothing (fast exit before connection).
|
|
// Mode 2 — partial stream: we have some output but no result event (follow stream raced
|
|
// with container exit and captured only the init line before the connection dropped).
|
|
// A one-shot readPodLogs is more reliable for already-terminated containers and reads
|
|
// from the beginning of the log, giving us the full output.
|
|
// We use a cheap string scan for the result-event guard (avoids a full JSON parse here;
|
|
// the authoritative parse happens once below after all fallbacks complete).
|
|
const hasResultEvent = stdout.split("\n").some((l) => { try { return JSON.parse(l).type === "result"; } catch { return false; } });
|
|
const needsOneShot = !stdout.trim() || (stdout.trim() && !hasResultEvent);
|
|
if (needsOneShot) {
|
|
if (!stdout.trim()) {
|
|
await onLog("stdout", `[paperclip] Log stream returned empty — reading pod logs directly...\n`);
|
|
}
|
|
const oneShotLogs = await readPodLogs(namespace, podName, kubeconfigPath);
|
|
if (!stdout.trim() && oneShotLogs.trim()) {
|
|
stdout = oneShotLogs;
|
|
const deduped = logDedup.filter(stdout) + logDedup.flush();
|
|
if (deduped) await onLog("stdout", deduped);
|
|
} else if (oneShotLogs && oneShotLogs.length > stdout.length) {
|
|
await onLog("stdout", `[paperclip] Log stream captured partial output — supplemental one-shot read returned more content.\n`);
|
|
const deduped = logDedup.filter(oneShotLogs) + logDedup.flush();
|
|
if (deduped) await onLog("stdout", deduped);
|
|
stdout = oneShotLogs;
|
|
}
|
|
}
|
|
|
|
if (completionResult.status === "fulfilled") {
|
|
jobTimedOut = completionResult.value.timedOut;
|
|
if (completionResult.value.jobGone) {
|
|
// Job was deleted by TTL or externally before we observed the Complete/Failed
|
|
// condition. The container must have exited first (TTL only fires after
|
|
// completion), so log streaming has captured the full output — continue
|
|
// to stdout parsing rather than returning an error.
|
|
jobDeletedExternally = true;
|
|
if (!jobGoneDetectionPath) {
|
|
jobGoneDetectionPath = "completion-poll-404";
|
|
jobGoneAt = Date.now();
|
|
}
|
|
await onLog("stdout", `[paperclip] Job ${jobName} was deleted before terminal condition was observed (TTL or external deletion) — proceeding with captured output.\n`);
|
|
}
|
|
} else {
|
|
// waitForJobCompletion threw an unexpected error — re-check job state to
|
|
// avoid returning while the job is still running. Use a bounded timeout
|
|
// (60s) so we don't hang the heartbeat indefinitely if the K8s API is degraded.
|
|
jobTimedOut = false;
|
|
const RECHECK_TIMEOUT_MS = 60_000;
|
|
const actualState = await waitForJobCompletion(namespace, jobName, RECHECK_TIMEOUT_MS, kubeconfigPath, jobObserver);
|
|
if (actualState.timedOut) {
|
|
// Re-check itself timed out — the job may still be running.
|
|
// Return an error so the UI knows the run is not done.
|
|
jobTimedOut = true;
|
|
} else if (actualState.jobGone) {
|
|
// Job was deleted before we could confirm terminal state — same as the
|
|
// fulfilled+jobGone case above: proceed with captured output.
|
|
jobDeletedExternally = true;
|
|
if (!jobGoneDetectionPath) {
|
|
jobGoneDetectionPath = "recheck-poll-404";
|
|
jobGoneAt = Date.now();
|
|
}
|
|
await onLog("stdout", `[paperclip] Job ${jobName} was deleted before terminal condition was observed (TTL or external deletion) — proceeding with captured output.\n`);
|
|
} else if (!actualState.succeeded) {
|
|
// Job still not terminal — the completion error was likely transient.
|
|
// Return an error so the UI knows the run is not done, rather than
|
|
// returning with parsed (potentially incomplete) stdout.
|
|
await onLog("stderr", `[paperclip] Job ${jobName} still not terminal after log/completion mismatch — returning error to keep UI in sync.\n`);
|
|
skipCleanup = true;
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Job ${jobName} did not complete cleanly (log stream ended before job reached terminal state)`,
|
|
errorCode: "k8s_job_state_mismatch",
|
|
};
|
|
}
|
|
}
|
|
|
|
podTerminatedState = await getPodTerminatedState(namespace, jobName, kubeconfigPath);
|
|
exitCode = podTerminatedState?.exitCode ?? null;
|
|
} finally {
|
|
if (keepaliveTimer) clearInterval(keepaliveTimer);
|
|
activeJobs.delete(activeJobRef);
|
|
if (skipCleanup) {
|
|
await onLog("stdout", `[paperclip] Retaining job ${jobName} (state mismatch — UI is waiting on it)\n`);
|
|
} else if (!retainJobs) {
|
|
await cleanupJob(namespace, jobName, onLog, kubeconfigPath);
|
|
} else {
|
|
await onLog("stdout", `[paperclip] Retaining job ${jobName} for debugging (retainJobs=true)\n`);
|
|
}
|
|
// Clean up prompt Secret if one was created
|
|
if (promptSecret) {
|
|
try {
|
|
await coreApi.deleteNamespacedSecret({ name: promptSecret.name, namespace: promptSecret.namespace });
|
|
} catch {
|
|
// Best-effort cleanup — TTL or manual deletion will catch stragglers
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parse Claude output (reuse claude_local parsing)
|
|
if (jobTimedOut) {
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: true,
|
|
errorMessage: `Timed out after ${timeoutSec}s`,
|
|
errorCode: "timeout",
|
|
};
|
|
}
|
|
|
|
const parsedStream = parseClaudeStreamJson(stdout);
|
|
const parsed = parsedStream.resultJson;
|
|
|
|
// If the session was stale, clear it so the next heartbeat starts fresh
|
|
if (parsed && (exitCode ?? 0) !== 0 && isClaudeUnknownSessionError(parsed)) {
|
|
await onLog("stdout", `[paperclip] Claude session is unavailable; clearing for next run.\n`);
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: describeClaudeFailure(parsed) ?? "Session unavailable",
|
|
errorCode: "session_unavailable",
|
|
clearSession: true,
|
|
resultJson: parsed,
|
|
};
|
|
}
|
|
|
|
if (!parsed) {
|
|
if (jobDeletedExternally && exitCode === null) {
|
|
// Forensic context (FAR-107): users sometimes see this error when nothing
|
|
// actually deleted the Job manually. Surface enough state in the message
|
|
// to distinguish self-delete (SIGTERM/cancel), TTL-after-completion, and
|
|
// genuine external deletion without needing cluster shell access.
|
|
const detailParts: string[] = [];
|
|
if (jobGoneDetectionPath) detailParts.push(`detected_via=${jobGoneDetectionPath}`);
|
|
detailParts.push(`job=${jobName}`);
|
|
detailParts.push(`ns=${namespace}`);
|
|
if (podRunningAt !== null && jobGoneAt !== null) {
|
|
detailParts.push(`elapsed_since_pod_running=${Math.round((jobGoneAt - podRunningAt) / 1000)}s`);
|
|
}
|
|
detailParts.push(`completion_polls=${jobObserver.pollCount}`);
|
|
const lastConds = jobObserver.lastConditions;
|
|
if (lastConds && lastConds.length > 0) {
|
|
const summary = lastConds
|
|
.map((c) => `${c.type}=${c.status}${c.reason ? `(${c.reason})` : ""}`)
|
|
.join(",");
|
|
detailParts.push(`last_job_conditions=[${summary}]`);
|
|
} else {
|
|
detailParts.push("last_job_conditions=none_observed");
|
|
}
|
|
detailParts.push(`stdout_bytes=${stdout.length}`);
|
|
const stdoutLines = stdout.split("\n").filter((l) => l.trim()).length;
|
|
detailParts.push(`stdout_nonempty_lines=${stdoutLines}`);
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `K8s Job was deleted externally before Claude could complete [${detailParts.join(", ")}]`,
|
|
errorCode: "k8s_job_deleted_externally",
|
|
resultJson: { stdout },
|
|
};
|
|
}
|
|
if (parsedStream.llmApiEmptyResponse) {
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: "LLM API returned an empty response (stop_reason: null, output_tokens: 0) — the upstream model API may be degraded or misconfigured",
|
|
errorCode: "llm_api_error",
|
|
resultJson: { stdout },
|
|
};
|
|
}
|
|
if (parsedStream.truncatedMidStream) {
|
|
// Re-query pod state with retry — the initial single-shot read can lose
|
|
// to kubelet propagation lag and surface a useless "pod state unavailable"
|
|
// message that hides the real exit cause (OOMKilled, SIGTERM, etc). The
|
|
// retry distinguishes pod-genuinely-gone from terminated-state-lag and
|
|
// gives the operator the actual exit code/reason where possible (FAR-107).
|
|
let lookup: PodLookupResult | undefined;
|
|
let refreshedState = podTerminatedState;
|
|
try {
|
|
lookup = await getPodLookupWithRetry(namespace, jobName, kubeconfigPath);
|
|
refreshedState = lookup.state;
|
|
if (refreshedState && refreshedState.exitCode !== null) {
|
|
exitCode = refreshedState.exitCode;
|
|
}
|
|
} catch (err) {
|
|
await onLog("stderr", `[paperclip] truncation diagnostic: pod re-query failed (${err instanceof Error ? err.message : String(err)})\n`).catch(() => {});
|
|
}
|
|
const cause = describeTruncationCause(refreshedState, lookup);
|
|
const modelHint = parsedStream.model ? ` (model: ${parsedStream.model})` : "";
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: `Claude run was truncated mid-stream${modelHint} — assistant produced content but no result event arrived; ${cause}`,
|
|
errorCode: "claude_truncated",
|
|
resultJson: { stdout },
|
|
};
|
|
}
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage: buildPartialRunError(exitCode, parsedStream.model, stdout, podTerminatedState),
|
|
resultJson: { stdout },
|
|
};
|
|
}
|
|
|
|
const usage = parsedStream.usage ?? (() => {
|
|
const usageObj = parseObject(parsed.usage as Record<string, unknown>);
|
|
return {
|
|
inputTokens: asNumber(usageObj.input_tokens, 0),
|
|
cachedInputTokens: asNumber(usageObj.cache_read_input_tokens, 0),
|
|
outputTokens: asNumber(usageObj.output_tokens, 0),
|
|
};
|
|
})();
|
|
|
|
const fallbackSessionId = currentSessionIdRaw;
|
|
const resolvedSessionId = parsedStream.sessionId
|
|
?? (asString(parsed.session_id as string, fallbackSessionId) || fallbackSessionId);
|
|
const model = asString(config.model, "");
|
|
const workspaceContext = parseObject(ctx.context.paperclipWorkspace);
|
|
const workspaceId = asString(workspaceContext.workspaceId, "") || null;
|
|
const workspaceRepoUrl = asString(workspaceContext.repoUrl, "") || null;
|
|
const workspaceRepoRef = asString(workspaceContext.repoRef, "") || null;
|
|
const cwd = asString(workspaceContext.cwd, "");
|
|
|
|
const resolvedSessionParams = resolvedSessionId
|
|
? {
|
|
sessionId: resolvedSessionId,
|
|
...(cwd ? { cwd } : {}),
|
|
...(workspaceId ? { workspaceId } : {}),
|
|
...(workspaceRepoUrl ? { repoUrl: workspaceRepoUrl } : {}),
|
|
...(workspaceRepoRef ? { repoRef: workspaceRepoRef } : {}),
|
|
} as Record<string, unknown>
|
|
: null;
|
|
|
|
const clearSessionForMaxTurns = isClaudeMaxTurnsResult(parsed);
|
|
|
|
return {
|
|
exitCode,
|
|
signal: null,
|
|
timedOut: false,
|
|
errorMessage:
|
|
(exitCode ?? 0) === 0
|
|
? null
|
|
: describeClaudeFailure(parsed) ?? `Claude exited with code ${exitCode ?? -1}`,
|
|
usage,
|
|
sessionId: resolvedSessionId || null,
|
|
sessionParams: resolvedSessionParams,
|
|
sessionDisplayId: resolvedSessionId || null,
|
|
provider: "anthropic",
|
|
model: parsedStream.model || asString(parsed.model as string, model),
|
|
billingType: "api",
|
|
costUsd: parsedStream.costUsd ?? asNumber(parsed.total_cost_usd, 0),
|
|
resultJson: parsed,
|
|
summary: parsedStream.summary || asString(parsed.result as string, ""),
|
|
clearSession: clearSessionForMaxTurns,
|
|
} as AdapterExecutionResult;
|
|
}
|