diff --git a/package-lock.json b/package-lock.json index cc175b6..97b5e4f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "paperclip-adapter-claude-k8s", - "version": "0.1.30", + "version": "0.1.31", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "paperclip-adapter-claude-k8s", - "version": "0.1.30", + "version": "0.1.31", "license": "MIT", "dependencies": { "@kubernetes/client-node": "^1.0.0", diff --git a/package.json b/package.json index f62db1e..fb6c5c9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "paperclip-adapter-claude-k8s", - "version": "0.1.30", + "version": "0.1.31", "description": "Paperclip adapter plugin that runs Claude Code agents as Kubernetes Jobs", "license": "MIT", "repository": { diff --git a/src/server/execute.ts b/src/server/execute.ts index 026da2a..da1df61 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -20,6 +20,11 @@ const MAX_LOG_RECONNECT_ATTEMPTS = 50; // Covers the cleanup path (delete job, parse stdout) so a slow K8s API call // doesn't trip the 5-minute reaper staleness window. const POST_TERMINAL_KEEPALIVE_MS = 90_000; +// Upper bound on how long streamPodLogsOnce will wait after stopSignal fires +// before force-returning, even if logApi.log has not yet resolved. Defensive +// against the K8s client library not propagating writable.destroy() into an +// abort of the underlying HTTP request. +const LOG_STREAM_BAIL_TIMEOUT_MS = 3_000; /** * Detect a Kubernetes 404 (Not Found) error from @kubernetes/client-node. @@ -272,25 +277,44 @@ async function streamPodLogsOnce( // in-flight follow stream. Without this, logApi.log can hang indefinitely // when the pod terminates without closing the HTTP connection cleanly. let stopPoller: ReturnType | null = null; + let bailTimer: ReturnType | null = null; + let bailResolve: (() => void) | null = null; + // Bail promise resolves LOG_STREAM_BAIL_TIMEOUT_MS after stopSignal fires, + // even if logApi.log has not resolved by then. This is a safety net for the + // case where writable.destroy() fails to propagate to an abort of the HTTP + // request (e.g. the K8s client is awaiting a response that never comes). + const bailPromise = new Promise((resolve) => { + bailResolve = resolve; + }); if (stopSignal) { stopPoller = setInterval(() => { - if (stopSignal.stopped && !writable.destroyed) { - writable.destroy(); + if (stopSignal.stopped) { + if (!writable.destroyed) writable.destroy(); + if (!bailTimer && bailResolve) { + bailTimer = setTimeout(bailResolve, LOG_STREAM_BAIL_TIMEOUT_MS); + } } }, 200); } - try { - await logApi.log(namespace, podName, "claude", writable, { - follow: true, - pretty: false, - ...(sinceSeconds ? { sinceSeconds } : {}), - }); - } catch { + const logPromise = logApi.log(namespace, podName, "claude", writable, { + follow: true, + pretty: false, + ...(sinceSeconds ? { sinceSeconds } : {}), + }).catch(() => { // follow may fail if the container already exited, the API connection // dropped, or we aborted via writable.destroy() — not fatal. + }); + + try { + if (stopSignal) { + await Promise.race([logPromise, bailPromise]); + } else { + await logPromise; + } } finally { if (stopPoller) clearInterval(stopPoller); + if (bailTimer) clearTimeout(bailTimer); } return chunks.join("");