From c7706d742f2bf94f2b38d5fc0c93fcee78b9cd8e Mon Sep 17 00:00:00 2001 From: Gandalf the Greybeard Date: Thu, 23 Apr 2026 16:36:51 +0000 Subject: [PATCH] 0.1.31: harden streamPodLogsOnce with Promise.race bail (FAR-10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defensive follow-up to the FAR-10 fix. The original patch aborts the in-flight follow stream by destroying the Writable once stopSignal fires, and relies on the @kubernetes/client-node library propagating that destroy into an abort of the underlying HTTP request. If that propagation ever fails (e.g. the client is awaiting a response that never arrives), logApi.log() can still hang forever. Adds a Promise.race with a 3s bail timer that starts when stopSignal fires. In the happy path (destroy-propagation works), logApi.log() resolves first and the bail timer is cleared. In the failure path, the bail timer fires and streamPodLogsOnce returns with whatever chunks were captured — preventing the hang from reaching execute(). No test change: existing 267 tests pass and the race path needs a k8s mock to exercise end-to-end; validated by monitoring real runs. Co-Authored-By: Paperclip --- package-lock.json | 4 ++-- package.json | 2 +- src/server/execute.ts | 42 +++++++++++++++++++++++++++++++++--------- 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/package-lock.json b/package-lock.json index cc175b6..97b5e4f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "paperclip-adapter-claude-k8s", - "version": "0.1.30", + "version": "0.1.31", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "paperclip-adapter-claude-k8s", - "version": "0.1.30", + "version": "0.1.31", "license": "MIT", "dependencies": { "@kubernetes/client-node": "^1.0.0", diff --git a/package.json b/package.json index f62db1e..fb6c5c9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "paperclip-adapter-claude-k8s", - "version": "0.1.30", + "version": "0.1.31", "description": "Paperclip adapter plugin that runs Claude Code agents as Kubernetes Jobs", "license": "MIT", "repository": { diff --git a/src/server/execute.ts b/src/server/execute.ts index 026da2a..da1df61 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -20,6 +20,11 @@ const MAX_LOG_RECONNECT_ATTEMPTS = 50; // Covers the cleanup path (delete job, parse stdout) so a slow K8s API call // doesn't trip the 5-minute reaper staleness window. const POST_TERMINAL_KEEPALIVE_MS = 90_000; +// Upper bound on how long streamPodLogsOnce will wait after stopSignal fires +// before force-returning, even if logApi.log has not yet resolved. Defensive +// against the K8s client library not propagating writable.destroy() into an +// abort of the underlying HTTP request. +const LOG_STREAM_BAIL_TIMEOUT_MS = 3_000; /** * Detect a Kubernetes 404 (Not Found) error from @kubernetes/client-node. @@ -272,25 +277,44 @@ async function streamPodLogsOnce( // in-flight follow stream. Without this, logApi.log can hang indefinitely // when the pod terminates without closing the HTTP connection cleanly. let stopPoller: ReturnType | null = null; + let bailTimer: ReturnType | null = null; + let bailResolve: (() => void) | null = null; + // Bail promise resolves LOG_STREAM_BAIL_TIMEOUT_MS after stopSignal fires, + // even if logApi.log has not resolved by then. This is a safety net for the + // case where writable.destroy() fails to propagate to an abort of the HTTP + // request (e.g. the K8s client is awaiting a response that never comes). + const bailPromise = new Promise((resolve) => { + bailResolve = resolve; + }); if (stopSignal) { stopPoller = setInterval(() => { - if (stopSignal.stopped && !writable.destroyed) { - writable.destroy(); + if (stopSignal.stopped) { + if (!writable.destroyed) writable.destroy(); + if (!bailTimer && bailResolve) { + bailTimer = setTimeout(bailResolve, LOG_STREAM_BAIL_TIMEOUT_MS); + } } }, 200); } - try { - await logApi.log(namespace, podName, "claude", writable, { - follow: true, - pretty: false, - ...(sinceSeconds ? { sinceSeconds } : {}), - }); - } catch { + const logPromise = logApi.log(namespace, podName, "claude", writable, { + follow: true, + pretty: false, + ...(sinceSeconds ? { sinceSeconds } : {}), + }).catch(() => { // follow may fail if the container already exited, the API connection // dropped, or we aborted via writable.destroy() — not fatal. + }); + + try { + if (stopSignal) { + await Promise.race([logPromise, bailPromise]); + } else { + await logPromise; + } } finally { if (stopPoller) clearInterval(stopPoller); + if (bailTimer) clearTimeout(bailTimer); } return chunks.join("");