Reconnect K8s log stream on silent API disconnects

The adapter opened a single follow-stream to the K8s API for pod logs.
If that TCP connection silently dropped (API server hiccup, network
timeout, load-balancer idle cut), streamPodLogs returned early and no
more real Claude output reached the UI — only keepalive pings.  The
pod kept running and producing logs (visible via kubectl), but the
adapter never reconnected.

Splits streamPodLogs into streamPodLogsOnce (a single follow attempt) and
a reconnecting wrapper that retries with sinceSeconds until a shared stop
signal fires, which happens once waitForJobCompletion resolves.  On
reconnect, the wrapper requests logs starting from the original stream
start time (plus a 5s overlap) so no output is lost; the UI deduplicates
the repeated chunks.

Bumps version to 0.1.12.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
2026-04-13 10:34:41 +00:00
parent e760bf9386
commit 77ba40d9bf
5 changed files with 80 additions and 11 deletions
+17 -1
View File
@@ -4,6 +4,7 @@ import { getSelfPodInfo, getBatchApi, getCoreApi, getLogApi } from "./k8s-client
import { buildJobManifest } from "./job-manifest.js";
import { Writable } from "node:stream";
// Polling interval in milliseconds (2 s).
// NOTE(review): the consumers of this constant are outside this view —
// presumably the pod/Job status polling loops; confirm.
const POLL_INTERVAL_MS = 2000;
// Interval in milliseconds (15 s) between the keepalive status lines that
// execute() emits via onLog while the job is running.
const KEEPALIVE_INTERVAL_MS = 15_000;
/**
* Wait for the Job's pod to reach a terminal or running state.
* Returns the pod name once logs can be streamed, or throws on failure.
@@ -270,6 +271,7 @@ export async function execute(ctx) {
let stdout = "";
let exitCode = null;
let jobTimedOut = false;
let keepaliveTimer = null;
try {
// Wait for pod to be ready for log streaming
const scheduleTimeoutMs = 120_000; // 2 minutes for scheduling
@@ -294,8 +296,20 @@ export async function execute(ctx) {
// We also poll the Job status to detect deadline exceeded.
// 0 = no timeout (run indefinitely, matching claude_local behavior)
const completionTimeoutMs = timeoutSec > 0 ? (timeoutSec + graceSec) * 1000 : 0;
// Keepalive: periodically send a status line via onLog so the
// Paperclip server knows the adapter is still alive even when the
// pod produces no output (e.g. Claude is in a long thinking phase).
let lastLogAt = Date.now();
keepaliveTimer = setInterval(() => {
const silenceSec = Math.round((Date.now() - lastLogAt) / 1000);
void onLog("stdout", `[paperclip] keepalive — job ${jobName} running (${silenceSec}s since last output)\n`);
}, KEEPALIVE_INTERVAL_MS);
const wrappedOnLog = async (stream, chunk) => {
lastLogAt = Date.now();
return onLog(stream, chunk);
};
const [logResult, completionResult] = await Promise.allSettled([
streamPodLogs(namespace, podName, onLog, kubeconfigPath),
streamPodLogs(namespace, podName, wrappedOnLog, kubeconfigPath),
waitForJobCompletion(namespace, jobName, completionTimeoutMs, kubeconfigPath),
]);
if (logResult.status === "fulfilled") {
@@ -319,6 +333,8 @@ export async function execute(ctx) {
exitCode = await getPodExitCode(namespace, jobName, kubeconfigPath);
}
finally {
if (keepaliveTimer)
clearInterval(keepaliveTimer);
if (!retainJobs) {
await cleanupJob(namespace, jobName, onLog, kubeconfigPath);
}