From 5f5ae92ce7d5e0676b5c3134c8c2c663d774497e Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 17 Apr 2026 02:57:05 +0000 Subject: [PATCH] fix: skip keepalive updatedAt refresh once K8s Job is terminal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix (df856e6) made the keepalive timer call onSpawn every ~4 minutes to refresh the run's updatedAt in the DB, so the stale-run reaper wouldn't kill live runs in multi-instance deployments. That was correct for live jobs, but it was unconditional — if execute() stalled after the pod terminated (slow K8s API call, hung log stream drain, or a Job whose Complete condition lags pod termination), the keepalive kept the run marked "alive" indefinitely even though the pod was gone. That manifests as the opposite of the original bug: the UI shows jobs as running when they have actually finished. Two changes: 1. Verify the Job is still alive before the keepalive refreshes updatedAt. If the Job has reached a terminal Complete/Failed condition (or has been deleted / the API read fails), stop refreshing. If execute() truly ends up stuck past that point, the reaper will catch the run within the normal 5-minute staleness window instead of never. 2. Clear the keepalive interval immediately once Promise.allSettled resolves, rather than only in the finally block. Post-completion work (exit-code fetch, log fallback read, job cleanup) must not be able to emit another onSpawn refresh that keeps the run "alive". Co-Authored-By: Paperclip --- src/server/execute.ts | 61 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/src/server/execute.ts b/src/server/execute.ts index 6770c5a..b5a4b91 100644 --- a/src/server/execute.ts +++ b/src/server/execute.ts @@ -456,18 +456,54 @@ export async function execute(ctx: AdapterExecutionContext): Promise { - const silenceSec = Math.round((Date.now() - lastLogAt) / 1000); - void onLog("stdout", `[paperclip] keepalive — job ${jobName} running (${silenceSec}s since last output)\n`); + // Fire-and-forget the async work; setInterval callbacks must be + // synchronous or the timer will drift. + void (async () => { + if (keepaliveJobTerminal) return; - // Refresh updatedAt every ~4 minutes (16 ticks × 15s) to stay - // well within the 5-minute reaper staleness window. - keepaliveTick++; - if (ctx.onSpawn && keepaliveTick % 16 === 0) { - void ctx.onSpawn({ pid: -1, processGroupId: null, startedAt: new Date().toISOString() }).catch(() => {}); - } + // Verify the Job is still alive before announcing or refreshing. + try { + const job = await batchApi.readNamespacedJob({ name: jobName, namespace }); + const terminal = job.status?.conditions?.some( + (c) => (c.type === "Complete" || c.type === "Failed") && c.status === "True", + ); + if (terminal) { + keepaliveJobTerminal = true; + return; + } + } catch { + // Job may have been deleted out from under us, or the API call + // transiently failed. Either way, do not refresh updatedAt — + // either the Job really is gone, or the next tick will re-check. + keepaliveJobTerminal = true; + return; + } + + const silenceSec = Math.round((Date.now() - lastLogAt) / 1000); + void onLog("stdout", `[paperclip] keepalive — job ${jobName} running (${silenceSec}s since last output)\n`); + + // Refresh updatedAt every ~4 minutes (16 ticks × 15s) to stay + // well within the 5-minute reaper staleness window. + keepaliveTick++; + if (ctx.onSpawn && keepaliveTick % 16 === 0) { + void ctx.onSpawn({ pid: -1, processGroupId: null, startedAt: new Date().toISOString() }).catch(() => {}); + } + })(); }, KEEPALIVE_INTERVAL_MS); const wrappedOnLog: typeof onLog = async (stream, chunk) => { lastLogAt = Date.now(); @@ -486,6 +522,15 @@ export async function execute(ctx: AdapterExecutionContext): Promise