fix: P0 correctness fixes from FAR-104/FAR-105 analysis

1. Inherit envFrom and env.valueFrom from self pod — secrets wired via valueFrom.secretKeyRef or envFrom.secretRef are now forwarded to Job pods, fixing credentials silently dropped for K8s-idiomatic secret patterns (e.g. ANTHROPIC_API_KEY via Secret). 2. Distinguish 404 vs transient errors in keepalive — only mark the keepalive as terminal on 404 (Job deleted). Transient 5xx/connection errors are logged and retried on the next tick, preventing premature reaper kills during API instability. 3. Fail closed on concurrency-guard read failure — a failing listNamespacedJob now returns k8s_concurrency_guard_unreachable instead of silently proceeding, protecting against zombie Jobs on shared PVCs. 4. Bound the waitForJobCompletion re-check — pass a 60s timeout instead of polling forever, preventing indefinite hangs when the K8s API is degraded. Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-20 18:57:16 +00:00
parent c35253ddd4
commit d74b6d34b3
4 changed files with 111 additions and 14 deletions
@@ -356,8 +356,20 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
        };
      }
    }
-  } catch {
-    // If we can't check, proceed — the heartbeat service enforces concurrency too
+  } catch (err: unknown) {
+    // If we can't list jobs, fail closed — the K8s concurrency guard is the
+    // only thing preventing zombie Jobs on a shared PVC from corrupting
+    // sessions.  404 (namespace not found) is treated as a hard failure;
+    // other errors (5xx, network) are also surfaced.
+    const msg = err instanceof Error ? err.message : String(err);
+    await onLog("stderr", `[paperclip] Concurrency guard failed: unable to list jobs: ${msg}\n`);
+    return {
+      exitCode: null,
+      signal: null,
+      timedOut: false,
+      errorMessage: `Concurrency guard unreachable: ${msg}`,
+      errorCode: "k8s_concurrency_guard_unreachable",
+    };
  }

  // Build Job manifest
@@ -486,11 +498,21 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
            keepaliveJobTerminal = true;
            return;
          }
-        } catch {
-          // Job may have been deleted out from under us, or the API call
-          // transiently failed.  Either way, do not refresh updatedAt —
-          // either the Job really is gone, or the next tick will re-check.
-          keepaliveJobTerminal = true;
+        } catch (err: unknown) {
+          // Only treat 404 (Job deleted) as terminal.  Transient 5xx or
+          // connection resets should NOT permanently disable the keepalive —
+          // the next tick will re-check and the reaper uses the staleness
+          // window as a safety net.
+          const statusCode = (err as { response?: { statusCode?: number } })?.response?.statusCode
+            ?? (err as { statusCode?: number })?.statusCode;
+          if (statusCode === 404) {
+            keepaliveJobTerminal = true;
+            return;
+          }
+          // Log transient errors but leave keepaliveJobTerminal false so
+          // the next tick retries.
+          const msg = err instanceof Error ? err.message : String(err);
+          void onLog("stderr", `[paperclip] keepalive: transient error checking job status: ${msg}\n`).catch(() => {});
          return;
        }

@@ -550,11 +572,14 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
    } else {
      // waitForJobCompletion threw — re-check job state to avoid returning
      // while the job is still running (which would cause UI staleness and
-      // concurrency errors on retry).
+      // concurrency errors on retry).  Use a bounded timeout (60s) so we
+      // don't hang the heartbeat indefinitely if the K8s API is degraded.
      jobTimedOut = false;
-      const actualState = await waitForJobCompletion(namespace, jobName, 0, kubeconfigPath);
+      const RECHECK_TIMEOUT_MS = 60_000;
+      const actualState = await waitForJobCompletion(namespace, jobName, RECHECK_TIMEOUT_MS, kubeconfigPath);
      if (actualState.timedOut) {
-        // Truly a timeout after re-check — treat as timed out.
+        // Re-check itself timed out — the job may still be running.
+        // Return an error so the UI knows the run is not done.
        jobTimedOut = true;
      } else if (!actualState.succeeded) {
        // Job still not terminal — the completion error was likely transient.