diff --git a/src/server/execute.ts b/src/server/execute.ts
index 424e3ed..c313f15 100644
--- a/src/server/execute.ts
+++ b/src/server/execute.ts
@@ -16,6 +16,19 @@ const KEEPALIVE_INTERVAL_MS = 15_000;
 const LOG_STREAM_RECONNECT_DELAY_MS = 3_000;
 const MAX_LOG_RECONNECT_ATTEMPTS = 50;
 
+/**
+ * Detect a Kubernetes 404 (Not Found) error from @kubernetes/client-node.
+ * Works for both v0.x (response.statusCode) and v1.0+ (code, response.status, message).
+ */
+function isK8s404(err: unknown): boolean {
+  if (!(err instanceof Error)) return false;
+  const e = err as unknown as Record<string, unknown>;
+  const resp = e.response as Record<string, unknown> | undefined;
+  if (resp?.statusCode === 404 || resp?.status === 404) return true;
+  if (e.statusCode === 404 || e.code === 404) return true;
+  return /HTTP-Code:\s*404\b/.test(err.message);
+}
+
 /**
  * Wait for the Job's pod to reach a terminal or running state.
  * Returns the pod name once logs can be streamed, or throws on failure.
@@ -251,19 +264,32 @@ async function readPodLogs(
 
 /**
  * Wait for the Job to reach a terminal state (Complete or Failed).
- * Returns the Job's final status.
+ * Returns the Job's final status. A 404 (job deleted by TTL or externally)
+ * is treated as a soft terminal: succeeded=false, timedOut=false, jobGone=true.
+ * Any other read error is rethrown; callers should log jobGone and parse stdout.
  */
 async function waitForJobCompletion(
   namespace: string,
   jobName: string,
   timeoutMs: number,
   kubeconfigPath?: string,
-): Promise<{ succeeded: boolean; timedOut: boolean }> {
+): Promise<{ succeeded: boolean; timedOut: boolean; jobGone?: boolean }> {
   const batchApi = getBatchApi(kubeconfigPath);
   const deadline = timeoutMs > 0 ? Date.now() + timeoutMs : 0;
 
   while (deadline === 0 || Date.now() < deadline) {
-    const job = await batchApi.readNamespacedJob({ name: jobName, namespace });
+    let job;
+    try {
+      job = await batchApi.readNamespacedJob({ name: jobName, namespace });
+    } catch (err: unknown) {
+      if (isK8s404(err)) {
+        // Job was deleted (TTL garbage collection or external deletion) before
+        // we detected its terminal condition. After a TTL cleanup the Job had
+        // already finished; presumably log streaming captured its output.
+        return { succeeded: false, timedOut: false, jobGone: true };
+      }
+      throw err;
+    }
 
     const conditions = job.status?.conditions ?? [];
     const complete = conditions.find((c) => c.type === "Complete" && c.status === "True");
@@ -561,9 +587,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise