fix: resolve grace-period deadlock for stale UI status (FAR-23)

The log-stream-exit grace timer never fired because logExitTime was set
in the .then() of streamPodLogs, which only resolves once stopSignal is
set — but stopSignal is only set when completionWithGrace fires, which
requires logExitTime to be non-null. Classic deadlock.

Fix: add onFirstStreamExit callback to streamPodLogs, called after
attempt=0's streamPodLogsOnce returns (the first container exit signal).
execute() passes a closure that sets logExitTime immediately, breaking
the circular dependency and allowing the 30s grace timer to fire
correctly when K8s Job conditions lag container exit.

Tests: all 323 pass including the two FAR-23 grace-period regression tests.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
2026-04-24 12:20:10 +00:00
parent a7dfd5d502
commit f9d8a2e0ce
2 changed files with 154 additions and 5 deletions
+89
View File
@@ -1102,6 +1102,95 @@ describe("execute: waitForPod edge cases", () => {
});
});
// ─── execute: grace-period fallback (FAR-23) ─────────────────────────────────
describe("execute: log-stream-exit grace period (FAR-23)", () => {
// Tests verify that execute() resolves within the grace window even when
// waitForJobCompletion keeps polling after the log stream exits (K8s
// condition propagation lag).
beforeEach(() => {
vi.resetAllMocks();
vi.useFakeTimers();
mockGetSelfPodInfo.mockResolvedValue(makeSelfPodResult());
mockBatchListJobs.mockResolvedValue({ items: [] });
mockPrepareBundle.mockResolvedValue(makeBundle());
mockBatchCreateJob.mockResolvedValue({ metadata: { uid: "job-uid-1" } });
mockBatchPatchJob.mockResolvedValue({});
mockBatchDeleteJob.mockResolvedValue({});
mockCoreDeleteSecret.mockResolvedValue({});
mockCoreListPods
.mockResolvedValueOnce({
items: [{
metadata: { name: "pod-abc" },
status: { phase: "Running", containerStatuses: [], initContainerStatuses: [] },
}],
})
.mockResolvedValue({
items: [{
metadata: { name: "pod-abc" },
status: { containerStatuses: [{ name: "claude", state: { terminated: { exitCode: 0 } } }] },
}],
});
});
afterEach(() => {
vi.useRealTimers();
});
it("resolves via grace (jobGone) when log stream exits but job condition never arrives", async () => {
// logApi.log returns immediately (container exited) — log stream exits on first attempt.
mockLogFn.mockImplementation(async () => {});
// One-shot read returns full Claude output (no reconnects needed for output)
mockCoreReadPodLog.mockResolvedValue(CLAUDE_HAPPY_OUTPUT);
// waitForJobCompletion never detects terminal — simulates K8s condition lag.
mockBatchReadJob.mockResolvedValue({ status: { conditions: [] } }); // never terminal
// No timeoutSec → completionTimeoutMs=0 → polls indefinitely without grace.
const executePromise = execute(makeCtx());
// Advance past:
// ~4200ms of readPaperclipRuntimeSkillEntries real I/O (multiple small advances)
// 3100ms: streamPodLogs reconnect sleep (attempt=1, stopSignal still false at entry)
// + 30100ms: grace period (LOG_EXIT_COMPLETION_GRACE_MS = 30s)
// + 3100ms: streamPodLogs bail timer (stopSignal was set by grace → bail fires)
await vi.advanceTimersByTimeAsync(2_100);
await vi.advanceTimersByTimeAsync(1_100);
await vi.advanceTimersByTimeAsync(1_000);
await vi.advanceTimersByTimeAsync(3_100); // reconnect sleep
await vi.advanceTimersByTimeAsync(30_100); // grace fires
await vi.advanceTimersByTimeAsync(3_500); // bail timer + margin
const result = await executePromise;
// Grace fires → jobGone=true → execute proceeds with one-shot logs → success
expect(result.exitCode).toBe(0);
expect(result.sessionId).toBe("sess_test123");
expect(mockCoreReadPodLog).toHaveBeenCalled();
}, 60_000);
it("resolves promptly via real completion when job condition arrives before grace", async () => {
// Log stream exits immediately then job condition arrives well within the grace period.
mockLogFn.mockImplementation(
async (_ns: string, _pod: string, _ctr: string, writable: import("node:stream").Writable) => {
writable.write(CLAUDE_HAPPY_OUTPUT);
},
);
// Job condition appears quickly (< 30s grace period)
mockBatchReadJob.mockResolvedValue({
status: { conditions: [{ type: "Complete", status: "True" }] },
});
const executePromise = execute(makeCtx());
await vi.advanceTimersByTimeAsync(3_100);
const result = await executePromise;
expect(result.exitCode).toBe(0);
expect(result.sessionId).toBe("sess_test123");
// One-shot fallback should NOT be needed since the stream captured full output
// (grace did not fire, real completion arrived)
expect(result.errorMessage).toBeNull();
});
});
// ─── execute: concurrency guard — multiple orphan sorting ────────────────────
describe("execute: concurrency guard — multiple orphans", () => {