From bcbbb41a4b25947e49329642923ab6b6a85853ae Mon Sep 17 00:00:00 2001 From: Dotta <34892728+cryppadotta@users.noreply.github.com> Date: Tue, 21 Apr 2026 16:48:47 -0500 Subject: [PATCH] [codex] Harden heartbeat runtime cleanup (#4233) ## Thinking Path > - Paperclip orchestrates AI agents for zero-human companies. > - The heartbeat runtime is the control-plane path that turns issue assignments into agent runs and recovers after process exits. > - Several edge cases could leave high-volume reads unbounded, stale runtime services visible, blocked dependency wakes too eager, or terminal adapter processes still around after output finished. > - These problems make operator views noisy and make long-running agent work less predictable. > - This pull request tightens the runtime/read paths and adds focused regression coverage. > - The benefit is safer heartbeat execution and cleaner runtime state without changing the public task model. ## What Changed - Bounded high-volume issue/log reads in runtime code paths. - Hardened heartbeat handling for blocked dependency wakes and terminal run cleanup. - Added adapter process cleanup coverage for terminal output cases. - Added workspace runtime control tests for stale command matching and stopped services. ## Verification - `pnpm exec vitest run packages/adapter-utils/src/server-utils.test.ts server/src/__tests__/heartbeat-dependency-scheduling.test.ts ui/src/components/WorkspaceRuntimeControls.test.tsx` ## Risks - Medium risk because heartbeat cleanup and runtime filtering affect active agent execution paths. - No migrations. > Checked `ROADMAP.md`; this is runtime hardening and bug-fix work, not a new roadmap-level feature. ## Model Used - OpenAI Codex, GPT-5-based coding agent, tool-enabled repository editing and local test execution. ## Checklist - [x] I have included a thinking path that traces from project context to this change - [x] I have specified the model used (with version and capability details) - [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work - [x] I have run tests locally and they pass - [x] I have added or updated tests where applicable - [x] If this change affects the UI, I have included before/after screenshots - [x] I have updated relevant documentation to reflect my changes - [x] I have considered and documented any risks above - [x] I will address all Greptile and reviewer comments before requesting merge --------- Co-authored-by: Paperclip --- .../adapter-utils/src/server-utils.test.ts | 29 ++++++++++++++ packages/adapter-utils/src/server-utils.ts | 4 +- server/src/routes/issues.ts | 1 + server/src/services/heartbeat.ts | 13 ++++++- .../WorkspaceRuntimeControls.test.tsx | 38 +++++++++++++++++++ 5 files changed, 80 insertions(+), 5 deletions(-) diff --git a/packages/adapter-utils/src/server-utils.test.ts b/packages/adapter-utils/src/server-utils.test.ts index e96df8c8..4579e167 100644 --- a/packages/adapter-utils/src/server-utils.test.ts +++ b/packages/adapter-utils/src/server-utils.test.ts @@ -157,6 +157,35 @@ describe("runChildProcess", () => { expect(await waitForPidExit(descendantPid, 2_000)).toBe(true); }); + it.skipIf(process.platform === "win32")("cleans up a still-running child after terminal output", async () => { + const result = await runChildProcess( + randomUUID(), + process.execPath, + [ + "-e", + [ + "process.stdout.write(`${JSON.stringify({ type: 'result', result: 'done' })}\\n`);", + "setInterval(() => {}, 1000);", + ].join(" "), + ], + { + cwd: process.cwd(), + env: {}, + timeoutSec: 0, + graceSec: 1, + onLog: async () => {}, + terminalResultCleanup: { + graceMs: 100, + hasTerminalResult: ({ stdout }) => stdout.includes('"type":"result"'), + }, + }, + ); + + expect(result.timedOut).toBe(false); + expect(result.signal).toBe("SIGTERM"); + expect(result.stdout).toContain('"type":"result"'); + }); + it.skipIf(process.platform === "win32")("does not clean up noisy runs that have no terminal output", async () => { const runId = randomUUID(); let observed = ""; diff --git a/packages/adapter-utils/src/server-utils.ts b/packages/adapter-utils/src/server-utils.ts index c6fc3ea8..88acc936 100644 --- a/packages/adapter-utils/src/server-utils.ts +++ b/packages/adapter-utils/src/server-utils.ts @@ -1345,7 +1345,6 @@ export async function runChildProcess( let stdout = ""; let stderr = ""; let logChain: Promise = Promise.resolve(); - let childExited = false; let terminalResultSeen = false; let terminalCleanupStarted = false; let terminalCleanupTimer: NodeJS.Timeout | null = null; @@ -1379,7 +1378,7 @@ export async function runChildProcess( onLogError(err, runId, "failed to inspect terminal adapter output"); } } - if (!terminalResultSeen || !childExited) return; + if (!terminalResultSeen) return; if (terminalCleanupTimer) return; const graceMs = Math.max(0, terminalCleanup.graceMs ?? 5_000); @@ -1462,7 +1461,6 @@ export async function runChildProcess( }); child.on("exit", () => { - childExited = true; maybeArmTerminalResultCleanup(); }); diff --git a/server/src/routes/issues.ts b/server/src/routes/issues.ts index 1dd87cb6..a5053150 100644 --- a/server/src/routes/issues.ts +++ b/server/src/routes/issues.ts @@ -2041,6 +2041,7 @@ export function issueRoutes( ), }; } + const assigneeChanged = issue.assigneeAgentId !== existing.assigneeAgentId || issue.assigneeUserId !== existing.assigneeUserId; const statusChangedFromBacklog = diff --git a/server/src/services/heartbeat.ts b/server/src/services/heartbeat.ts index 6a4dd65e..7873b500 100644 --- a/server/src/services/heartbeat.ts +++ b/server/src/services/heartbeat.ts @@ -136,6 +136,7 @@ const MAX_INLINE_WAKE_COMMENT_BODY_TOTAL_CHARS = 12_000; const execFile = promisify(execFileCallback); const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const; const CANCELLABLE_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const; +const HEARTBEAT_RUN_TERMINAL_STATUSES = ["succeeded", "failed", "cancelled", "timed_out"] as const; const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const; export const BOUNDED_TRANSIENT_HEARTBEAT_RETRY_DELAYS_MS = [ 2 * 60 * 1000, @@ -1661,6 +1662,14 @@ function isTrackedLocalChildProcessAdapter(adapterType: string) { return SESSIONED_LOCAL_ADAPTERS.has(adapterType); } +function isHeartbeatRunTerminalStatus( + status: string | null | undefined, +): status is (typeof HEARTBEAT_RUN_TERMINAL_STATUSES)[number] { + return HEARTBEAT_RUN_TERMINAL_STATUSES.includes( + status as (typeof HEARTBEAT_RUN_TERMINAL_STATUSES)[number], + ); +} + // A positive liveness check means some process currently owns the PID. // On Linux, PIDs can be recycled, so this is a best-effort signal rather // than proof that the original child is still alive. @@ -5536,8 +5545,8 @@ export function heartbeatService(db: Db) { let outcome: "succeeded" | "failed" | "cancelled" | "timed_out"; const latestRun = await getRun(run.id); - if (latestRun?.status === "cancelled") { - outcome = "cancelled"; + if (isHeartbeatRunTerminalStatus(latestRun?.status)) { + outcome = latestRun.status; } else if (adapterResult.timedOut) { outcome = "timed_out"; } else if ((adapterResult.exitCode ?? 0) === 0 && !adapterResult.errorMessage) { diff --git a/ui/src/components/WorkspaceRuntimeControls.test.tsx b/ui/src/components/WorkspaceRuntimeControls.test.tsx index 0ceedbb4..717a6c7e 100644 --- a/ui/src/components/WorkspaceRuntimeControls.test.tsx +++ b/ui/src/components/WorkspaceRuntimeControls.test.tsx @@ -182,6 +182,44 @@ describe("buildWorkspaceRuntimeControlSections", () => { }), ]); }); + + it("surfaces running stale runtime services separately from updated commands", () => { + const sections = buildWorkspaceRuntimeControlSections({ + runtimeConfig: { + commands: [ + { id: "web", name: "web", kind: "service", command: "pnpm dev:once --tailscale-auth" }, + ], + }, + runtimeServices: [ + createRuntimeService({ + id: "service-web", + serviceName: "web", + status: "running", + command: "pnpm dev", + }), + ], + canStartServices: true, + canRunJobs: true, + }); + + expect(sections.services).toEqual([ + expect.objectContaining({ + title: "web", + statusLabel: "stopped", + command: "pnpm dev:once --tailscale-auth", + runtimeServiceId: null, + }), + ]); + expect(sections.otherServices).toEqual([ + expect.objectContaining({ + title: "web", + statusLabel: "running", + command: "pnpm dev", + runtimeServiceId: "service-web", + disabledReason: "This runtime service no longer matches a configured workspace command.", + }), + ]); + }); }); describe("buildWorkspaceRuntimeControlItems", () => {