fix: harden heartbeat and adapter runtime workflows

This commit is contained in:
Dotta
2026-04-10 22:26:21 -05:00
parent 548721248e
commit c566a9236c
48 changed files with 14922 additions and 600 deletions
@@ -49,10 +49,70 @@ function spawnAliveProcess() {
});
}
/**
 * Reports whether a process with the given pid currently exists.
 *
 * Uses signal 0, which performs the existence/permission check without
 * delivering a signal to the target.
 *
 * @param pid Candidate process id; anything that is not a positive integer
 *   is treated as "not alive" without probing.
 * @returns `true` when the process exists (even if this process lacks
 *   permission to signal it), `false` otherwise.
 */
function isPidAlive(pid: number | null | undefined) {
  if (typeof pid !== "number" || !Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (error) {
    // EPERM means the process exists but we are not allowed to signal it, so
    // it is still alive. Any other failure (typically ESRCH) means it is gone.
    return (error as { code?: unknown } | null)?.code === "EPERM";
  }
}
/**
 * Polls until the given pid is no longer alive or the timeout elapses.
 *
 * @param pid Process id to watch.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @returns `true` if the process was observed gone, including on one final
 *   check made after the deadline; `false` if it is still alive.
 */
async function waitForPidExit(pid: number, timeoutMs = 2_000) {
  const pollIntervalMs = 50;
  const stopAt = Date.now() + timeoutMs;
  for (;;) {
    if (Date.now() >= stopAt) break;
    if (!isPidAlive(pid)) return true;
    await new Promise((wake) => setTimeout(wake, pollIntervalMs));
  }
  // One last probe so an exit that happened during the final sleep is not missed.
  const stillAlive = isPidAlive(pid);
  return !stillAlive;
}
/**
 * Spawns a detached "leader" Node process that starts a long-lived child,
 * prints the child's pid to stdout, and then exits shortly afterwards. The
 * child keeps running after its leader has gone.
 *
 * @returns The leader's pid as both `processPid` and `processGroupId`
 *   (`null` if the pid was never assigned), and the surviving child's pid as
 *   `descendantPid`.
 * @throws If the leader fails to spawn, or if its stdout does not contain a
 *   positive integer pid.
 */
async function spawnOrphanedProcessGroup() {
  const leaderScript = [
    "const { spawn } = require('node:child_process');",
    "const child = spawn(process.execPath, ['-e', 'setInterval(() => {}, 1000)'], { stdio: 'ignore' });",
    "process.stdout.write(String(child.pid));",
    "setTimeout(() => process.exit(0), 25);",
  ].join(" ");
  const leader = spawn(process.execPath, ["-e", leaderScript], {
    detached: true,
    stdio: ["ignore", "pipe", "ignore"],
  });
  let stdout = "";
  leader.stdout?.on("data", (chunk) => {
    stdout += String(chunk);
  });
  await new Promise<void>((resolve, reject) => {
    leader.once("error", reject);
    // Wait for "close", not "exit": "exit" can fire while the stdout pipe still
    // has unread data, which would make the pid parse below race with delivery.
    // The descendant uses stdio "ignore", so it does not hold this pipe open.
    leader.once("close", () => resolve());
  });
  const descendantPid = Number.parseInt(stdout.trim(), 10);
  if (!Number.isInteger(descendantPid) || descendantPid <= 0) {
    throw new Error(`Failed to capture orphaned descendant pid from detached process group: ${stdout}`);
  }
  return {
    processPid: leader.pid ?? null,
    processGroupId: leader.pid ?? null,
    descendantPid,
  };
}
describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
let db!: ReturnType<typeof createDb>;
let tempDb: Awaited<ReturnType<typeof startEmbeddedPostgresTestDatabase>> | null = null;
const childProcesses = new Set<ChildProcess>();
const cleanupPids = new Set<number>();
beforeAll(async () => {
tempDb = await startEmbeddedPostgresTestDatabase("paperclip-heartbeat-recovery-");
@@ -66,6 +126,14 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
child.kill("SIGKILL");
}
childProcesses.clear();
for (const pid of cleanupPids) {
try {
process.kill(pid, "SIGKILL");
} catch {
// Ignore already-dead cleanup targets.
}
}
cleanupPids.clear();
await db.delete(issues);
await db.delete(heartbeatRunEvents);
await db.delete(heartbeatRuns);
@@ -79,6 +147,14 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
child.kill("SIGKILL");
}
childProcesses.clear();
for (const pid of cleanupPids) {
try {
process.kill(pid, "SIGKILL");
} catch {
// Ignore already-dead cleanup targets.
}
}
cleanupPids.clear();
runningProcesses.clear();
await tempDb?.cleanup();
});
@@ -88,6 +164,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
agentStatus?: "paused" | "idle" | "running";
runStatus?: "running" | "queued" | "failed";
processPid?: number | null;
processGroupId?: number | null;
processLossRetryCount?: number;
includeIssue?: boolean;
runErrorCode?: string | null;
@@ -143,6 +220,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
wakeupRequestId,
contextSnapshot: input?.includeIssue === false ? {} : { issueId },
processPid: input?.processPid ?? null,
processGroupId: input?.processGroupId ?? null,
processLossRetryCount: input?.processLossRetryCount ?? 0,
errorCode: input?.runErrorCode ?? null,
error: input?.runError ?? null,
@@ -228,6 +306,45 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
expect(issue?.checkoutRunId).toBe(runId);
});
// Covers the case where a run's recorded leader process has exited but a
// descendant it started is still running. Skipped when process.platform is
// "win32" (presumably because the detached process-group setup is POSIX-only;
// confirm before enabling there).
it.skipIf(process.platform === "win32")("reaps orphaned descendant process groups when the parent pid is already gone", async () => {
// The helper resolves only after the leader has exited, so from here on only
// the descendant is expected to be running.
const orphan = await spawnOrphanedProcessGroup();
// Register the descendant so the suite teardown kills it if the reaper does not.
cleanupPids.add(orphan.descendantPid);
// Sanity check: the descendant must be alive before the reaper runs.
expect(isPidAlive(orphan.descendantPid)).toBe(true);
// Seed a run that points at the exited leader's pid and its process group id.
const { agentId, runId, issueId } = await seedRunFixture({
processPid: orphan.processPid,
processGroupId: orphan.processGroupId,
});
const heartbeat = heartbeatService(db);
const result = await heartbeat.reapOrphanedRuns();
// Exactly the seeded run should be reported as reaped.
expect(result.reaped).toBe(1);
expect(result.runIds).toEqual([runId]);
// The surviving descendant is expected to be gone within two seconds.
expect(await waitForPidExit(orphan.descendantPid, 2_000)).toBe(true);
const runs = await db
.select()
.from(heartbeatRuns)
.where(eq(heartbeatRuns.agentId, agentId));
// Two rows are expected: the original run plus one follow-up run.
expect(runs).toHaveLength(2);
// The original run is marked failed with the process-loss error code, and its
// message must mention the descendant process group.
const failedRun = runs.find((row) => row.id === runId);
expect(failedRun?.status).toBe("failed");
expect(failedRun?.errorCode).toBe("process_lost");
expect(failedRun?.error).toContain("descendant process group");
// The other row is the retry, which should be waiting in the queue.
const retryRun = runs.find((row) => row.id !== runId);
expect(retryRun?.status).toBe("queued");
const issue = await db
.select()
.from(issues)
.where(eq(issues.id, issueId))
.then((rows) => rows[0] ?? null);
// The issue's execution pointer should now reference the retry run.
expect(issue?.executionRunId).toBe(retryRun?.id ?? null);
});
it("does not queue a second retry after the first process-loss retry was already used", async () => {
const { agentId, runId, issueId } = await seedRunFixture({
processPid: 999_999_999,