forked from farhoodlabs/paperclip
fix: harden heartbeat and adapter runtime workflows
This commit is contained in:
@@ -49,10 +49,70 @@ function spawnAliveProcess() {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Reports whether a process with the given pid currently exists.
 *
 * Probes with signal 0, which performs the existence/permission check
 * without delivering a signal.
 *
 * @param pid - Candidate process id; anything that is not a positive integer
 *   is treated as "not alive".
 * @returns `true` when the process exists (including when it exists but the
 *   current user is not permitted to signal it), otherwise `false`.
 */
function isPidAlive(pid: number | null | undefined): boolean {
  if (typeof pid !== "number" || !Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (error) {
    // EPERM means the target exists but belongs to another user: it is alive.
    // Every other failure (notably ESRCH) is treated as "no such process".
    return (
      typeof error === "object" &&
      error !== null &&
      (error as { code?: unknown }).code === "EPERM"
    );
  }
}
|
||||
|
||||
/**
 * Polls until the given process has exited or the timeout elapses.
 *
 * @param pid - Process id to watch.
 * @param timeoutMs - Maximum time to wait, in milliseconds (default 2000).
 * @returns `true` if the process was observed to be gone before or at the
 *   deadline, `false` if it was still alive at the final check.
 */
async function waitForPidExit(pid: number, timeoutMs = 2_000): Promise<boolean> {
  const pollIntervalMs = 50;
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    if (!isPidAlive(pid)) return true;
    const remainingMs = deadline - Date.now();
    // Deadline reached (or timeout was non-positive/NaN): the liveness check
    // just above was the final one.
    if (!(remainingMs > 0)) return false;
    // Never sleep past the deadline, so the caller's timeout is honoured.
    await new Promise<void>((resolve) => setTimeout(resolve, Math.min(pollIntervalMs, remainingMs)));
  }
}
|
||||
|
||||
/**
 * Spawns a detached "leader" Node process that itself spawns a long-lived
 * child, prints that child's pid on stdout, and exits shortly afterwards.
 * Once the leader is gone the child is left running without its parent.
 *
 * @returns The leader's pid (reported as both `processPid` and
 *   `processGroupId`; `null` if the pid was unavailable) and the pid of the
 *   still-running descendant.
 * @throws If the leader cannot be spawned, or if no valid descendant pid was
 *   printed on its stdout.
 */
async function spawnOrphanedProcessGroup(): Promise<{
  processPid: number | null;
  processGroupId: number | null;
  descendantPid: number;
}> {
  // Script run by the leader: start a child that idles forever, report its
  // pid, then exit so the child outlives its parent.
  const leaderScript = [
    "const { spawn } = require('node:child_process');",
    "const child = spawn(process.execPath, ['-e', 'setInterval(() => {}, 1000)'], { stdio: 'ignore' });",
    "process.stdout.write(String(child.pid));",
    "setTimeout(() => process.exit(0), 25);",
  ].join(" ");

  const leader = spawn(process.execPath, ["-e", leaderScript], {
    detached: true,
    stdio: ["ignore", "pipe", "ignore"],
  });

  let stdout = "";
  leader.stdout?.on("data", (chunk) => {
    stdout += String(chunk);
  });

  await new Promise<void>((resolve, reject) => {
    leader.once("error", reject);
    // Wait for "close" rather than "exit": "exit" can fire while stdout data
    // is still buffered in the pipe, which would make the pid parse below
    // race against the "data" handler.
    leader.once("close", () => resolve());
  });

  const descendantPid = Number.parseInt(stdout.trim(), 10);
  if (!Number.isInteger(descendantPid) || descendantPid <= 0) {
    throw new Error(`Failed to capture orphaned descendant pid from detached process group: ${stdout}`);
  }

  const leaderPid = leader.pid ?? null;
  return {
    processPid: leaderPid,
    processGroupId: leaderPid,
    descendantPid,
  };
}
|
||||
|
||||
describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
let db!: ReturnType<typeof createDb>;
|
||||
let tempDb: Awaited<ReturnType<typeof startEmbeddedPostgresTestDatabase>> | null = null;
|
||||
const childProcesses = new Set<ChildProcess>();
|
||||
const cleanupPids = new Set<number>();
|
||||
|
||||
beforeAll(async () => {
|
||||
tempDb = await startEmbeddedPostgresTestDatabase("paperclip-heartbeat-recovery-");
|
||||
@@ -66,6 +126,14 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
child.kill("SIGKILL");
|
||||
}
|
||||
childProcesses.clear();
|
||||
for (const pid of cleanupPids) {
|
||||
try {
|
||||
process.kill(pid, "SIGKILL");
|
||||
} catch {
|
||||
// Ignore already-dead cleanup targets.
|
||||
}
|
||||
}
|
||||
cleanupPids.clear();
|
||||
await db.delete(issues);
|
||||
await db.delete(heartbeatRunEvents);
|
||||
await db.delete(heartbeatRuns);
|
||||
@@ -79,6 +147,14 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
child.kill("SIGKILL");
|
||||
}
|
||||
childProcesses.clear();
|
||||
for (const pid of cleanupPids) {
|
||||
try {
|
||||
process.kill(pid, "SIGKILL");
|
||||
} catch {
|
||||
// Ignore already-dead cleanup targets.
|
||||
}
|
||||
}
|
||||
cleanupPids.clear();
|
||||
runningProcesses.clear();
|
||||
await tempDb?.cleanup();
|
||||
});
|
||||
@@ -88,6 +164,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
agentStatus?: "paused" | "idle" | "running";
|
||||
runStatus?: "running" | "queued" | "failed";
|
||||
processPid?: number | null;
|
||||
processGroupId?: number | null;
|
||||
processLossRetryCount?: number;
|
||||
includeIssue?: boolean;
|
||||
runErrorCode?: string | null;
|
||||
@@ -143,6 +220,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
wakeupRequestId,
|
||||
contextSnapshot: input?.includeIssue === false ? {} : { issueId },
|
||||
processPid: input?.processPid ?? null,
|
||||
processGroupId: input?.processGroupId ?? null,
|
||||
processLossRetryCount: input?.processLossRetryCount ?? 0,
|
||||
errorCode: input?.runErrorCode ?? null,
|
||||
error: input?.runError ?? null,
|
||||
@@ -228,6 +306,45 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
expect(issue?.checkoutRunId).toBe(runId);
|
||||
});
|
||||
|
||||
// Scenario: the recorded leader pid has exited, but a descendant it spawned is
// still running. The test expects reapOrphanedRuns() to report the run as
// reaped, the descendant to exit, the run to be marked failed with
// "process_lost", and a queued retry run to be linked to the issue.
// Skipped on win32.
it.skipIf(process.platform === "win32")("reaps orphaned descendant process groups when the parent pid is already gone", async () => {
  const orphan = await spawnOrphanedProcessGroup();
  // Registered for teardown so the descendant is SIGKILLed even if an
  // assertion below fails.
  cleanupPids.add(orphan.descendantPid);
  // Preconditions named in the test title: leader gone, descendant alive.
  expect(isPidAlive(orphan.processPid)).toBe(false);
  expect(isPidAlive(orphan.descendantPid)).toBe(true);

  const { agentId, runId, issueId } = await seedRunFixture({
    processPid: orphan.processPid,
    processGroupId: orphan.processGroupId,
  });
  const heartbeat = heartbeatService(db);

  const result = await heartbeat.reapOrphanedRuns();
  expect(result.reaped).toBe(1);
  expect(result.runIds).toEqual([runId]);

  expect(await waitForPidExit(orphan.descendantPid, 2_000)).toBe(true);

  const runs = await db
    .select()
    .from(heartbeatRuns)
    .where(eq(heartbeatRuns.agentId, agentId));
  // The seeded run plus the retry run.
  expect(runs).toHaveLength(2);

  const failedRun = runs.find((row) => row.id === runId);
  expect(failedRun?.status).toBe("failed");
  expect(failedRun?.errorCode).toBe("process_lost");
  expect(failedRun?.error).toContain("descendant process group");

  const retryRun = runs.find((row) => row.id !== runId);
  expect(retryRun?.status).toBe("queued");

  const issueRows = await db
    .select()
    .from(issues)
    .where(eq(issues.id, issueId));
  const issue = issueRows[0] ?? null;
  expect(issue?.executionRunId).toBe(retryRun?.id ?? null);
});
|
||||
|
||||
it("does not queue a second retry after the first process-loss retry was already used", async () => {
|
||||
const { agentId, runId, issueId } = await seedRunFixture({
|
||||
processPid: 999_999_999,
|
||||
|
||||
Reference in New Issue
Block a user