[codex] Add runtime lifecycle recovery and live issue visibility (#4419)

This commit is contained in:
Dotta
2026-04-24 15:50:32 -05:00
committed by GitHub
parent 9a8d219949
commit 5a0c1979cf
121 changed files with 9625 additions and 2044 deletions
@@ -1,6 +1,6 @@
import { randomUUID } from "node:crypto";
import { spawn, type ChildProcess } from "node:child_process";
import { eq, or, inArray } from "drizzle-orm";
import { and, eq, or, inArray } from "drizzle-orm";
import { afterAll, afterEach, beforeAll, describe, expect, it, vi } from "vitest";
import {
activityLog,
@@ -17,6 +17,8 @@ import {
issueComments,
issueDocuments,
issueRelations,
issueTreeHoldMembers,
issueTreeHolds,
issues,
} from "@paperclipai/db";
import {
@@ -309,6 +311,8 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
await db.delete(documentRevisions);
await db.delete(documents);
await db.delete(issueRelations);
await db.delete(issueTreeHoldMembers);
await db.delete(issueTreeHolds);
for (let attempt = 0; attempt < 5; attempt += 1) {
await db.delete(issueComments);
await db.delete(issueDocuments);
@@ -454,11 +458,13 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
runStatus: "failed" | "timed_out" | "cancelled" | "succeeded";
retryReason?: "assignment_recovery" | "issue_continuation_needed" | null;
assignToUser?: boolean;
activePauseHold?: boolean;
}) {
const companyId = randomUUID();
const agentId = randomUUID();
const runId = randomUUID();
const wakeupRequestId = randomUUID();
const rootIssueId = randomUUID();
const issueId = randomUUID();
const now = new Date("2026-03-19T00:00:00.000Z");
const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`;
@@ -520,22 +526,128 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
error: input.runStatus === "succeeded" ? null : "run failed before issue advanced",
});
await db.insert(issues).values({
id: issueId,
companyId,
title: "Recover stranded assigned work",
status: input.status,
await db.insert(issues).values([
...(input.activePauseHold
? [{
id: rootIssueId,
companyId,
title: "Paused recovery root",
status: "todo",
priority: "medium",
issueNumber: 1,
identifier: `${issuePrefix}-1`,
}]
: []),
{
id: issueId,
companyId,
parentId: input.activePauseHold ? rootIssueId : null,
title: "Recover stranded assigned work",
status: input.status,
priority: "medium",
assigneeAgentId: input.assignToUser ? null : agentId,
assigneeUserId: input.assignToUser ? "user-1" : null,
checkoutRunId: input.status === "in_progress" ? runId : null,
executionRunId: null,
issueNumber: input.activePauseHold ? 2 : 1,
identifier: `${issuePrefix}-${input.activePauseHold ? 2 : 1}`,
startedAt: input.status === "in_progress" ? now : null,
},
]);
if (input.activePauseHold) {
await db.insert(issueTreeHolds).values({
companyId,
rootIssueId,
mode: "pause",
status: "active",
reason: "pause recovery subtree",
releasePolicy: { strategy: "manual" },
});
}
return { companyId, agentId, runId, wakeupRequestId, issueId, rootIssueId };
}
async function expectStrandedRecoveryArtifacts(input: {
companyId: string;
agentId: string;
issueId: string;
runId: string;
previousStatus: "todo" | "in_progress";
retryReason: "assignment_recovery" | "issue_continuation_needed";
}) {
const recovery = await waitForValue(async () =>
db.select().from(issues).where(
and(
eq(issues.companyId, input.companyId),
eq(issues.originKind, "stranded_issue_recovery"),
eq(issues.originId, input.issueId),
),
).then((rows) => rows[0] ?? null),
);
if (!recovery) throw new Error("Expected stranded issue recovery issue to be created");
expect(recovery).toMatchObject({
companyId: input.companyId,
parentId: input.issueId,
assigneeAgentId: input.agentId,
originKind: "stranded_issue_recovery",
originId: input.issueId,
originRunId: input.runId,
priority: "medium",
assigneeAgentId: input.assignToUser ? null : agentId,
assigneeUserId: input.assignToUser ? "user-1" : null,
checkoutRunId: input.status === "in_progress" ? runId : null,
executionRunId: null,
issueNumber: 1,
identifier: `${issuePrefix}-1`,
startedAt: input.status === "in_progress" ? now : null,
});
expect(recovery.title).toContain("Recover stalled issue");
expect(recovery.description).toContain(`Previous source status: \`${input.previousStatus}\``);
expect(recovery.description).toContain(`Retry reason: \`${input.retryReason}\``);
expect(recovery.description).toContain("Fix the runtime/adapter problem");
const relation = await db
.select()
.from(issueRelations)
.where(
and(
eq(issueRelations.companyId, input.companyId),
eq(issueRelations.issueId, recovery.id),
eq(issueRelations.relatedIssueId, input.issueId),
eq(issueRelations.type, "blocks"),
),
)
.then((rows) => rows[0] ?? null);
expect(relation).toBeTruthy();
const wakeups = await db
.select()
.from(agentWakeupRequests)
.where(eq(agentWakeupRequests.agentId, input.agentId));
const recoveryWakeup = wakeups.find((wakeup) => {
const payload = wakeup.payload as Record<string, unknown> | null;
return payload?.issueId === recovery.id &&
payload?.sourceIssueId === input.issueId &&
payload?.strandedRunId === input.runId;
});
expect(recoveryWakeup).toMatchObject({
companyId: input.companyId,
reason: "issue_assigned",
source: "assignment",
});
return { companyId, agentId, runId, wakeupRequestId, issueId };
const recoveryRun = recoveryWakeup?.runId
? await db
.select()
.from(heartbeatRuns)
.where(eq(heartbeatRuns.id, recoveryWakeup.runId))
.then((rows) => rows[0] ?? null)
: null;
expect(recoveryRun?.contextSnapshot).toMatchObject({
issueId: recovery.id,
taskId: recovery.id,
source: "stranded_issue_recovery",
sourceIssueId: input.issueId,
strandedRunId: input.runId,
});
return recovery;
}
async function seedQueuedIssueRunFixture() {
@@ -728,11 +840,28 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
it("blocks the issue when process-loss retry is exhausted and the immediate continuation recovery also fails", async () => {
mockAdapterExecute.mockRejectedValueOnce(new Error("continuation recovery failed"));
const { agentId, runId, issueId } = await seedRunFixture({
const { companyId, agentId, runId, issueId } = await seedRunFixture({
agentStatus: "idle",
processPid: 999_999_999,
processLossRetryCount: 1,
});
const resolvedBlockerId = randomUUID();
const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`;
await db.insert(issues).values({
id: resolvedBlockerId,
companyId,
title: "Already completed prerequisite",
status: "done",
priority: "medium",
issueNumber: 2,
identifier: `${issuePrefix}-2`,
});
await db.insert(issueRelations).values({
companyId,
issueId: resolvedBlockerId,
relatedIssueId: issueId,
type: "blocks",
});
const heartbeat = heartbeatService(db);
const result = await heartbeat.reapOrphanedRuns();
@@ -759,7 +888,29 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
);
expect(blockedIssue?.status).toBe("blocked");
expect(blockedIssue?.executionRunId).toBeNull();
expect(blockedIssue?.checkoutRunId).toBe(continuationRun?.id ?? null);
expect(blockedIssue?.checkoutRunId).toBeNull();
if (!continuationRun?.id) throw new Error("Expected continuation recovery run to exist");
const recovery = await expectStrandedRecoveryArtifacts({
companyId,
agentId,
issueId,
runId: continuationRun.id,
previousStatus: "in_progress",
retryReason: "issue_continuation_needed",
});
const blockerRelations = await db
.select()
.from(issueRelations)
.where(
and(
eq(issueRelations.companyId, companyId),
eq(issueRelations.relatedIssueId, issueId),
eq(issueRelations.type, "blocks"),
),
);
expect(blockerRelations.map((relation) => relation.issueId)).toEqual([recovery.id]);
const comments = await waitForValue(async () => {
const rows = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
@@ -767,6 +918,49 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
});
expect(comments).toHaveLength(1);
expect(comments[0]?.body).toContain("retried continuation");
expect(comments[0]?.body).toContain(`Recovery issue: [${recovery.identifier}]`);
});
it("does not block paused-tree work when immediate continuation recovery is suppressed by the hold", async () => {
const { companyId, agentId, runId, issueId } = await seedRunFixture({
agentStatus: "idle",
processPid: 999_999_999,
processLossRetryCount: 1,
});
await db.insert(issueTreeHolds).values({
companyId,
rootIssueId: issueId,
mode: "pause",
status: "active",
reason: "pause immediate recovery subtree",
releasePolicy: { strategy: "manual" },
});
const heartbeat = heartbeatService(db);
const result = await heartbeat.reapOrphanedRuns();
expect(result.reaped).toBe(1);
expect(result.runIds).toEqual([runId]);
const runs = await db
.select()
.from(heartbeatRuns)
.where(eq(heartbeatRuns.agentId, agentId));
expect(runs).toHaveLength(1);
expect(runs[0]?.status).toBe("failed");
const issue = await db.select().from(issues).where(eq(issues.id, issueId)).then((rows) => rows[0] ?? null);
expect(issue?.status).toBe("in_progress");
expect(issue?.executionRunId).toBeNull();
expect(issue?.checkoutRunId).toBe(runId);
const recoveryIssues = await db
.select()
.from(issues)
.where(and(eq(issues.companyId, companyId), eq(issues.originKind, "stranded_issue_recovery")));
expect(recoveryIssues).toHaveLength(0);
const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
expect(comments).toHaveLength(0);
});
it("schedules a bounded retry for codex transient upstream failures instead of blocking the issue immediately", async () => {
@@ -901,7 +1095,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
});
it("blocks assigned todo work after the one automatic dispatch recovery was already used", async () => {
const { issueId } = await seedStrandedIssueFixture({
const { companyId, agentId, issueId, runId } = await seedStrandedIssueFixture({
status: "todo",
runStatus: "failed",
retryReason: "assignment_recovery",
@@ -916,10 +1110,20 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
const issue = await db.select().from(issues).where(eq(issues.id, issueId)).then((rows) => rows[0] ?? null);
expect(issue?.status).toBe("blocked");
const recovery = await expectStrandedRecoveryArtifacts({
companyId,
agentId,
issueId,
runId,
previousStatus: "todo",
retryReason: "assignment_recovery",
});
const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
expect(comments).toHaveLength(1);
expect(comments[0]?.body).toContain("retried dispatch");
expect(comments[0]?.body).toContain("Latest retry failure: `process_lost` - run failed before issue advanced.");
expect(comments[0]?.body).toContain(`Recovery issue: [${recovery.identifier}]`);
});
it("assigns open unassigned blockers back to their creator agent", async () => {
@@ -1206,7 +1410,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
expect(wakes.some((row) => row.reason === "run_liveness_continuation")).toBe(false);
});
it("blocks stranded in-progress work after the continuation retry was already used", async () => {
const { issueId } = await seedStrandedIssueFixture({
const { companyId, agentId, issueId, runId } = await seedStrandedIssueFixture({
status: "in_progress",
runStatus: "failed",
retryReason: "issue_continuation_needed",
@@ -1221,10 +1425,65 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
const issue = await db.select().from(issues).where(eq(issues.id, issueId)).then((rows) => rows[0] ?? null);
expect(issue?.status).toBe("blocked");
const recovery = await expectStrandedRecoveryArtifacts({
companyId,
agentId,
issueId,
runId,
previousStatus: "in_progress",
retryReason: "issue_continuation_needed",
});
const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
expect(comments).toHaveLength(1);
expect(comments[0]?.body).toContain("retried continuation");
expect(comments[0]?.body).toContain("Latest retry failure: `process_lost` - run failed before issue advanced.");
expect(comments[0]?.body).toContain(`Recovery issue: [${recovery.identifier}]`);
});
it("does not escalate paused-tree recovery when the automatic continuation retry was cancelled by the hold", async () => {
const { companyId, agentId, issueId } = await seedStrandedIssueFixture({
status: "in_progress",
runStatus: "cancelled",
retryReason: "issue_continuation_needed",
activePauseHold: true,
});
const heartbeat = heartbeatService(db);
const result = await heartbeat.reconcileStrandedAssignedIssues();
expect(result.dispatchRequeued).toBe(0);
expect(result.continuationRequeued).toBe(0);
expect(result.escalated).toBe(0);
expect(result.skipped).toBe(1);
expect(result.issueIds).toEqual([]);
const issue = await db.select().from(issues).where(eq(issues.id, issueId)).then((rows) => rows[0] ?? null);
expect(issue?.status).toBe("in_progress");
expect(issue?.checkoutRunId).toBeTruthy();
const recoveryIssues = await db
.select()
.from(issues)
.where(and(eq(issues.companyId, companyId), eq(issues.originKind, "stranded_issue_recovery")));
expect(recoveryIssues).toHaveLength(0);
const blockerRelations = await db
.select()
.from(issueRelations)
.where(
and(
eq(issueRelations.companyId, companyId),
eq(issueRelations.relatedIssueId, issueId),
eq(issueRelations.type, "blocks"),
),
);
expect(blockerRelations).toHaveLength(0);
const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
expect(comments).toHaveLength(0);
const wakeups = await db.select().from(agentWakeupRequests).where(eq(agentWakeupRequests.agentId, agentId));
expect(wakeups).toHaveLength(1);
});
it("re-enqueues continuation when the latest automatic continuation succeeded without closing the issue", async () => {