forked from farhoodlabs/paperclip
[codex] Harden heartbeat scheduling and runtime controls (#4223)
## Thinking Path > - Paperclip orchestrates AI agents through issue checkout, heartbeat runs, routines, and auditable control-plane state > - The runtime path has to recover from lost local processes, transient adapter failures, blocked dependencies, and routine coalescing without stranding work > - The existing branch carried several reliability fixes across heartbeat scheduling, issue runtime controls, routine dispatch, and operator-facing run state > - These changes belong together because they share backend contracts, migrations, and runtime status semantics > - This pull request groups the control-plane/runtime slice so it can merge independently from board UI polish and adapter sandbox work > - The benefit is safer heartbeat recovery, clearer runtime controls, and more predictable recurring execution behavior ## What Changed - Adds bounded heartbeat retry scheduling, scheduled retry state, and Codex transient failure recovery handling. - Tightens heartbeat process recovery, blocker wake behavior, issue comment wake handling, routine dispatch coalescing, and activity/dashboard bounds. - Adds runtime-control MCP tools and Paperclip skill docs for issue workspace runtime management. - Adds migrations `0061_lively_thor_girl.sql` and `0062_routine_run_dispatch_fingerprint.sql`. - Surfaces retry state in run ledger/agent UI and keeps related shared types synchronized. ## Verification - `pnpm exec vitest run server/src/__tests__/heartbeat-retry-scheduling.test.ts server/src/__tests__/heartbeat-process-recovery.test.ts server/src/__tests__/routines-service.test.ts` - `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server` ## Risks - Medium risk: this touches heartbeat recovery and routine dispatch, which are central execution paths. - Migration order matters if split branches land out of order: merge this PR before branches that assume the new runtime/routine fields. - Runtime retry behavior should be watched in CI and in local operator smoke tests because it changes how transient failures are resumed. > For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`. ## Model Used - OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use enabled. Exact hosted model build and context window are not exposed in this Paperclip heartbeat environment. ## Checklist - [x] I have included a thinking path that traces from project context to this change - [x] I have specified the model used (with version and capability details) - [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work - [x] I have run tests locally and they pass - [x] I have added or updated tests where applicable - [ ] If this change affects the UI, I have included before/after screenshots - [x] I have updated relevant documentation to reflect my changes - [x] I have considered and documented any risks above - [x] I will address all Greptile and reviewer comments before requesting merge
This commit is contained in:
@@ -16,6 +16,7 @@ import {
|
||||
heartbeatRuns,
|
||||
issueComments,
|
||||
issueDocuments,
|
||||
issueRelations,
|
||||
issues,
|
||||
} from "@paperclipai/db";
|
||||
import {
|
||||
@@ -231,6 +232,7 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
await db.delete(issueDocuments);
|
||||
await db.delete(documentRevisions);
|
||||
await db.delete(documents);
|
||||
await db.delete(issueRelations);
|
||||
await db.delete(issues);
|
||||
await db.delete(heartbeatRunEvents);
|
||||
await db.delete(heartbeatRuns);
|
||||
@@ -441,6 +443,87 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
return { companyId, agentId, runId, wakeupRequestId, issueId };
|
||||
}
|
||||
|
||||
async function seedQueuedIssueRunFixture() {
|
||||
const companyId = randomUUID();
|
||||
const agentId = randomUUID();
|
||||
const runId = randomUUID();
|
||||
const wakeupRequestId = randomUUID();
|
||||
const issueId = randomUUID();
|
||||
const now = new Date("2026-03-19T00:00:00.000Z");
|
||||
const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`;
|
||||
|
||||
await db.insert(companies).values({
|
||||
id: companyId,
|
||||
name: "Paperclip",
|
||||
issuePrefix,
|
||||
requireBoardApprovalForNewAgents: false,
|
||||
});
|
||||
|
||||
await db.insert(agents).values({
|
||||
id: agentId,
|
||||
companyId,
|
||||
name: "CodexCoder",
|
||||
role: "engineer",
|
||||
status: "idle",
|
||||
adapterType: "codex_local",
|
||||
adapterConfig: {},
|
||||
runtimeConfig: {
|
||||
heartbeat: {
|
||||
wakeOnDemand: true,
|
||||
maxConcurrentRuns: 1,
|
||||
},
|
||||
},
|
||||
permissions: {},
|
||||
});
|
||||
|
||||
await db.insert(agentWakeupRequests).values({
|
||||
id: wakeupRequestId,
|
||||
companyId,
|
||||
agentId,
|
||||
source: "assignment",
|
||||
triggerDetail: "system",
|
||||
reason: "issue_assigned",
|
||||
payload: { issueId },
|
||||
status: "queued",
|
||||
runId,
|
||||
requestedAt: now,
|
||||
updatedAt: now,
|
||||
});
|
||||
|
||||
await db.insert(heartbeatRuns).values({
|
||||
id: runId,
|
||||
companyId,
|
||||
agentId,
|
||||
invocationSource: "assignment",
|
||||
triggerDetail: "system",
|
||||
status: "queued",
|
||||
wakeupRequestId,
|
||||
contextSnapshot: {
|
||||
issueId,
|
||||
taskId: issueId,
|
||||
wakeReason: "issue_assigned",
|
||||
},
|
||||
updatedAt: now,
|
||||
createdAt: now,
|
||||
});
|
||||
|
||||
await db.insert(issues).values({
|
||||
id: issueId,
|
||||
companyId,
|
||||
title: "Retry transient Codex failure without blocking",
|
||||
status: "in_progress",
|
||||
priority: "medium",
|
||||
assigneeAgentId: agentId,
|
||||
checkoutRunId: runId,
|
||||
executionRunId: runId,
|
||||
issueNumber: 1,
|
||||
identifier: `${issuePrefix}-1`,
|
||||
startedAt: now,
|
||||
});
|
||||
|
||||
return { companyId, agentId, runId, wakeupRequestId, issueId };
|
||||
}
|
||||
|
||||
it("keeps a local run active when the recorded pid is still alive", async () => {
|
||||
const child = spawnAliveProcess();
|
||||
childProcesses.add(child);
|
||||
@@ -547,8 +630,11 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
expect(issue?.executionRunId).toBe(retryRun?.id ?? null);
|
||||
});
|
||||
|
||||
it("does not queue a second retry after the first process-loss retry was already used", async () => {
|
||||
it("blocks the issue when process-loss retry is exhausted and the immediate continuation recovery also fails", async () => {
|
||||
mockAdapterExecute.mockRejectedValueOnce(new Error("continuation recovery failed"));
|
||||
|
||||
const { agentId, runId, issueId } = await seedRunFixture({
|
||||
agentStatus: "idle",
|
||||
processPid: 999_999_999,
|
||||
processLossRetryCount: 1,
|
||||
});
|
||||
@@ -562,16 +648,74 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
.select()
|
||||
.from(heartbeatRuns)
|
||||
.where(eq(heartbeatRuns.agentId, agentId));
|
||||
expect(runs).toHaveLength(1);
|
||||
expect(runs[0]?.status).toBe("failed");
|
||||
expect(runs).toHaveLength(2);
|
||||
expect(runs.find((row) => row.id === runId)?.status).toBe("failed");
|
||||
const continuationRun = runs.find((row) => row.id !== runId);
|
||||
expect(continuationRun?.contextSnapshot as Record<string, unknown> | undefined).toMatchObject({
|
||||
retryReason: "issue_continuation_needed",
|
||||
retryOfRunId: runId,
|
||||
});
|
||||
|
||||
const blockedIssue = await waitForValue(async () =>
|
||||
db.select().from(issues).where(eq(issues.id, issueId)).then((rows) => {
|
||||
const issue = rows[0] ?? null;
|
||||
return issue?.status === "blocked" ? issue : null;
|
||||
})
|
||||
);
|
||||
expect(blockedIssue?.status).toBe("blocked");
|
||||
expect(blockedIssue?.executionRunId).toBeNull();
|
||||
expect(blockedIssue?.checkoutRunId).toBe(continuationRun?.id ?? null);
|
||||
|
||||
const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
|
||||
expect(comments).toHaveLength(1);
|
||||
expect(comments[0]?.body).toContain("retried continuation");
|
||||
});
|
||||
|
||||
it("schedules a bounded retry for codex transient upstream failures instead of blocking the issue immediately", async () => {
|
||||
mockAdapterExecute.mockResolvedValueOnce({
|
||||
exitCode: 1,
|
||||
signal: null,
|
||||
timedOut: false,
|
||||
errorCode: "codex_transient_upstream",
|
||||
errorMessage:
|
||||
"Error running remote compact task: We're currently experiencing high demand, which may cause temporary errors.",
|
||||
provider: "openai",
|
||||
model: "gpt-5.4",
|
||||
});
|
||||
|
||||
const { agentId, runId, issueId } = await seedQueuedIssueRunFixture();
|
||||
const heartbeat = heartbeatService(db);
|
||||
|
||||
await heartbeat.resumeQueuedRuns();
|
||||
await waitForRunToSettle(heartbeat, runId);
|
||||
|
||||
const runs = await waitForValue(async () => {
|
||||
const rows = await db
|
||||
.select()
|
||||
.from(heartbeatRuns)
|
||||
.where(eq(heartbeatRuns.agentId, agentId));
|
||||
return rows.length >= 2 ? rows : null;
|
||||
});
|
||||
expect(runs).toHaveLength(2);
|
||||
|
||||
const failedRun = runs?.find((row) => row.id === runId);
|
||||
const retryRun = runs?.find((row) => row.id !== runId);
|
||||
expect(failedRun?.status).toBe("failed");
|
||||
expect(failedRun?.errorCode).toBe("codex_transient_upstream");
|
||||
expect(retryRun?.status).toBe("scheduled_retry");
|
||||
expect(retryRun?.scheduledRetryReason).toBe("transient_failure");
|
||||
expect((retryRun?.contextSnapshot as Record<string, unknown> | null)?.codexTransientFallbackMode).toBe("same_session");
|
||||
|
||||
const issue = await db
|
||||
.select()
|
||||
.from(issues)
|
||||
.where(eq(issues.id, issueId))
|
||||
.then((rows) => rows[0] ?? null);
|
||||
expect(issue?.executionRunId).toBeNull();
|
||||
expect(issue?.checkoutRunId).toBe(runId);
|
||||
expect(issue?.status).toBe("in_progress");
|
||||
expect(issue?.executionRunId).toBe(retryRun?.id ?? null);
|
||||
|
||||
const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, issueId));
|
||||
expect(comments).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("clears the detached warning when the run reports activity again", async () => {
|
||||
@@ -675,6 +819,107 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
expect(comments[0]?.body).toContain("Latest retry failure: `process_lost` - run failed before issue advanced.");
|
||||
});
|
||||
|
||||
it("assigns open unassigned blockers back to their creator agent", async () => {
|
||||
const companyId = randomUUID();
|
||||
const creatorAgentId = randomUUID();
|
||||
const blockedAssigneeAgentId = randomUUID();
|
||||
const blockerIssueId = randomUUID();
|
||||
const blockedIssueId = randomUUID();
|
||||
const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`;
|
||||
await db.insert(companies).values({
|
||||
id: companyId,
|
||||
name: "Paperclip",
|
||||
issuePrefix,
|
||||
requireBoardApprovalForNewAgents: false,
|
||||
});
|
||||
await db.insert(agents).values([
|
||||
{
|
||||
id: creatorAgentId,
|
||||
companyId,
|
||||
name: "SecurityEngineer",
|
||||
role: "engineer",
|
||||
status: "idle",
|
||||
adapterType: "codex_local",
|
||||
adapterConfig: {},
|
||||
runtimeConfig: {},
|
||||
permissions: {},
|
||||
},
|
||||
{
|
||||
id: blockedAssigneeAgentId,
|
||||
companyId,
|
||||
name: "CodexCoder",
|
||||
role: "engineer",
|
||||
status: "idle",
|
||||
adapterType: "codex_local",
|
||||
adapterConfig: {},
|
||||
runtimeConfig: {},
|
||||
permissions: {},
|
||||
},
|
||||
]);
|
||||
await db.insert(issues).values([
|
||||
{
|
||||
id: blockerIssueId,
|
||||
companyId,
|
||||
title: "Fix blocker",
|
||||
status: "todo",
|
||||
priority: "high",
|
||||
createdByAgentId: creatorAgentId,
|
||||
issueNumber: 1,
|
||||
identifier: `${issuePrefix}-1`,
|
||||
},
|
||||
{
|
||||
id: blockedIssueId,
|
||||
companyId,
|
||||
title: "Blocked work",
|
||||
status: "blocked",
|
||||
priority: "high",
|
||||
assigneeAgentId: blockedAssigneeAgentId,
|
||||
issueNumber: 2,
|
||||
identifier: `${issuePrefix}-2`,
|
||||
},
|
||||
]);
|
||||
await db.insert(issueRelations).values({
|
||||
companyId,
|
||||
issueId: blockerIssueId,
|
||||
relatedIssueId: blockedIssueId,
|
||||
type: "blocks",
|
||||
createdByAgentId: creatorAgentId,
|
||||
});
|
||||
const heartbeat = heartbeatService(db);
|
||||
|
||||
const result = await heartbeat.reconcileStrandedAssignedIssues();
|
||||
|
||||
expect(result.orphanBlockersAssigned).toBe(1);
|
||||
expect(result.issueIds).toContain(blockerIssueId);
|
||||
|
||||
const blocker = await db
|
||||
.select()
|
||||
.from(issues)
|
||||
.where(eq(issues.id, blockerIssueId))
|
||||
.then((rows) => rows[0] ?? null);
|
||||
expect(blocker?.assigneeAgentId).toBe(creatorAgentId);
|
||||
|
||||
const comments = await db.select().from(issueComments).where(eq(issueComments.issueId, blockerIssueId));
|
||||
expect(comments[0]?.body).toContain("Assigned Orphan Blocker");
|
||||
expect(comments[0]?.body).toContain(`[${issuePrefix}-2](/${issuePrefix}/issues/${issuePrefix}-2)`);
|
||||
|
||||
const wakeups = await db.select().from(agentWakeupRequests).where(eq(agentWakeupRequests.agentId, creatorAgentId));
|
||||
expect(wakeups).toEqual([
|
||||
expect.objectContaining({
|
||||
reason: "issue_assigned",
|
||||
payload: expect.objectContaining({
|
||||
issueId: blockerIssueId,
|
||||
mutation: "unassigned_blocker_recovery",
|
||||
}),
|
||||
}),
|
||||
]);
|
||||
|
||||
const runId = wakeups[0]?.runId;
|
||||
if (runId) {
|
||||
await waitForRunToSettle(heartbeat, runId);
|
||||
}
|
||||
});
|
||||
|
||||
it("re-enqueues continuation for stranded in-progress work with no active run", async () => {
|
||||
const { agentId, issueId, runId } = await seedStrandedIssueFixture({
|
||||
status: "in_progress",
|
||||
@@ -851,7 +1096,6 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
||||
const wakes = await db.select().from(agentWakeupRequests).where(eq(agentWakeupRequests.agentId, agentId));
|
||||
expect(wakes.some((row) => row.reason === "run_liveness_continuation")).toBe(false);
|
||||
});
|
||||
|
||||
it("blocks stranded in-progress work after the continuation retry was already used", async () => {
|
||||
const { issueId } = await seedStrandedIssueFixture({
|
||||
status: "in_progress",
|
||||
|
||||
Reference in New Issue
Block a user