Files
paperclip/server/src/services/recovery/run-liveness-continuations.ts
T
Dotta 454edfe81e Add recovery handoff system notices (#5289)
## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Agent runs can end productively while the source issue still lacks a
durable final disposition.
> - That leaves the control plane unsure whether to resume, escalate, or
close the work.
> - Issue comments also need a presentation contract so system-authored
recovery notices can render as first-class thread messages without
overloading normal comments.
> - This pull request adds successful-run handoff recovery, comment
presentation metadata, and system notice rendering.
> - The benefit is stricter task liveness with clearer operator-facing
recovery state.

## What Changed

- Added successful-run handoff decisions, wake payloads, escalation
behavior, and recovery tests.
- Added issue comment presentation metadata with migration
`0078_white_darwin.sql` and shared/server/company portability support.
- Rendered recovery/system notices in issue chat with dedicated UI
components, fixtures, tests, and storybook/lab coverage.
- Included the current recovery model-profile hint patch so automatic
recovery follow-ups use the cheap profile.

## Verification

- `pnpm install --frozen-lockfile`
- `pnpm exec vitest run
server/src/services/recovery/successful-run-handoff.test.ts
ui/src/components/SystemNotice.test.tsx
ui/src/lib/system-notice-comment.test.ts
ui/src/components/IssueChatThreadSystemNotice.test.tsx`

## Risks

- Migration-bearing PR: merge this before any other branch that might
later add a migration.
- The branch touches both recovery services and issue-thread rendering,
so review should pay attention to recovery wake idempotency and comment
metadata compatibility.

## Model Used

- OpenAI GPT-5 Codex via Paperclip `codex_local` adapter, with
shell/git/GitHub CLI tool use.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
2026-05-06 06:05:58 -05:00

190 lines
6.4 KiB
TypeScript

import { and, eq, inArray } from "drizzle-orm";
import type { Db } from "@paperclipai/db";
import { agentWakeupRequests, agents, heartbeatRuns, issues } from "@paperclipai/db";
import type { RunLivenessState } from "@paperclipai/shared";
import { withRecoveryModelProfileHint } from "./model-profile-hint.js";
import { RECOVERY_REASON_KINDS } from "./origins.js";
/**
 * Reason identifier for liveness continuations. In this file it is the first
 * segment of the continuation idempotency key and the `wakeReason` written
 * into the enqueue context snapshot.
 */
export const RUN_LIVENESS_CONTINUATION_REASON = RECOVERY_REASON_KINDS.runLivenessContinuation;
/**
 * Attempt ceiling used by `decideRunLivenessContinuation` when the caller
 * does not pass `maxAttempts`.
 */
export const DEFAULT_MAX_LIVENESS_CONTINUATION_ATTEMPTS = 2;
/** Liveness states that may trigger a continuation; any other state (or null) is skipped. */
const ACTIONABLE_LIVENESS_STATES = new Set<RunLivenessState>(["plan_only", "empty_response"]);
/** Issue statuses for which a continuation may still be enqueued. */
const CONTINUATION_ACTIVE_ISSUE_STATUSES = new Set(["todo", "in_progress"]);
// A prior adapter error should not permanently suppress bounded liveness
// continuations; the max-attempt/idempotency guards prevent unbounded retries.
/** Agent statuses treated as invokable for a continuation wake. */
const CONTINUATION_AGENT_STATUSES = new Set(["active", "idle", "running", "error"]);
/**
 * Wakeup-request statuses that count as "a wake already exists" when
 * `findExistingRunLivenessContinuationWake` looks up an idempotency key.
 */
const IDEMPOTENT_WAKE_STATUSES = ["queued", "deferred_issue_execution", "completed"];
/** Row type inferred from a select on the `heartbeatRuns` table. */
type HeartbeatRunRow = typeof heartbeatRuns.$inferSelect;
/**
 * Narrowed issue row: only the listed columns are required from callers.
 * The decision logic in this file reads `id`, `companyId`, `status`,
 * `assigneeAgentId` and `executionState`; the remaining columns are not read
 * here (presumably consumed by callers — verify before trimming).
 */
type IssueRow = Pick<
typeof issues.$inferSelect,
"id" | "companyId" | "identifier" | "title" | "status" | "assigneeAgentId" | "executionState" | "projectId"
>;
/** Narrowed agent row: identity, owning company, and current status. */
type AgentRow = Pick<typeof agents.$inferSelect, "id" | "companyId" | "status">;
/**
 * Outcome of `decideRunLivenessContinuation`, discriminated by `kind`.
 */
export type RunContinuationDecision =
// A continuation wake should be created. `nextAttempt` is the source run's
// attempt count plus one; `idempotencyKey` identifies this
// (issue, source run, liveness state, attempt) combination.
| {
kind: "enqueue";
nextAttempt: number;
idempotencyKey: string;
payload: Record<string, unknown>;
contextSnapshot: Record<string, unknown>;
}
// The source run already used `maxAttempts` (or more) continuation attempts.
// `comment` is a multi-line, human-readable summary of the exhaustion.
| {
kind: "exhausted";
attempt: number;
maxAttempts: number;
comment: string;
}
// No continuation should be attempted; `reason` explains which guard failed.
| {
kind: "skip";
reason: string;
};
/**
 * Normalizes a loosely-typed continuation-attempt value to a non-negative
 * integer.
 *
 * Numbers are used as-is; anything else is stringified (null/undefined become
 * the empty string) and parsed as a base-10 integer.
 *
 * @param value - Raw attempt value of unknown type.
 * @returns The value rounded down to an integer, or 0 when it is not a finite
 *   number greater than zero.
 */
export function readContinuationAttempt(value: unknown): number {
  let parsed: number;
  if (typeof value === "number") {
    parsed = value;
  } else {
    parsed = Number.parseInt(String(value ?? ""), 10);
  }

  // NaN, +/-Infinity, zero and negatives all collapse to "no attempts yet".
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return 0;
  }
  return Math.floor(parsed);
}
/**
 * Builds the idempotency key for a liveness continuation wake.
 *
 * The key is the colon-joined sequence: continuation reason, issue id, source
 * run id, liveness state, and the attempt number rendered as a string.
 *
 * @param input - Identifiers of the issue, source run, liveness state and the
 *   attempt about to be enqueued.
 * @returns The colon-delimited key string.
 */
export function buildRunLivenessContinuationIdempotencyKey(input: {
  issueId: string;
  sourceRunId: string;
  livenessState: RunLivenessState;
  nextAttempt: number;
}) {
  const { issueId, sourceRunId, livenessState, nextAttempt } = input;
  const segments = [
    RUN_LIVENESS_CONTINUATION_REASON,
    issueId,
    sourceRunId,
    livenessState,
    String(nextAttempt),
  ];
  return segments.join(":");
}
/**
 * Looks up a wakeup request that already carries the given idempotency key.
 *
 * Only rows for the given company whose status is one of
 * `IDEMPOTENT_WAKE_STATUSES` are considered, and at most one row is fetched.
 *
 * @param db - Database handle used to run the query.
 * @param input - Company scope and the idempotency key to search for.
 * @returns The matching row's `id` and `status`, or `null` when none exists.
 */
export async function findExistingRunLivenessContinuationWake(
  db: Db,
  input: {
    companyId: string;
    idempotencyKey: string;
  },
) {
  const inCompany = eq(agentWakeupRequests.companyId, input.companyId);
  const hasKey = eq(agentWakeupRequests.idempotencyKey, input.idempotencyKey);
  const hasBlockingStatus = inArray(agentWakeupRequests.status, IDEMPOTENT_WAKE_STATUSES);

  const rows = await db
    .select({ id: agentWakeupRequests.id, status: agentWakeupRequests.status })
    .from(agentWakeupRequests)
    .where(and(inCompany, hasKey, hasBlockingStatus))
    .limit(1);

  return rows[0] ?? null;
}
/**
 * Decides whether a finished run should be followed by a bounded liveness
 * continuation wake.
 *
 * Checks run in a fixed order and the first failing one determines the
 * result:
 *  1. eligibility guards (actionable liveness state, issue/agent present,
 *     company scope, assignment, issue status, execution state, agent status,
 *     budget) produce `skip`;
 *  2. an attempt count at or above the limit produces `exhausted`;
 *  3. an already-existing wake for this attempt produces `skip`;
 *  4. otherwise an `enqueue` decision is built.
 *
 * This function performs no I/O itself; the caller supplies `budgetBlocked`
 * and `idempotentWakeExists`.
 *
 * @param input - Source run, its issue and agent (either may be null), the
 *   liveness classification, an optional next-action instruction, the
 *   precomputed budget/idempotency flags, and an optional attempt limit that
 *   defaults to `DEFAULT_MAX_LIVENESS_CONTINUATION_ATTEMPTS`.
 * @returns The continuation decision.
 */
export function decideRunLivenessContinuation(input: {
  run: HeartbeatRunRow;
  issue: IssueRow | null;
  agent: AgentRow | null;
  livenessState: RunLivenessState | null;
  livenessReason: string | null;
  nextAction: string | null;
  budgetBlocked: boolean;
  idempotentWakeExists: boolean;
  maxAttempts?: number;
}): RunContinuationDecision {
  const skip = (reason: string): RunContinuationDecision => ({ kind: "skip", reason });

  const { run: sourceRun, issue: targetIssue, agent: targetAgent, livenessState: state } = input;
  const attemptLimit = input.maxAttempts ?? DEFAULT_MAX_LIVENESS_CONTINUATION_ATTEMPTS;

  // Eligibility guards: order matters because the first failure is reported.
  if (!state || !ACTIONABLE_LIVENESS_STATES.has(state)) {
    return skip("liveness state is not actionable for continuation");
  }
  if (!targetIssue) {
    return skip("issue not found");
  }
  if (!targetAgent) {
    return skip("agent not found");
  }
  if (targetIssue.companyId !== sourceRun.companyId || targetAgent.companyId !== sourceRun.companyId) {
    return skip("company scope mismatch");
  }
  if (targetIssue.assigneeAgentId !== sourceRun.agentId) {
    return skip("issue is no longer assigned to the source run agent");
  }
  if (!CONTINUATION_ACTIVE_ISSUE_STATUSES.has(targetIssue.status)) {
    return skip(`issue status ${targetIssue.status} is not continuable`);
  }
  if (targetIssue.executionState) {
    return skip("issue is blocked by execution policy state");
  }
  if (!CONTINUATION_AGENT_STATUSES.has(targetAgent.status)) {
    return skip(`agent status ${targetAgent.status} is not invokable`);
  }
  if (input.budgetBlocked) {
    return skip("budget hard stop blocks continuation");
  }

  // Attempt budget: once the limit is reached, report exhaustion instead of retrying.
  const attemptsUsed = readContinuationAttempt(sourceRun.continuationAttempt);
  if (attemptsUsed >= attemptLimit) {
    const commentLines = [
      "Bounded liveness continuation exhausted",
      "",
      `- Last liveness state: \`${state}\``,
      `- Attempts used: ${attemptsUsed}/${attemptLimit}`,
      `- Reason: ${input.livenessReason ?? "Run ended without concrete progress"}`,
      "- Next action: a human or manager should inspect the run and either clarify the task, mark it blocked, or assign a concrete follow-up.",
    ];
    return {
      kind: "exhausted",
      attempt: attemptsUsed,
      maxAttempts: attemptLimit,
      comment: commentLines.join("\n"),
    };
  }

  // A wake for this source run and attempt was already recorded by the caller's lookup.
  if (input.idempotentWakeExists) {
    return skip("continuation wake already exists for this source run and attempt");
  }

  const nextAttempt = attemptsUsed + 1;
  const idempotencyKey = buildRunLivenessContinuationIdempotencyKey({
    issueId: targetIssue.id,
    sourceRunId: sourceRun.id,
    livenessState: state,
    nextAttempt,
  });

  const payload = withRecoveryModelProfileHint({
    issueId: targetIssue.id,
    sourceRunId: sourceRun.id,
    livenessState: state,
    livenessReason: input.livenessReason,
    continuationAttempt: nextAttempt,
    maxContinuationAttempts: attemptLimit,
    instruction:
      input.nextAction ??
      "The previous run ended without concrete progress. Take the first concrete action now or mark the issue blocked with a specific unblock request.",
  });

  // The snapshot reuses the instruction from the payload so both stay in sync.
  const contextSnapshot = withRecoveryModelProfileHint({
    issueId: targetIssue.id,
    taskId: targetIssue.id,
    taskKey: targetIssue.id,
    wakeReason: RUN_LIVENESS_CONTINUATION_REASON,
    livenessContinuationAttempt: nextAttempt,
    livenessContinuationMaxAttempts: attemptLimit,
    livenessContinuationSourceRunId: sourceRun.id,
    livenessContinuationState: state,
    livenessContinuationReason: input.livenessReason,
    livenessContinuationInstruction: payload.instruction,
  });

  return {
    kind: "enqueue",
    nextAttempt,
    idempotencyKey,
    payload,
    contextSnapshot,
  };
}