[codex] Add configurable liveness auto-recovery controls (#4587)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Heartbeat liveness recovery decides when stalled issue trees need
manager-visible follow-up.
> - Automatic recovery issue creation is useful, but operators need
instance-level controls for how aggressive it is.
> - Without controls, recovery behavior is harder to tune for local
development, production operations, and noisy edge cases.
> - This pull request adds configurable liveness auto-recovery settings
across shared contracts, API routes, services, and the instance
experimental settings UI.
> - The benefit is that operators can keep liveness findings advisory or
enable bounded recovery automation with explicit intervals and lookback
windows.

## What Changed

- Added shared types and validators for liveness auto-recovery settings.
- Extended instance settings routes and services to persist and validate
the new controls.
- Wired heartbeat/recovery services to honor enablement, minimum
interval, and lookback settings.
- Added UI controls for liveness recovery under instance experimental
settings.
- Covered the new server behavior with instance settings and liveness
escalation tests.

## Verification

- `pnpm exec vitest run --project @paperclipai/server
server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts
server/src/__tests__/instance-settings-routes.test.ts --pool=forks
--poolOptions.forks.isolate=true`
- `pnpm --filter @paperclipai/shared typecheck`
- `pnpm --filter @paperclipai/server typecheck`
- `pnpm --filter @paperclipai/ui typecheck`

## Risks

- Moderate behavioral risk because recovery automation timing changes
when enabled; defaults keep existing advisory behavior unless the
setting is turned on.
- No database migration in this PR; settings are stored through the
existing instance settings path.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and
discuss it in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected — check the roadmap
first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, `gpt-5`, coding model with tool use and local command
execution; context window not exposed by the runtime.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Dotta
2026-04-27 08:46:44 -05:00
committed by GitHub
parent f0f9460d1d
commit fda296ee4f
14 changed files with 679 additions and 54 deletions
+11 -1
View File
@@ -4502,7 +4502,15 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {})
return recovery.buildRunOutputSilence(run, now);
}
async function reconcileIssueGraphLiveness(opts?: { runId?: string | null }) {
/**
 * Heartbeat-service entry point for the liveness auto-recovery preview.
 *
 * Passes `opts` through untouched to
 * `recovery.buildIssueGraphLivenessAutoRecoveryPreview` and returns whatever
 * that call resolves to; no additional logic lives at this layer.
 *
 * @param opts Optional `lookbackHours` and `now` overrides, forwarded as-is.
 */
async function buildIssueGraphLivenessAutoRecoveryPreview(opts?: { lookbackHours?: number; now?: Date }) {
return recovery.buildIssueGraphLivenessAutoRecoveryPreview(opts);
}
/**
 * Heartbeat-service entry point for issue-graph liveness reconciliation.
 *
 * Passes `opts` through untouched to `recovery.reconcileIssueGraphLiveness`
 * and returns whatever that call resolves to; no additional logic lives at
 * this layer.
 *
 * @param opts Optional `runId`, `force`, and `lookbackHours`, forwarded as-is.
 */
async function reconcileIssueGraphLiveness(opts?: {
runId?: string | null;
force?: boolean;
lookbackHours?: number;
}) {
return recovery.reconcileIssueGraphLiveness(opts);
}
@@ -7477,6 +7485,8 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {})
reconcileStrandedAssignedIssues,
buildIssueGraphLivenessAutoRecoveryPreview,
reconcileIssueGraphLiveness,
scanSilentActiveRuns,
+6
View File
@@ -3,6 +3,7 @@ import { companies, instanceSettings } from "@paperclipai/db";
import {
DEFAULT_FEEDBACK_DATA_SHARING_PREFERENCE,
DEFAULT_BACKUP_RETENTION,
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
instanceGeneralSettingsSchema,
type InstanceGeneralSettings,
instanceExperimentalSettingsSchema,
@@ -42,6 +43,9 @@ function normalizeExperimentalSettings(raw: unknown): InstanceExperimentalSettin
enableIsolatedWorkspaces: parsed.data.enableIsolatedWorkspaces ?? false,
autoRestartDevServerWhenIdle: parsed.data.autoRestartDevServerWhenIdle ?? false,
enableIssueGraphLivenessAutoRecovery: parsed.data.enableIssueGraphLivenessAutoRecovery ?? false,
issueGraphLivenessAutoRecoveryLookbackHours:
parsed.data.issueGraphLivenessAutoRecoveryLookbackHours ??
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
};
}
return {
@@ -49,6 +53,8 @@ function normalizeExperimentalSettings(raw: unknown): InstanceExperimentalSettin
enableIsolatedWorkspaces: false,
autoRestartDevServerWhenIdle: false,
enableIssueGraphLivenessAutoRecovery: false,
issueGraphLivenessAutoRecoveryLookbackHours:
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
};
}
+132 -17
View File
@@ -1,5 +1,12 @@
import { and, asc, desc, eq, gt, inArray, isNull, notInArray, sql } from "drizzle-orm";
import type { Db } from "@paperclipai/db";
import {
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
type IssueGraphLivenessAutoRecoveryPreview,
type IssueGraphLivenessAutoRecoveryPreviewItem,
} from "@paperclipai/shared";
import {
agents,
agentWakeupRequests,
@@ -38,7 +45,6 @@ import { isAutomaticRecoverySuppressedByPauseHold } from "./pause-hold-guard.js"
const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
const ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS = 24 * 60 * 60 * 1000;
export const ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS = 60 * 60 * 1000;
export const ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS = 4 * 60 * 60 * 1000;
export const ACTIVE_RUN_OUTPUT_CONTINUE_REARM_MS = 30 * 60 * 1000;
@@ -1857,18 +1863,115 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
return result;
}
async function isLivenessFindingOldEnoughForAutoRecovery(finding: IssueLivenessFinding, now = new Date()) {
const issueIds = [...new Set(finding.dependencyPath.map((entry) => entry.issueId))];
if (issueIds.length === 0) return false;
/**
 * Turns a raw lookback value into a whole number of hours inside the
 * supported range.
 *
 * The value is first read through `asNumber`, with the default lookback
 * supplied as its fallback argument, then rounded down and clamped between
 * the MIN and MAX lookback constants.
 *
 * @param raw Untrusted lookback value (setting or caller override).
 * @returns The clamped lookback in hours.
 */
function normalizeIssueGraphLivenessAutoRecoveryLookbackHours(raw: unknown) {
  const requestedHours = asNumber(raw, DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS);
  const wholeHours = Math.floor(requestedHours);
  // Raise to the lower bound first, then cap at the upper bound.
  const atLeastMinimum = Math.max(MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, wholeHours);
  return Math.min(MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, atLeastMinimum);
}
/**
 * Builds the lookup key used for per-issue timestamps: the company id and
 * the issue id joined by a single colon.
 *
 * @param companyId Company that owns the issue.
 * @param issueId Issue identifier.
 * @returns `"<companyId>:<issueId>"`.
 */
function livenessDependencyIssueKey(companyId: string, issueId: string) {
  const separator = ":";
  return companyId + separator + issueId;
}
async function loadLivenessDependencyUpdatedAtByIssue(findings: IssueLivenessFinding[]) {
const issueIds = [
...new Set(
findings.flatMap((finding) => finding.dependencyPath.map((entry) => entry.issueId)),
),
];
if (issueIds.length === 0) return new Map<string, Date>();
const rows = await db
.select({ id: issues.id, updatedAt: issues.updatedAt })
.select({ id: issues.id, companyId: issues.companyId, updatedAt: issues.updatedAt })
.from(issues)
.where(and(eq(issues.companyId, finding.companyId), inArray(issues.id, issueIds)));
if (rows.length !== issueIds.length) return false;
const latestUpdatedAt = rows.reduce((latest, row) =>
row.updatedAt > latest ? row.updatedAt : latest,
rows[0]!.updatedAt);
return now.getTime() - latestUpdatedAt.getTime() >= ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS;
.where(inArray(issues.id, issueIds));
return new Map(rows.map((row) => [
livenessDependencyIssueKey(row.companyId, row.id),
row.updatedAt,
]));
}
/**
 * Finds the most recent `updatedAt` among the issues on a finding's
 * dependency path.
 *
 * @param finding Liveness finding whose `dependencyPath` issue ids are inspected.
 * @param updatedAtByIssueKey Timestamps keyed by `livenessDependencyIssueKey(companyId, issueId)`.
 * @returns The newest timestamp, or `null` when the path is empty or any
 *   issue on the path has no timestamp in the map.
 */
function latestDependencyUpdatedAtForLivenessFinding(
  finding: IssueLivenessFinding,
  updatedAtByIssueKey: Map<string, Date>,
) {
  const uniqueIssueIds = new Set(finding.dependencyPath.map((entry) => entry.issueId));
  if (uniqueIssueIds.size === 0) return null;

  let newest: Date | null = null;
  for (const issueId of uniqueIssueIds) {
    const updatedAt = updatedAtByIssueKey.get(
      livenessDependencyIssueKey(finding.companyId, issueId),
    );
    // A single missing timestamp makes the whole finding undatable.
    if (!updatedAt) return null;
    if (newest === null || updatedAt > newest) {
      newest = updatedAt;
    }
  }
  return newest;
}
/**
 * Reports whether a finding's dependency path was touched at or after the
 * lookback cutoff.
 *
 * @param finding Liveness finding to test.
 * @param cutoff Oldest timestamp still considered inside the lookback window.
 * @param updatedAtByIssueKey Timestamps keyed by company/issue key.
 * @returns `true` when the newest dependency timestamp is known and is not
 *   older than `cutoff`; `false` otherwise.
 */
function isLivenessFindingInsideAutoRecoveryLookback(
  finding: IssueLivenessFinding,
  cutoff: Date,
  updatedAtByIssueKey: Map<string, Date>,
) {
  const newestUpdate = latestDependencyUpdatedAtForLivenessFinding(finding, updatedAtByIssueKey);
  if (!newestUpdate) return false;
  return newestUpdate >= cutoff;
}
/**
 * Produces a read-only preview of the liveness findings that fall inside the
 * auto-recovery lookback window.
 *
 * Collects the current findings, loads the `updatedAt` timestamps for every
 * issue on their dependency paths, looks up identifier/title for each
 * finding's recovery issue, and reports one item per finding whose newest
 * dependency timestamp is at or after the cutoff. Findings with an unknown
 * or older timestamp are only counted in `skippedOutsideLookback`.
 *
 * @param opts.lookbackHours Requested window in hours; normalized before use.
 * @param opts.now Reference time for the window; defaults to the current time.
 * @returns Window metadata, counts, and the per-finding preview items.
 */
async function buildIssueGraphLivenessAutoRecoveryPreview(
  opts?: { lookbackHours?: number; now?: Date },
): Promise<IssueGraphLivenessAutoRecoveryPreview> {
  const generatedAt = opts?.now ?? new Date();
  const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours(opts?.lookbackHours);
  const cutoff = new Date(generatedAt.getTime() - lookbackHours * 60 * 60 * 1000);

  const findings = await collectIssueGraphLivenessFindings();
  const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings);

  // Resolve display fields for the recovery issues; skip the query when there is nothing to look up.
  const recoveryIssueIds = [...new Set(findings.map((finding) => finding.recoveryIssueId))];
  const recoveryRows = recoveryIssueIds.length > 0
    ? await db
      .select({ id: issues.id, identifier: issues.identifier, title: issues.title })
      .from(issues)
      .where(inArray(issues.id, recoveryIssueIds))
    : [];
  const recoveryById = new Map(recoveryRows.map((row) => [row.id, row]));

  const items = findings.flatMap((finding): IssueGraphLivenessAutoRecoveryPreviewItem[] => {
    const newestUpdate = latestDependencyUpdatedAtForLivenessFinding(finding, updatedAtByIssueKey);
    // Unknown or pre-cutoff timestamps contribute no item.
    if (!newestUpdate || newestUpdate < cutoff) return [];

    const recoveryIssue = recoveryById.get(finding.recoveryIssueId);
    return [{
      issueId: finding.issueId,
      identifier: finding.identifier,
      title: finding.dependencyPath[0]?.title ?? finding.identifier ?? finding.issueId,
      state: finding.state,
      severity: finding.severity,
      reason: finding.reason,
      recoveryIssueId: finding.recoveryIssueId,
      recoveryIdentifier: recoveryIssue?.identifier ?? null,
      recoveryTitle: recoveryIssue?.title ?? null,
      recommendedOwnerAgentId: finding.recommendedOwnerAgentId,
      incidentKey: finding.incidentKey,
      latestDependencyUpdatedAt: newestUpdate.toISOString(),
      dependencyPath: finding.dependencyPath,
    }];
  });

  // Every finding either produced exactly one item or was skipped.
  const skippedOutsideLookback = findings.length - items.length;

  return {
    lookbackHours,
    cutoff: cutoff.toISOString(),
    generatedAt: generatedAt.toISOString(),
    findings: findings.length,
    recoverableFindings: items.length,
    skippedOutsideLookback,
    items,
  };
}
async function resolveEscalationOwnerAgentId(
@@ -2131,22 +2234,34 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
return { kind: "created" as const, escalationIssueId: escalation.id };
}
async function reconcileIssueGraphLiveness(opts?: { runId?: string | null }) {
async function reconcileIssueGraphLiveness(opts?: {
runId?: string | null;
force?: boolean;
lookbackHours?: number;
}) {
const findings = await collectIssueGraphLivenessFindings();
const experimentalSettings = await instanceSettings.getExperimental();
const autoRecoveryEnabled = asBoolean(
experimentalSettings.enableIssueGraphLivenessAutoRecovery,
false,
true,
) || opts?.force === true;
const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours(
opts?.lookbackHours ?? experimentalSettings.issueGraphLivenessAutoRecoveryLookbackHours,
);
const now = new Date();
const cutoff = new Date(now.getTime() - lookbackHours * 60 * 60 * 1000);
const obsoleteRecoveryCleanup = await retireObsoleteLivenessRecoveryIssues(findings);
const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings);
const result = {
findings: findings.length,
autoRecoveryEnabled,
lookbackHours,
cutoff: cutoff.toISOString(),
escalationsCreated: 0,
existingEscalations: 0,
skipped: 0,
skippedAutoRecoveryDisabled: 0,
skippedAutoRecoveryTooYoung: 0,
skippedOutsideLookback: 0,
obsoleteRecoveriesRetired: obsoleteRecoveryCleanup.retired,
obsoleteRecoveriesActiveSkipped: obsoleteRecoveryCleanup.activeSkipped,
obsoleteRecoveryBlockerRelationsRemoved: obsoleteRecoveryCleanup.blockerRelationsRemoved,
@@ -2160,10 +2275,9 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
return result;
}
const now = new Date();
for (const finding of findings) {
if (!await isLivenessFindingOldEnoughForAutoRecovery(finding, now)) {
result.skippedAutoRecoveryTooYoung += 1;
if (!isLivenessFindingInsideAutoRecoveryLookback(finding, cutoff, updatedAtByIssueKey)) {
result.skippedOutsideLookback += 1;
result.skipped += 1;
continue;
}
@@ -2197,6 +2311,7 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
recordWatchdogDecision,
scanSilentActiveRuns,
reconcileStrandedAssignedIssues,
buildIssueGraphLivenessAutoRecoveryPreview,
reconcileIssueGraphLiveness,
readRecoveryTimerIntervalMs,
};