From fda296ee4f20768076eabc2f43c50fa6c6c5072c Mon Sep 17 00:00:00 2001
From: Dotta <34892728+cryppadotta@users.noreply.github.com>
Date: Mon, 27 Apr 2026 08:46:44 -0500
Subject: [PATCH] [codex] Add configurable liveness auto-recovery controls (#4587)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Heartbeat liveness recovery decides when stalled issue trees need manager-visible follow-up.
> - Automatic recovery issue creation is useful, but operators need instance-level controls for how aggressive it is.
> - Without controls, recovery behavior is harder to tune for local development, production operations, and noisy edge cases.
> - This pull request adds configurable liveness auto-recovery settings across shared contracts, API routes, services, and the instance experimental settings UI.
> - The benefit is that operators can keep liveness findings advisory or enable bounded recovery automation with explicit intervals and lookback windows.

## What Changed

- Added shared types and validators for liveness auto-recovery settings.
- Extended instance settings routes and services to persist and validate the new controls.
- Wired heartbeat/recovery services to honor enablement, minimum interval, and lookback settings.
- Added UI controls for liveness recovery under instance experimental settings.
- Covered the new server behavior with instance settings and liveness escalation tests.

## Verification

- `pnpm exec vitest run --project @paperclipai/server server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts server/src/__tests__/instance-settings-routes.test.ts --pool=forks --poolOptions.forks.isolate=true`
- `pnpm --filter @paperclipai/shared typecheck`
- `pnpm --filter @paperclipai/server typecheck`
- `pnpm --filter @paperclipai/ui typecheck`

## Risks

- Moderate behavioral risk because recovery automation timing changes when enabled; defaults keep existing advisory behavior unless the setting is turned on.
- No database migration in this PR; settings are stored through the existing instance settings path.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, `gpt-5`, coding model with tool use and local command execution; context window not exposed by the runtime.

## Checklist - [x] I have included a thinking path that traces from project context to this change - [x] I have specified the model used (with version and capability details) - [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work - [x] I have run tests locally and they pass - [x] I have added or updated tests where applicable - [ ] If this change affects the UI, I have included before/after screenshots - [x] I have updated relevant documentation to reflect my changes - [x] I have considered and documented any risks above - [x] I will address all Greptile and reviewer comments before requesting merge --------- Co-authored-by: Paperclip --- doc/execution-semantics.md | 2 + packages/shared/src/index.ts | 7 + packages/shared/src/types/index.ts | 19 +- packages/shared/src/types/instance.ts | 35 +++ packages/shared/src/validators/index.ts | 2 + packages/shared/src/validators/instance.ts | 21 ++ ...eartbeat-issue-liveness-escalation.test.ts | 22 +- .../instance-settings-routes.test.ts | 83 ++++- server/src/routes/instance-settings.ts | 57 +++- server/src/services/heartbeat.ts | 12 +- server/src/services/instance-settings.ts | 6 + server/src/services/recovery/service.ts | 149 ++++++++- ui/src/api/instanceSettings.ts | 22 ++ ui/src/pages/InstanceExperimentalSettings.tsx | 296 ++++++++++++++++-- 14 files changed, 679 insertions(+), 54 deletions(-) diff --git a/doc/execution-semantics.md b/doc/execution-semantics.md index 869d4bc8..b014253b 100644 --- a/doc/execution-semantics.md +++ b/doc/execution-semantics.md @@ -326,6 +326,8 @@ Examples: The source issue remains visible and blocked on the recovery issue when blocking is necessary for correctness. The recovery owner must restore a live path, resolve the source issue manually, or record the reason it is a false positive. +Instance-level issue-graph liveness auto-recovery is disabled by default. 
When enabled, its lookback window means "dependency paths updated within the last N hours"; older findings remain advisory and are counted as outside the configured lookback instead of creating recovery issues automatically. This is an operator noise control, not the older staleness delay for determining whether a chain is old enough to surface. + ### Human Escalation Human escalation is required when the next safe action depends on board judgment, budget/approval policy, or information unavailable to the control plane. diff --git a/packages/shared/src/index.ts b/packages/shared/src/index.ts index 2814e410..237d8c06 100644 --- a/packages/shared/src/index.ts +++ b/packages/shared/src/index.ts @@ -264,6 +264,8 @@ export type { InstanceExperimentalSettings, InstanceGeneralSettings, InstanceSettings, + IssueGraphLivenessAutoRecoveryPreview, + IssueGraphLivenessAutoRecoveryPreviewItem, BackupRetentionPolicy, Agent, AgentAccessState, @@ -548,6 +550,9 @@ export { WEEKLY_RETENTION_PRESETS, MONTHLY_RETENTION_PRESETS, DEFAULT_BACKUP_RETENTION, + DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, } from "./types/instance.js"; export { @@ -561,7 +566,9 @@ export { type PatchInstanceGeneralSettings, instanceExperimentalSettingsSchema, patchInstanceExperimentalSettingsSchema, + issueGraphLivenessAutoRecoveryRequestSchema, type PatchInstanceExperimentalSettings, + type IssueGraphLivenessAutoRecoveryRequest, } from "./validators/index.js"; export { diff --git a/packages/shared/src/types/index.ts b/packages/shared/src/types/index.ts index fd4de78f..54e19f38 100644 --- a/packages/shared/src/types/index.ts +++ b/packages/shared/src/types/index.ts @@ -23,8 +23,23 @@ export type { FeedbackTraceBundleFile, FeedbackTraceBundle, } from "./feedback.js"; -export type { InstanceExperimentalSettings, InstanceGeneralSettings, InstanceSettings, BackupRetentionPolicy } from 
"./instance.js"; -export { DAILY_RETENTION_PRESETS, WEEKLY_RETENTION_PRESETS, MONTHLY_RETENTION_PRESETS, DEFAULT_BACKUP_RETENTION } from "./instance.js"; +export type { + InstanceExperimentalSettings, + InstanceGeneralSettings, + InstanceSettings, + BackupRetentionPolicy, + IssueGraphLivenessAutoRecoveryPreview, + IssueGraphLivenessAutoRecoveryPreviewItem, +} from "./instance.js"; +export { + DAILY_RETENTION_PRESETS, + WEEKLY_RETENTION_PRESETS, + MONTHLY_RETENTION_PRESETS, + DEFAULT_BACKUP_RETENTION, + DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, +} from "./instance.js"; export type { CompanySkillSourceType, CompanySkillTrustLevel, diff --git a/packages/shared/src/types/instance.ts b/packages/shared/src/types/instance.ts index 73e690ef..ee6a6553 100644 --- a/packages/shared/src/types/instance.ts +++ b/packages/shared/src/types/instance.ts @@ -3,6 +3,9 @@ import type { FeedbackDataSharingPreference } from "./feedback.js"; export const DAILY_RETENTION_PRESETS = [3, 7, 14] as const; export const WEEKLY_RETENTION_PRESETS = [1, 2, 4] as const; export const MONTHLY_RETENTION_PRESETS = [1, 3, 6] as const; +export const DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS = 24; +export const MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS = 1; +export const MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS = 24 * 30; export interface BackupRetentionPolicy { dailyDays: (typeof DAILY_RETENTION_PRESETS)[number]; @@ -28,6 +31,7 @@ export interface InstanceExperimentalSettings { enableIsolatedWorkspaces: boolean; autoRestartDevServerWhenIdle: boolean; enableIssueGraphLivenessAutoRecovery: boolean; + issueGraphLivenessAutoRecoveryLookbackHours: number; } export interface InstanceSettings { @@ -37,3 +41,34 @@ export interface InstanceSettings { createdAt: Date; updatedAt: Date; } + +export interface 
IssueGraphLivenessAutoRecoveryPreviewItem { + issueId: string; + identifier: string | null; + title: string; + state: string; + severity: string; + reason: string; + recoveryIssueId: string; + recoveryIdentifier: string | null; + recoveryTitle: string | null; + recommendedOwnerAgentId: string | null; + incidentKey: string; + latestDependencyUpdatedAt: string; + dependencyPath: Array<{ + issueId: string; + identifier: string | null; + title: string; + status: string; + }>; +} + +export interface IssueGraphLivenessAutoRecoveryPreview { + lookbackHours: number; + cutoff: string; + generatedAt: string; + findings: number; + recoverableFindings: number; + skippedOutsideLookback: number; + items: IssueGraphLivenessAutoRecoveryPreviewItem[]; +} diff --git a/packages/shared/src/validators/index.ts b/packages/shared/src/validators/index.ts index 2c5d9e86..e9141631 100644 --- a/packages/shared/src/validators/index.ts +++ b/packages/shared/src/validators/index.ts @@ -5,8 +5,10 @@ export { type PatchInstanceGeneralSettings, instanceExperimentalSettingsSchema, patchInstanceExperimentalSettingsSchema, + issueGraphLivenessAutoRecoveryRequestSchema, type InstanceExperimentalSettings, type PatchInstanceExperimentalSettings, + type IssueGraphLivenessAutoRecoveryRequest, } from "./instance.js"; export { diff --git a/packages/shared/src/validators/instance.ts b/packages/shared/src/validators/instance.ts index 94d78226..3415539a 100644 --- a/packages/shared/src/validators/instance.ts +++ b/packages/shared/src/validators/instance.ts @@ -5,6 +5,9 @@ import { WEEKLY_RETENTION_PRESETS, MONTHLY_RETENTION_PRESETS, DEFAULT_BACKUP_RETENTION, + DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, } from "../types/instance.js"; import { feedbackDataSharingPreferenceSchema } from "./feedback.js"; @@ -37,11 +40,29 @@ export const instanceExperimentalSettingsSchema = z.object({ 
enableIsolatedWorkspaces: z.boolean().default(false), autoRestartDevServerWhenIdle: z.boolean().default(false), enableIssueGraphLivenessAutoRecovery: z.boolean().default(false), + issueGraphLivenessAutoRecoveryLookbackHours: z + .number() + .int() + .min(MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS) + .max(MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS) + .default(DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS), }).strict(); export const patchInstanceExperimentalSettingsSchema = instanceExperimentalSettingsSchema.partial(); +export const issueGraphLivenessAutoRecoveryRequestSchema = z.object({ + lookbackHours: z + .number() + .int() + .min(MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS) + .max(MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS) + .optional(), +}).strict(); + export type InstanceGeneralSettings = z.infer; export type PatchInstanceGeneralSettings = z.infer; export type InstanceExperimentalSettings = z.infer; export type PatchInstanceExperimentalSettings = z.infer; +export type IssueGraphLivenessAutoRecoveryRequest = z.infer< + typeof issueGraphLivenessAutoRecoveryRequestSchema +>; diff --git a/server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts b/server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts index e194d89a..369bb05c 100644 --- a/server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts +++ b/server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts @@ -103,6 +103,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => { await instanceSettingsService(db).updateExperimental({ enableIssueGraphLivenessAutoRecovery: false, enableIsolatedWorkspaces: false, + issueGraphLivenessAutoRecoveryLookbackHours: 24, }); }); @@ -116,7 +117,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => { }); } - async function seedBlockedChain(opts: { stale?: boolean } = {}) { + async function seedBlockedChain(opts: { outsideLookback?: boolean 
} = {}) { const companyId = randomUUID(); const managerId = randomUUID(); const coderId = randomUUID(); @@ -157,9 +158,9 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => { }, ]); - const issueTimestamp = opts.stale === false - ? new Date() - : new Date(Date.now() - 25 * 60 * 60 * 1000); + const issueTimestamp = opts.outsideLookback === true + ? new Date(Date.now() - 25 * 60 * 60 * 1000) + : new Date(Date.now() - 60 * 60 * 1000); await db.insert(issues).values([ { id: blockedIssueId, @@ -197,6 +198,9 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => { } it("keeps liveness findings advisory when auto recovery is disabled", async () => { + await instanceSettingsService(db).updateExperimental({ + enableIssueGraphLivenessAutoRecovery: false, + }); const { companyId } = await seedBlockedChain(); const heartbeat = heartbeatService(db); @@ -214,16 +218,16 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => { expect(escalations).toHaveLength(0); }); - it("does not create recovery issues until the dependency path is stale for 24 hours", async () => { + it("does not create recovery issues outside the configured lookback window", async () => { await enableAutoRecovery(); - const { companyId } = await seedBlockedChain({ stale: false }); + const { companyId } = await seedBlockedChain({ outsideLookback: true }); const heartbeat = heartbeatService(db); const result = await heartbeat.reconcileIssueGraphLiveness(); expect(result.findings).toBe(1); expect(result.escalationsCreated).toBe(0); - expect(result.skippedAutoRecoveryTooYoung).toBe(1); + expect(result.skippedOutsideLookback).toBe(1); const escalations = await db .select() @@ -424,7 +428,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => { const dependentExecutionWorkspaceId = randomUUID(); const blockerExecutionWorkspaceId = randomUUID(); const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 
6).toUpperCase()}`; - const issueTimestamp = new Date(Date.now() - 25 * 60 * 60 * 1000); + const issueTimestamp = new Date(Date.now() - 60 * 60 * 1000); await db.insert(companies).values({ id: companyId, @@ -559,7 +563,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => { const { companyId, blockedIssueId, blockerIssueId } = await seedBlockedChain(); const secondBlockedIssueId = randomUUID(); const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`; - const issueTimestamp = new Date(Date.now() - 25 * 60 * 60 * 1000); + const issueTimestamp = new Date(Date.now() - 60 * 60 * 1000); await db.insert(issues).values({ id: secondBlockedIssueId, companyId, diff --git a/server/src/__tests__/instance-settings-routes.test.ts b/server/src/__tests__/instance-settings-routes.test.ts index 8cfd413a..41e52190 100644 --- a/server/src/__tests__/instance-settings-routes.test.ts +++ b/server/src/__tests__/instance-settings-routes.test.ts @@ -9,10 +9,15 @@ const mockInstanceSettingsService = vi.hoisted(() => ({ updateExperimental: vi.fn(), listCompanyIds: vi.fn(), })); +const mockHeartbeatService = vi.hoisted(() => ({ + buildIssueGraphLivenessAutoRecoveryPreview: vi.fn(), + reconcileIssueGraphLiveness: vi.fn(), +})); const mockLogActivity = vi.hoisted(() => vi.fn()); function registerModuleMocks() { vi.doMock("../services/index.js", () => ({ + heartbeatService: () => mockHeartbeatService, instanceSettingsService: () => mockInstanceSettingsService, logActivity: mockLogActivity, })); @@ -48,6 +53,8 @@ describe("instance settings routes", () => { mockInstanceSettingsService.updateGeneral.mockReset(); mockInstanceSettingsService.updateExperimental.mockReset(); mockInstanceSettingsService.listCompanyIds.mockReset(); + mockHeartbeatService.buildIssueGraphLivenessAutoRecoveryPreview.mockReset(); + mockHeartbeatService.reconcileIssueGraphLiveness.mockReset(); mockLogActivity.mockReset(); 
mockInstanceSettingsService.getGeneral.mockResolvedValue({ censorUsernameInLogs: false, @@ -58,7 +65,8 @@ describe("instance settings routes", () => { enableEnvironments: false, enableIsolatedWorkspaces: false, autoRestartDevServerWhenIdle: false, - enableIssueGraphLivenessAutoRecovery: false, + enableIssueGraphLivenessAutoRecovery: true, + issueGraphLivenessAutoRecoveryLookbackHours: 24, }); mockInstanceSettingsService.updateGeneral.mockResolvedValue({ id: "instance-settings-1", @@ -74,10 +82,32 @@ describe("instance settings routes", () => { enableEnvironments: true, enableIsolatedWorkspaces: true, autoRestartDevServerWhenIdle: false, - enableIssueGraphLivenessAutoRecovery: false, + enableIssueGraphLivenessAutoRecovery: true, + issueGraphLivenessAutoRecoveryLookbackHours: 24, }, }); mockInstanceSettingsService.listCompanyIds.mockResolvedValue(["company-1", "company-2"]); + mockHeartbeatService.buildIssueGraphLivenessAutoRecoveryPreview.mockResolvedValue({ + lookbackHours: 24, + cutoff: "2026-04-26T12:00:00.000Z", + generatedAt: "2026-04-27T12:00:00.000Z", + findings: 1, + recoverableFindings: 1, + skippedOutsideLookback: 0, + items: [], + }); + mockHeartbeatService.reconcileIssueGraphLiveness.mockResolvedValue({ + findings: 1, + autoRecoveryEnabled: true, + lookbackHours: 24, + cutoff: "2026-04-26T12:00:00.000Z", + escalationsCreated: 1, + existingEscalations: 0, + skipped: 0, + skippedAutoRecoveryDisabled: 0, + skippedOutsideLookback: 0, + escalationIssueIds: ["issue-2"], + }); }); it("allows local board users to read and update experimental settings", async () => { @@ -94,7 +124,8 @@ describe("instance settings routes", () => { enableEnvironments: false, enableIsolatedWorkspaces: false, autoRestartDevServerWhenIdle: false, - enableIssueGraphLivenessAutoRecovery: false, + enableIssueGraphLivenessAutoRecovery: true, + issueGraphLivenessAutoRecoveryLookbackHours: 24, }); const patchRes = await request(app) @@ -138,14 +169,58 @@ describe("instance settings routes", 
() => { await request(app) .patch("/api/instance/settings/experimental") - .send({ enableIssueGraphLivenessAutoRecovery: true }) + .send({ + enableIssueGraphLivenessAutoRecovery: true, + issueGraphLivenessAutoRecoveryLookbackHours: 12, + }) .expect(200); expect(mockInstanceSettingsService.updateExperimental).toHaveBeenCalledWith({ enableIssueGraphLivenessAutoRecovery: true, + issueGraphLivenessAutoRecoveryLookbackHours: 12, }); }); + it("previews issue graph liveness recovery candidates before enabling", async () => { + const app = await createApp({ + type: "board", + userId: "local-board", + source: "local_implicit", + isInstanceAdmin: true, + }); + + const res = await request(app) + .post("/api/instance/settings/experimental/issue-graph-liveness-auto-recovery/preview") + .send({ lookbackHours: 12 }) + .expect(200); + + expect(res.body).toMatchObject({ lookbackHours: 24, recoverableFindings: 1 }); + expect(mockHeartbeatService.buildIssueGraphLivenessAutoRecoveryPreview).toHaveBeenCalledWith({ + lookbackHours: 12, + }); + }); + + it("kicks off issue graph liveness recovery on demand", async () => { + const app = await createApp({ + type: "board", + userId: "local-board", + source: "local_implicit", + isInstanceAdmin: true, + }); + + await request(app) + .post("/api/instance/settings/experimental/issue-graph-liveness-auto-recovery/run") + .send({ lookbackHours: 12 }) + .expect(200); + + expect(mockHeartbeatService.reconcileIssueGraphLiveness).toHaveBeenCalledWith({ + runId: null, + force: true, + lookbackHours: 12, + }); + expect(mockLogActivity).toHaveBeenCalledTimes(2); + }); + it("allows local board users to update environment controls", async () => { const app = await createApp({ type: "board", diff --git a/server/src/routes/instance-settings.ts b/server/src/routes/instance-settings.ts index 946278ef..2f224725 100644 --- a/server/src/routes/instance-settings.ts +++ b/server/src/routes/instance-settings.ts @@ -1,9 +1,13 @@ import { Router, type Request } from 
"express"; import type { Db } from "@paperclipai/db"; -import { patchInstanceExperimentalSettingsSchema, patchInstanceGeneralSettingsSchema } from "@paperclipai/shared"; +import { + issueGraphLivenessAutoRecoveryRequestSchema, + patchInstanceExperimentalSettingsSchema, + patchInstanceGeneralSettingsSchema, +} from "@paperclipai/shared"; import { forbidden } from "../errors.js"; import { validate } from "../middleware/validate.js"; -import { instanceSettingsService, logActivity } from "../services/index.js"; +import { heartbeatService, instanceSettingsService, logActivity } from "../services/index.js"; import { assertBoardOrgAccess, getActorInfo } from "./authz.js"; function assertCanManageInstanceSettings(req: Request) { @@ -19,6 +23,7 @@ function assertCanManageInstanceSettings(req: Request) { export function instanceSettingsRoutes(db: Db) { const router = Router(); const svc = instanceSettingsService(db); + const heartbeat = heartbeatService(db); router.get("/instance/settings/general", async (req, res) => { // General settings (e.g. 
keyboardShortcuts) are readable by any @@ -94,5 +99,53 @@ export function instanceSettingsRoutes(db: Db) { }, ); + router.post( + "/instance/settings/experimental/issue-graph-liveness-auto-recovery/preview", + validate(issueGraphLivenessAutoRecoveryRequestSchema), + async (req, res) => { + assertCanManageInstanceSettings(req); + res.json(await heartbeat.buildIssueGraphLivenessAutoRecoveryPreview({ + lookbackHours: req.body.lookbackHours, + })); + }, + ); + + router.post( + "/instance/settings/experimental/issue-graph-liveness-auto-recovery/run", + validate(issueGraphLivenessAutoRecoveryRequestSchema), + async (req, res) => { + assertCanManageInstanceSettings(req); + const actor = getActorInfo(req); + const result = await heartbeat.reconcileIssueGraphLiveness({ + runId: actor.runId, + force: true, + lookbackHours: req.body.lookbackHours, + }); + const companyIds = await svc.listCompanyIds(); + await Promise.all( + companyIds.map((companyId) => + logActivity(db, { + companyId, + actorType: actor.actorType, + actorId: actor.actorId, + agentId: actor.agentId, + runId: actor.runId, + action: "instance.settings.issue_graph_liveness_auto_recovery_run", + entityType: "instance_settings", + entityId: "default", + details: { + lookbackHours: result.lookbackHours, + escalationsCreated: result.escalationsCreated, + existingEscalations: result.existingEscalations, + skippedOutsideLookback: result.skippedOutsideLookback, + escalationIssueIds: result.escalationIssueIds, + }, + }), + ), + ); + res.json(result); + }, + ); + return router; } diff --git a/server/src/services/heartbeat.ts b/server/src/services/heartbeat.ts index c8e33372..ef1ee1fe 100644 --- a/server/src/services/heartbeat.ts +++ b/server/src/services/heartbeat.ts @@ -4502,7 +4502,15 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {}) return recovery.buildRunOutputSilence(run, now); } - async function reconcileIssueGraphLiveness(opts?: { runId?: string | null }) { + async function 
buildIssueGraphLivenessAutoRecoveryPreview(opts?: { lookbackHours?: number; now?: Date }) { + return recovery.buildIssueGraphLivenessAutoRecoveryPreview(opts); + } + + async function reconcileIssueGraphLiveness(opts?: { + runId?: string | null; + force?: boolean; + lookbackHours?: number; + }) { return recovery.reconcileIssueGraphLiveness(opts); } @@ -7477,6 +7485,8 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {}) reconcileStrandedAssignedIssues, + buildIssueGraphLivenessAutoRecoveryPreview, + reconcileIssueGraphLiveness, scanSilentActiveRuns, diff --git a/server/src/services/instance-settings.ts b/server/src/services/instance-settings.ts index 7ada2ef7..c447a920 100644 --- a/server/src/services/instance-settings.ts +++ b/server/src/services/instance-settings.ts @@ -3,6 +3,7 @@ import { companies, instanceSettings } from "@paperclipai/db"; import { DEFAULT_FEEDBACK_DATA_SHARING_PREFERENCE, DEFAULT_BACKUP_RETENTION, + DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, instanceGeneralSettingsSchema, type InstanceGeneralSettings, instanceExperimentalSettingsSchema, @@ -42,6 +43,9 @@ function normalizeExperimentalSettings(raw: unknown): InstanceExperimentalSettin enableIsolatedWorkspaces: parsed.data.enableIsolatedWorkspaces ?? false, autoRestartDevServerWhenIdle: parsed.data.autoRestartDevServerWhenIdle ?? false, enableIssueGraphLivenessAutoRecovery: parsed.data.enableIssueGraphLivenessAutoRecovery ?? false, + issueGraphLivenessAutoRecoveryLookbackHours: + parsed.data.issueGraphLivenessAutoRecoveryLookbackHours ?? 
+ DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, }; } return { @@ -49,6 +53,8 @@ function normalizeExperimentalSettings(raw: unknown): InstanceExperimentalSettin enableIsolatedWorkspaces: false, autoRestartDevServerWhenIdle: false, enableIssueGraphLivenessAutoRecovery: false, + issueGraphLivenessAutoRecoveryLookbackHours: + DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, }; } diff --git a/server/src/services/recovery/service.ts b/server/src/services/recovery/service.ts index 30cc552e..f6e8c1f7 100644 --- a/server/src/services/recovery/service.ts +++ b/server/src/services/recovery/service.ts @@ -1,5 +1,12 @@ import { and, asc, desc, eq, gt, inArray, isNull, notInArray, sql } from "drizzle-orm"; import type { Db } from "@paperclipai/db"; +import { + DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + type IssueGraphLivenessAutoRecoveryPreview, + type IssueGraphLivenessAutoRecoveryPreviewItem, +} from "@paperclipai/shared"; import { agents, agentWakeupRequests, @@ -38,7 +45,6 @@ import { isAutomaticRecoverySuppressedByPauseHold } from "./pause-hold-guard.js" const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const; const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const; -const ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS = 24 * 60 * 60 * 1000; export const ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS = 60 * 60 * 1000; export const ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS = 4 * 60 * 60 * 1000; export const ACTIVE_RUN_OUTPUT_CONTINUE_REARM_MS = 30 * 60 * 1000; @@ -1857,18 +1863,115 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup }) return result; } - async function isLivenessFindingOldEnoughForAutoRecovery(finding: IssueLivenessFinding, now = new Date()) { - const issueIds = [...new 
Set(finding.dependencyPath.map((entry) => entry.issueId))]; - if (issueIds.length === 0) return false; + function normalizeIssueGraphLivenessAutoRecoveryLookbackHours(raw: unknown) { + const numeric = Math.floor(asNumber(raw, DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS)); + return Math.min( + MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, + Math.max(MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, numeric), + ); + } + + function livenessDependencyIssueKey(companyId: string, issueId: string) { + return `${companyId}:${issueId}`; + } + + async function loadLivenessDependencyUpdatedAtByIssue(findings: IssueLivenessFinding[]) { + const issueIds = [ + ...new Set( + findings.flatMap((finding) => finding.dependencyPath.map((entry) => entry.issueId)), + ), + ]; + if (issueIds.length === 0) return new Map(); const rows = await db - .select({ id: issues.id, updatedAt: issues.updatedAt }) + .select({ id: issues.id, companyId: issues.companyId, updatedAt: issues.updatedAt }) .from(issues) - .where(and(eq(issues.companyId, finding.companyId), inArray(issues.id, issueIds))); - if (rows.length !== issueIds.length) return false; - const latestUpdatedAt = rows.reduce((latest, row) => - row.updatedAt > latest ? row.updatedAt : latest, - rows[0]!.updatedAt); - return now.getTime() - latestUpdatedAt.getTime() >= ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS; + .where(inArray(issues.id, issueIds)); + return new Map(rows.map((row) => [ + livenessDependencyIssueKey(row.companyId, row.id), + row.updatedAt, + ])); + } + + function latestDependencyUpdatedAtForLivenessFinding( + finding: IssueLivenessFinding, + updatedAtByIssueKey: Map, + ) { + const dependencyIssueIds = [...new Set(finding.dependencyPath.map((entry) => entry.issueId))]; + if (dependencyIssueIds.length === 0) return null; + const timestamps = dependencyIssueIds.map((issueId) => + updatedAtByIssueKey.get(livenessDependencyIssueKey(finding.companyId, issueId)) ?? 
null + ); + if (timestamps.some((timestamp) => !timestamp)) return null; + const [firstTimestamp, ...remainingTimestamps] = timestamps as Date[]; + return remainingTimestamps.reduce((latest, updatedAt) => + updatedAt > latest ? updatedAt : latest, + firstTimestamp!); + } + + function isLivenessFindingInsideAutoRecoveryLookback( + finding: IssueLivenessFinding, + cutoff: Date, + updatedAtByIssueKey: Map, + ) { + const latestUpdatedAt = latestDependencyUpdatedAtForLivenessFinding(finding, updatedAtByIssueKey); + return Boolean(latestUpdatedAt && latestUpdatedAt >= cutoff); + } + + async function buildIssueGraphLivenessAutoRecoveryPreview( + opts?: { lookbackHours?: number; now?: Date }, + ): Promise { + const now = opts?.now ?? new Date(); + const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours(opts?.lookbackHours); + const cutoff = new Date(now.getTime() - lookbackHours * 60 * 60 * 1000); + const findings = await collectIssueGraphLivenessFindings(); + const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings); + const issueIds = [...new Set(findings.map((finding) => finding.recoveryIssueId))]; + const recoveryRows = issueIds.length > 0 + ? await db + .select({ id: issues.id, identifier: issues.identifier, title: issues.title }) + .from(issues) + .where(inArray(issues.id, issueIds)) + : []; + const recoveryById = new Map(recoveryRows.map((row) => [row.id, row])); + const items: IssueGraphLivenessAutoRecoveryPreviewItem[] = []; + let skippedOutsideLookback = 0; + + for (const finding of findings) { + const latestDependencyUpdatedAt = latestDependencyUpdatedAtForLivenessFinding( + finding, + updatedAtByIssueKey, + ); + if (!latestDependencyUpdatedAt || latestDependencyUpdatedAt < cutoff) { + skippedOutsideLookback += 1; + continue; + } + const recoveryIssue = recoveryById.get(finding.recoveryIssueId); + items.push({ + issueId: finding.issueId, + identifier: finding.identifier, + title: finding.dependencyPath[0]?.title ?? 
finding.identifier ?? finding.issueId, + state: finding.state, + severity: finding.severity, + reason: finding.reason, + recoveryIssueId: finding.recoveryIssueId, + recoveryIdentifier: recoveryIssue?.identifier ?? null, + recoveryTitle: recoveryIssue?.title ?? null, + recommendedOwnerAgentId: finding.recommendedOwnerAgentId, + incidentKey: finding.incidentKey, + latestDependencyUpdatedAt: latestDependencyUpdatedAt.toISOString(), + dependencyPath: finding.dependencyPath, + }); + } + + return { + lookbackHours, + cutoff: cutoff.toISOString(), + generatedAt: now.toISOString(), + findings: findings.length, + recoverableFindings: items.length, + skippedOutsideLookback, + items, + }; } async function resolveEscalationOwnerAgentId( @@ -2131,22 +2234,34 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup }) return { kind: "created" as const, escalationIssueId: escalation.id }; } - async function reconcileIssueGraphLiveness(opts?: { runId?: string | null }) { + async function reconcileIssueGraphLiveness(opts?: { + runId?: string | null; + force?: boolean; + lookbackHours?: number; + }) { const findings = await collectIssueGraphLivenessFindings(); const experimentalSettings = await instanceSettings.getExperimental(); const autoRecoveryEnabled = asBoolean( experimentalSettings.enableIssueGraphLivenessAutoRecovery, - false, + true, + ) || opts?.force === true; + const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours( + opts?.lookbackHours ?? 
experimentalSettings.issueGraphLivenessAutoRecoveryLookbackHours, ); + const now = new Date(); + const cutoff = new Date(now.getTime() - lookbackHours * 60 * 60 * 1000); const obsoleteRecoveryCleanup = await retireObsoleteLivenessRecoveryIssues(findings); + const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings); const result = { findings: findings.length, autoRecoveryEnabled, + lookbackHours, + cutoff: cutoff.toISOString(), escalationsCreated: 0, existingEscalations: 0, skipped: 0, skippedAutoRecoveryDisabled: 0, - skippedAutoRecoveryTooYoung: 0, + skippedOutsideLookback: 0, obsoleteRecoveriesRetired: obsoleteRecoveryCleanup.retired, obsoleteRecoveriesActiveSkipped: obsoleteRecoveryCleanup.activeSkipped, obsoleteRecoveryBlockerRelationsRemoved: obsoleteRecoveryCleanup.blockerRelationsRemoved, @@ -2160,10 +2275,9 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup }) return result; } - const now = new Date(); for (const finding of findings) { - if (!await isLivenessFindingOldEnoughForAutoRecovery(finding, now)) { - result.skippedAutoRecoveryTooYoung += 1; + if (!isLivenessFindingInsideAutoRecoveryLookback(finding, cutoff, updatedAtByIssueKey)) { + result.skippedOutsideLookback += 1; result.skipped += 1; continue; } @@ -2197,6 +2311,7 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup }) recordWatchdogDecision, scanSilentActiveRuns, reconcileStrandedAssignedIssues, + buildIssueGraphLivenessAutoRecoveryPreview, reconcileIssueGraphLiveness, readRecoveryTimerIntervalMs, }; diff --git a/ui/src/api/instanceSettings.ts b/ui/src/api/instanceSettings.ts index ef50ce47..4a8544b8 100644 --- a/ui/src/api/instanceSettings.ts +++ b/ui/src/api/instanceSettings.ts @@ -1,6 +1,7 @@ import type { InstanceExperimentalSettings, InstanceGeneralSettings, + IssueGraphLivenessAutoRecoveryPreview, PatchInstanceGeneralSettings, PatchInstanceExperimentalSettings, } from "@paperclipai/shared"; @@ -15,4 +16,25 
@@ export const instanceSettingsApi = { api.get("/instance/settings/experimental"), updateExperimental: (patch: PatchInstanceExperimentalSettings) => api.patch("/instance/settings/experimental", patch), + previewIssueGraphLivenessAutoRecovery: (input: { lookbackHours?: number }) => + api.post( + "/instance/settings/experimental/issue-graph-liveness-auto-recovery/preview", + input, + ), + runIssueGraphLivenessAutoRecovery: (input: { lookbackHours?: number }) => + api.post<{ + findings: number; + autoRecoveryEnabled: boolean; + lookbackHours: number; + cutoff: string; + escalationsCreated: number; + existingEscalations: number; + skipped: number; + skippedAutoRecoveryDisabled: number; + skippedOutsideLookback: number; + escalationIssueIds: string[]; + }>( + "/instance/settings/experimental/issue-graph-liveness-auto-recovery/run", + input, + ), }; diff --git a/ui/src/pages/InstanceExperimentalSettings.tsx b/ui/src/pages/InstanceExperimentalSettings.tsx index a45997d9..6d6a39f0 100644 --- a/ui/src/pages/InstanceExperimentalSettings.tsx +++ b/ui/src/pages/InstanceExperimentalSettings.tsx @@ -1,16 +1,130 @@ import { useEffect, useState } from "react"; import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query"; -import { FlaskConical } from "lucide-react"; -import type { PatchInstanceExperimentalSettings } from "@paperclipai/shared"; +import { Clock, FlaskConical, Play, Search } from "lucide-react"; +import type { + IssueGraphLivenessAutoRecoveryPreview, + PatchInstanceExperimentalSettings, +} from "@paperclipai/shared"; import { instanceSettingsApi } from "@/api/instanceSettings"; import { useBreadcrumbs } from "../context/BreadcrumbContext"; import { queryKeys } from "../lib/queryKeys"; import { ToggleSwitch } from "@/components/ui/toggle-switch"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} 
from "@/components/ui/dialog"; + +function issueHref(identifier: string | null, issueId: string) { + if (!identifier) return `/issues/${issueId}`; + const prefix = identifier.split("-")[0] || "PAP"; + return `/${prefix}/issues/${identifier}`; +} + +function formatRecoveryState(state: string) { + return state.replace(/_/g, " "); +} + +function RecoveryPreviewDialog({ + preview, + open, + onOpenChange, + onEnableOnly, + onEnableAndRun, + isPending, +}: { + preview: IssueGraphLivenessAutoRecoveryPreview | null; + open: boolean; + onOpenChange: (open: boolean) => void; + onEnableOnly: () => void; + onEnableAndRun: () => void; + isPending: boolean; +}) { + const count = preview?.recoverableFindings ?? 0; + return ( + + + + Confirm auto-recovery + + {preview + ? `${count} recovery ${count === 1 ? "task" : "tasks"} match the last ${preview.lookbackHours} hours.` + : "Checking recovery candidates before enabling."} + + + +
+ {preview && preview.items.length === 0 ? ( +
+ No recovery tasks would be created right now. Auto-recovery can still run for future liveness incidents in + this window. +
+ ) : null} + + {preview?.items.map((item) => ( +
+
+ + {item.identifier ?? item.issueId} + + + {formatRecoveryState(item.state)} + +
+

{item.title}

+

{item.reason}

+ +
+ ))} +
+ + {preview && preview.skippedOutsideLookback > 0 ? ( +

+ {preview.skippedOutsideLookback} current{" "} + {preview.skippedOutsideLookback === 1 ? "finding is" : "findings are"} outside the configured lookback and + will not be touched. +

+ ) : null} + + + + + + +
+
+ ); +} export function InstanceExperimentalSettings() { const { setBreadcrumbs } = useBreadcrumbs(); const queryClient = useQueryClient(); const [actionError, setActionError] = useState(null); + const [lookbackHoursDraft, setLookbackHoursDraft] = useState("24"); + const [previewDialogOpen, setPreviewDialogOpen] = useState(false); + const [pendingPreview, setPendingPreview] = useState(null); useEffect(() => { setBreadcrumbs([ @@ -39,6 +153,42 @@ export function InstanceExperimentalSettings() { }, }); + const previewMutation = useMutation({ + mutationFn: async (lookbackHours: number) => + instanceSettingsApi.previewIssueGraphLivenessAutoRecovery({ lookbackHours }), + onSuccess: (preview) => { + setActionError(null); + setPendingPreview(preview); + setPreviewDialogOpen(true); + }, + onError: (error) => { + setActionError(error instanceof Error ? error.message : "Failed to preview recovery tasks."); + }, + }); + + const runRecoveryMutation = useMutation({ + mutationFn: async (lookbackHours: number) => + instanceSettingsApi.runIssueGraphLivenessAutoRecovery({ lookbackHours }), + onSuccess: async () => { + setActionError(null); + setPreviewDialogOpen(false); + await Promise.all([ + queryClient.invalidateQueries({ queryKey: queryKeys.instance.experimentalSettings }), + queryClient.invalidateQueries({ queryKey: queryKeys.health }), + ]); + }, + onError: (error) => { + setActionError(error instanceof Error ? error.message : "Failed to create recovery tasks."); + }, + }); + + useEffect(() => { + const next = experimentalQuery.data?.issueGraphLivenessAutoRecoveryLookbackHours; + if (typeof next === "number") { + setLookbackHoursDraft(String(next)); + } + }, [experimentalQuery.data?.issueGraphLivenessAutoRecoveryLookbackHours]); + if (experimentalQuery.isLoading) { return
Loading experimental settings...
; } @@ -58,6 +208,41 @@ export function InstanceExperimentalSettings() { const autoRestartDevServerWhenIdle = experimentalQuery.data?.autoRestartDevServerWhenIdle === true; const enableIssueGraphLivenessAutoRecovery = experimentalQuery.data?.enableIssueGraphLivenessAutoRecovery === true; + const lookbackHours = + experimentalQuery.data?.issueGraphLivenessAutoRecoveryLookbackHours ?? 24; + const parsedLookbackHours = Number.parseInt(lookbackHoursDraft, 10); + const lookbackHoursIsValid = + Number.isInteger(parsedLookbackHours) && parsedLookbackHours >= 1 && parsedLookbackHours <= 720; + const recoveryActionPending = + toggleMutation.isPending || previewMutation.isPending || runRecoveryMutation.isPending; + + function previewForEnable() { + if (!lookbackHoursIsValid) { + setActionError("Lookback hours must be a whole number from 1 to 720."); + return; + } + previewMutation.mutate(parsedLookbackHours); + } + + function enableOnly() { + if (!lookbackHoursIsValid) return; + toggleMutation.mutate({ + enableIssueGraphLivenessAutoRecovery: true, + issueGraphLivenessAutoRecoveryLookbackHours: parsedLookbackHours, + }, { + onSuccess: () => setPreviewDialogOpen(false), + }); + } + + function enableAndRun() { + if (!lookbackHoursIsValid) return; + toggleMutation.mutate({ + enableIssueGraphLivenessAutoRecovery: true, + issueGraphLivenessAutoRecoveryLookbackHours: parsedLookbackHours, + }, { + onSuccess: () => runRecoveryMutation.mutate(parsedLookbackHours), + }); + } return (
@@ -132,26 +317,99 @@ export function InstanceExperimentalSettings() {
-
-
-

Auto-Create Issue Recovery Tasks

-

- Let the heartbeat scheduler create recovery issues for issue dependency chains that have been stalled for - at least 24 hours. -

+
+
+
+

Auto-Create Issue Recovery Tasks

+

+ Let the heartbeat scheduler create recovery issues for issue dependency chains found inside the + configured lookback window. +

+
+ { + if (enableIssueGraphLivenessAutoRecovery) { + toggleMutation.mutate({ enableIssueGraphLivenessAutoRecovery: false }); + return; + } + previewForEnable(); + }} + disabled={recoveryActionPending} + aria-label="Toggle issue graph liveness auto-recovery" + />
- - toggleMutation.mutate({ - enableIssueGraphLivenessAutoRecovery: !enableIssueGraphLivenessAutoRecovery, - }) - } - disabled={toggleMutation.isPending} - aria-label="Toggle issue graph liveness auto-recovery" - /> + +
+ +
+ + + +
+
+ +

+ Current window: last {lookbackHours} {lookbackHours === 1 ? "hour" : "hours"}. +

+ +
); }