forked from farhoodlabs/paperclip
[codex] Add configurable liveness auto-recovery controls (#4587)
## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Heartbeat liveness recovery decides when stalled issue trees need manager-visible follow-up.
> - Automatic recovery issue creation is useful, but operators need instance-level controls for how aggressive it is.
> - Without controls, recovery behavior is harder to tune for local development, production operations, and noisy edge cases.
> - This pull request adds configurable liveness auto-recovery settings across shared contracts, API routes, services, and the instance experimental settings UI.
> - The benefit is that operators can keep liveness findings advisory or enable bounded recovery automation with explicit intervals and lookback windows.

## What Changed

- Added shared types and validators for liveness auto-recovery settings.
- Extended instance settings routes and services to persist and validate the new controls.
- Wired heartbeat/recovery services to honor enablement, minimum interval, and lookback settings.
- Added UI controls for liveness recovery under instance experimental settings.
- Covered the new server behavior with instance settings and liveness escalation tests.

## Verification

- `pnpm exec vitest run --project @paperclipai/server server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts server/src/__tests__/instance-settings-routes.test.ts --pool=forks --poolOptions.forks.isolate=true`
- `pnpm --filter @paperclipai/shared typecheck`
- `pnpm --filter @paperclipai/server typecheck`
- `pnpm --filter @paperclipai/ui typecheck`

## Risks

- Moderate behavioral risk because recovery automation timing changes when enabled; defaults keep existing advisory behavior unless the setting is turned on.
- No database migration in this PR; settings are stored through the existing instance settings path.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, `gpt-5`, coding model with tool use and local command execution; context window not exposed by the runtime.

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -103,6 +103,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => {
|
||||
await instanceSettingsService(db).updateExperimental({
|
||||
enableIssueGraphLivenessAutoRecovery: false,
|
||||
enableIsolatedWorkspaces: false,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours: 24,
|
||||
});
|
||||
});
|
||||
|
||||
@@ -116,7 +117,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => {
|
||||
});
|
||||
}
|
||||
|
||||
async function seedBlockedChain(opts: { stale?: boolean } = {}) {
|
||||
async function seedBlockedChain(opts: { outsideLookback?: boolean } = {}) {
|
||||
const companyId = randomUUID();
|
||||
const managerId = randomUUID();
|
||||
const coderId = randomUUID();
|
||||
@@ -157,9 +158,9 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => {
|
||||
},
|
||||
]);
|
||||
|
||||
const issueTimestamp = opts.stale === false
|
||||
? new Date()
|
||||
: new Date(Date.now() - 25 * 60 * 60 * 1000);
|
||||
const issueTimestamp = opts.outsideLookback === true
|
||||
? new Date(Date.now() - 25 * 60 * 60 * 1000)
|
||||
: new Date(Date.now() - 60 * 60 * 1000);
|
||||
await db.insert(issues).values([
|
||||
{
|
||||
id: blockedIssueId,
|
||||
@@ -197,6 +198,9 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => {
|
||||
}
|
||||
|
||||
it("keeps liveness findings advisory when auto recovery is disabled", async () => {
|
||||
await instanceSettingsService(db).updateExperimental({
|
||||
enableIssueGraphLivenessAutoRecovery: false,
|
||||
});
|
||||
const { companyId } = await seedBlockedChain();
|
||||
const heartbeat = heartbeatService(db);
|
||||
|
||||
@@ -214,16 +218,16 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => {
|
||||
expect(escalations).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("does not create recovery issues until the dependency path is stale for 24 hours", async () => {
|
||||
it("does not create recovery issues outside the configured lookback window", async () => {
|
||||
await enableAutoRecovery();
|
||||
const { companyId } = await seedBlockedChain({ stale: false });
|
||||
const { companyId } = await seedBlockedChain({ outsideLookback: true });
|
||||
const heartbeat = heartbeatService(db);
|
||||
|
||||
const result = await heartbeat.reconcileIssueGraphLiveness();
|
||||
|
||||
expect(result.findings).toBe(1);
|
||||
expect(result.escalationsCreated).toBe(0);
|
||||
expect(result.skippedAutoRecoveryTooYoung).toBe(1);
|
||||
expect(result.skippedOutsideLookback).toBe(1);
|
||||
|
||||
const escalations = await db
|
||||
.select()
|
||||
@@ -424,7 +428,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => {
|
||||
const dependentExecutionWorkspaceId = randomUUID();
|
||||
const blockerExecutionWorkspaceId = randomUUID();
|
||||
const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`;
|
||||
const issueTimestamp = new Date(Date.now() - 25 * 60 * 60 * 1000);
|
||||
const issueTimestamp = new Date(Date.now() - 60 * 60 * 1000);
|
||||
|
||||
await db.insert(companies).values({
|
||||
id: companyId,
|
||||
@@ -559,7 +563,7 @@ describeEmbeddedPostgres("heartbeat issue graph liveness escalation", () => {
|
||||
const { companyId, blockedIssueId, blockerIssueId } = await seedBlockedChain();
|
||||
const secondBlockedIssueId = randomUUID();
|
||||
const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`;
|
||||
const issueTimestamp = new Date(Date.now() - 25 * 60 * 60 * 1000);
|
||||
const issueTimestamp = new Date(Date.now() - 60 * 60 * 1000);
|
||||
await db.insert(issues).values({
|
||||
id: secondBlockedIssueId,
|
||||
companyId,
|
||||
|
||||
@@ -9,10 +9,15 @@ const mockInstanceSettingsService = vi.hoisted(() => ({
|
||||
updateExperimental: vi.fn(),
|
||||
listCompanyIds: vi.fn(),
|
||||
}));
|
||||
const mockHeartbeatService = vi.hoisted(() => ({
|
||||
buildIssueGraphLivenessAutoRecoveryPreview: vi.fn(),
|
||||
reconcileIssueGraphLiveness: vi.fn(),
|
||||
}));
|
||||
const mockLogActivity = vi.hoisted(() => vi.fn());
|
||||
|
||||
function registerModuleMocks() {
|
||||
vi.doMock("../services/index.js", () => ({
|
||||
heartbeatService: () => mockHeartbeatService,
|
||||
instanceSettingsService: () => mockInstanceSettingsService,
|
||||
logActivity: mockLogActivity,
|
||||
}));
|
||||
@@ -48,6 +53,8 @@ describe("instance settings routes", () => {
|
||||
mockInstanceSettingsService.updateGeneral.mockReset();
|
||||
mockInstanceSettingsService.updateExperimental.mockReset();
|
||||
mockInstanceSettingsService.listCompanyIds.mockReset();
|
||||
mockHeartbeatService.buildIssueGraphLivenessAutoRecoveryPreview.mockReset();
|
||||
mockHeartbeatService.reconcileIssueGraphLiveness.mockReset();
|
||||
mockLogActivity.mockReset();
|
||||
mockInstanceSettingsService.getGeneral.mockResolvedValue({
|
||||
censorUsernameInLogs: false,
|
||||
@@ -58,7 +65,8 @@ describe("instance settings routes", () => {
|
||||
enableEnvironments: false,
|
||||
enableIsolatedWorkspaces: false,
|
||||
autoRestartDevServerWhenIdle: false,
|
||||
enableIssueGraphLivenessAutoRecovery: false,
|
||||
enableIssueGraphLivenessAutoRecovery: true,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours: 24,
|
||||
});
|
||||
mockInstanceSettingsService.updateGeneral.mockResolvedValue({
|
||||
id: "instance-settings-1",
|
||||
@@ -74,10 +82,32 @@ describe("instance settings routes", () => {
|
||||
enableEnvironments: true,
|
||||
enableIsolatedWorkspaces: true,
|
||||
autoRestartDevServerWhenIdle: false,
|
||||
enableIssueGraphLivenessAutoRecovery: false,
|
||||
enableIssueGraphLivenessAutoRecovery: true,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours: 24,
|
||||
},
|
||||
});
|
||||
mockInstanceSettingsService.listCompanyIds.mockResolvedValue(["company-1", "company-2"]);
|
||||
mockHeartbeatService.buildIssueGraphLivenessAutoRecoveryPreview.mockResolvedValue({
|
||||
lookbackHours: 24,
|
||||
cutoff: "2026-04-26T12:00:00.000Z",
|
||||
generatedAt: "2026-04-27T12:00:00.000Z",
|
||||
findings: 1,
|
||||
recoverableFindings: 1,
|
||||
skippedOutsideLookback: 0,
|
||||
items: [],
|
||||
});
|
||||
mockHeartbeatService.reconcileIssueGraphLiveness.mockResolvedValue({
|
||||
findings: 1,
|
||||
autoRecoveryEnabled: true,
|
||||
lookbackHours: 24,
|
||||
cutoff: "2026-04-26T12:00:00.000Z",
|
||||
escalationsCreated: 1,
|
||||
existingEscalations: 0,
|
||||
skipped: 0,
|
||||
skippedAutoRecoveryDisabled: 0,
|
||||
skippedOutsideLookback: 0,
|
||||
escalationIssueIds: ["issue-2"],
|
||||
});
|
||||
});
|
||||
|
||||
it("allows local board users to read and update experimental settings", async () => {
|
||||
@@ -94,7 +124,8 @@ describe("instance settings routes", () => {
|
||||
enableEnvironments: false,
|
||||
enableIsolatedWorkspaces: false,
|
||||
autoRestartDevServerWhenIdle: false,
|
||||
enableIssueGraphLivenessAutoRecovery: false,
|
||||
enableIssueGraphLivenessAutoRecovery: true,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours: 24,
|
||||
});
|
||||
|
||||
const patchRes = await request(app)
|
||||
@@ -138,14 +169,58 @@ describe("instance settings routes", () => {
|
||||
|
||||
await request(app)
|
||||
.patch("/api/instance/settings/experimental")
|
||||
.send({ enableIssueGraphLivenessAutoRecovery: true })
|
||||
.send({
|
||||
enableIssueGraphLivenessAutoRecovery: true,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours: 12,
|
||||
})
|
||||
.expect(200);
|
||||
|
||||
expect(mockInstanceSettingsService.updateExperimental).toHaveBeenCalledWith({
|
||||
enableIssueGraphLivenessAutoRecovery: true,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours: 12,
|
||||
});
|
||||
});
|
||||
|
||||
it("previews issue graph liveness recovery candidates before enabling", async () => {
|
||||
const app = await createApp({
|
||||
type: "board",
|
||||
userId: "local-board",
|
||||
source: "local_implicit",
|
||||
isInstanceAdmin: true,
|
||||
});
|
||||
|
||||
const res = await request(app)
|
||||
.post("/api/instance/settings/experimental/issue-graph-liveness-auto-recovery/preview")
|
||||
.send({ lookbackHours: 12 })
|
||||
.expect(200);
|
||||
|
||||
expect(res.body).toMatchObject({ lookbackHours: 24, recoverableFindings: 1 });
|
||||
expect(mockHeartbeatService.buildIssueGraphLivenessAutoRecoveryPreview).toHaveBeenCalledWith({
|
||||
lookbackHours: 12,
|
||||
});
|
||||
});
|
||||
|
||||
it("kicks off issue graph liveness recovery on demand", async () => {
|
||||
const app = await createApp({
|
||||
type: "board",
|
||||
userId: "local-board",
|
||||
source: "local_implicit",
|
||||
isInstanceAdmin: true,
|
||||
});
|
||||
|
||||
await request(app)
|
||||
.post("/api/instance/settings/experimental/issue-graph-liveness-auto-recovery/run")
|
||||
.send({ lookbackHours: 12 })
|
||||
.expect(200);
|
||||
|
||||
expect(mockHeartbeatService.reconcileIssueGraphLiveness).toHaveBeenCalledWith({
|
||||
runId: null,
|
||||
force: true,
|
||||
lookbackHours: 12,
|
||||
});
|
||||
expect(mockLogActivity).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
it("allows local board users to update environment controls", async () => {
|
||||
const app = await createApp({
|
||||
type: "board",
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
import { Router, type Request } from "express";
|
||||
import type { Db } from "@paperclipai/db";
|
||||
import { patchInstanceExperimentalSettingsSchema, patchInstanceGeneralSettingsSchema } from "@paperclipai/shared";
|
||||
import {
|
||||
issueGraphLivenessAutoRecoveryRequestSchema,
|
||||
patchInstanceExperimentalSettingsSchema,
|
||||
patchInstanceGeneralSettingsSchema,
|
||||
} from "@paperclipai/shared";
|
||||
import { forbidden } from "../errors.js";
|
||||
import { validate } from "../middleware/validate.js";
|
||||
import { instanceSettingsService, logActivity } from "../services/index.js";
|
||||
import { heartbeatService, instanceSettingsService, logActivity } from "../services/index.js";
|
||||
import { assertBoardOrgAccess, getActorInfo } from "./authz.js";
|
||||
|
||||
function assertCanManageInstanceSettings(req: Request) {
|
||||
@@ -19,6 +23,7 @@ function assertCanManageInstanceSettings(req: Request) {
|
||||
export function instanceSettingsRoutes(db: Db) {
|
||||
const router = Router();
|
||||
const svc = instanceSettingsService(db);
|
||||
const heartbeat = heartbeatService(db);
|
||||
|
||||
router.get("/instance/settings/general", async (req, res) => {
|
||||
// General settings (e.g. keyboardShortcuts) are readable by any
|
||||
@@ -94,5 +99,53 @@ export function instanceSettingsRoutes(db: Db) {
|
||||
},
|
||||
);
|
||||
|
||||
router.post(
|
||||
"/instance/settings/experimental/issue-graph-liveness-auto-recovery/preview",
|
||||
validate(issueGraphLivenessAutoRecoveryRequestSchema),
|
||||
async (req, res) => {
|
||||
assertCanManageInstanceSettings(req);
|
||||
res.json(await heartbeat.buildIssueGraphLivenessAutoRecoveryPreview({
|
||||
lookbackHours: req.body.lookbackHours,
|
||||
}));
|
||||
},
|
||||
);
|
||||
|
||||
router.post(
|
||||
"/instance/settings/experimental/issue-graph-liveness-auto-recovery/run",
|
||||
validate(issueGraphLivenessAutoRecoveryRequestSchema),
|
||||
async (req, res) => {
|
||||
assertCanManageInstanceSettings(req);
|
||||
const actor = getActorInfo(req);
|
||||
const result = await heartbeat.reconcileIssueGraphLiveness({
|
||||
runId: actor.runId,
|
||||
force: true,
|
||||
lookbackHours: req.body.lookbackHours,
|
||||
});
|
||||
const companyIds = await svc.listCompanyIds();
|
||||
await Promise.all(
|
||||
companyIds.map((companyId) =>
|
||||
logActivity(db, {
|
||||
companyId,
|
||||
actorType: actor.actorType,
|
||||
actorId: actor.actorId,
|
||||
agentId: actor.agentId,
|
||||
runId: actor.runId,
|
||||
action: "instance.settings.issue_graph_liveness_auto_recovery_run",
|
||||
entityType: "instance_settings",
|
||||
entityId: "default",
|
||||
details: {
|
||||
lookbackHours: result.lookbackHours,
|
||||
escalationsCreated: result.escalationsCreated,
|
||||
existingEscalations: result.existingEscalations,
|
||||
skippedOutsideLookback: result.skippedOutsideLookback,
|
||||
escalationIssueIds: result.escalationIssueIds,
|
||||
},
|
||||
}),
|
||||
),
|
||||
);
|
||||
res.json(result);
|
||||
},
|
||||
);
|
||||
|
||||
return router;
|
||||
}
|
||||
|
||||
@@ -4502,7 +4502,15 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {})
|
||||
return recovery.buildRunOutputSilence(run, now);
|
||||
}
|
||||
|
||||
async function reconcileIssueGraphLiveness(opts?: { runId?: string | null }) {
|
||||
async function buildIssueGraphLivenessAutoRecoveryPreview(opts?: { lookbackHours?: number; now?: Date }) {
|
||||
return recovery.buildIssueGraphLivenessAutoRecoveryPreview(opts);
|
||||
}
|
||||
|
||||
async function reconcileIssueGraphLiveness(opts?: {
|
||||
runId?: string | null;
|
||||
force?: boolean;
|
||||
lookbackHours?: number;
|
||||
}) {
|
||||
return recovery.reconcileIssueGraphLiveness(opts);
|
||||
}
|
||||
|
||||
@@ -7477,6 +7485,8 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {})
|
||||
|
||||
reconcileStrandedAssignedIssues,
|
||||
|
||||
buildIssueGraphLivenessAutoRecoveryPreview,
|
||||
|
||||
reconcileIssueGraphLiveness,
|
||||
|
||||
scanSilentActiveRuns,
|
||||
|
||||
@@ -3,6 +3,7 @@ import { companies, instanceSettings } from "@paperclipai/db";
|
||||
import {
|
||||
DEFAULT_FEEDBACK_DATA_SHARING_PREFERENCE,
|
||||
DEFAULT_BACKUP_RETENTION,
|
||||
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||||
instanceGeneralSettingsSchema,
|
||||
type InstanceGeneralSettings,
|
||||
instanceExperimentalSettingsSchema,
|
||||
@@ -42,6 +43,9 @@ function normalizeExperimentalSettings(raw: unknown): InstanceExperimentalSettin
|
||||
enableIsolatedWorkspaces: parsed.data.enableIsolatedWorkspaces ?? false,
|
||||
autoRestartDevServerWhenIdle: parsed.data.autoRestartDevServerWhenIdle ?? false,
|
||||
enableIssueGraphLivenessAutoRecovery: parsed.data.enableIssueGraphLivenessAutoRecovery ?? false,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours:
|
||||
parsed.data.issueGraphLivenessAutoRecoveryLookbackHours ??
|
||||
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||||
};
|
||||
}
|
||||
return {
|
||||
@@ -49,6 +53,8 @@ function normalizeExperimentalSettings(raw: unknown): InstanceExperimentalSettin
|
||||
enableIsolatedWorkspaces: false,
|
||||
autoRestartDevServerWhenIdle: false,
|
||||
enableIssueGraphLivenessAutoRecovery: false,
|
||||
issueGraphLivenessAutoRecoveryLookbackHours:
|
||||
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
import { and, asc, desc, eq, gt, inArray, isNull, notInArray, sql } from "drizzle-orm";
|
||||
import type { Db } from "@paperclipai/db";
|
||||
import {
|
||||
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||||
MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||||
MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||||
type IssueGraphLivenessAutoRecoveryPreview,
|
||||
type IssueGraphLivenessAutoRecoveryPreviewItem,
|
||||
} from "@paperclipai/shared";
|
||||
import {
|
||||
agents,
|
||||
agentWakeupRequests,
|
||||
@@ -38,7 +45,6 @@ import { isAutomaticRecoverySuppressedByPauseHold } from "./pause-hold-guard.js"
|
||||
|
||||
const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
|
||||
const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
|
||||
const ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS = 24 * 60 * 60 * 1000;
|
||||
export const ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS = 60 * 60 * 1000;
|
||||
export const ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS = 4 * 60 * 60 * 1000;
|
||||
export const ACTIVE_RUN_OUTPUT_CONTINUE_REARM_MS = 30 * 60 * 1000;
|
||||
@@ -1857,18 +1863,115 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
||||
return result;
|
||||
}
|
||||
|
||||
async function isLivenessFindingOldEnoughForAutoRecovery(finding: IssueLivenessFinding, now = new Date()) {
|
||||
const issueIds = [...new Set(finding.dependencyPath.map((entry) => entry.issueId))];
|
||||
if (issueIds.length === 0) return false;
|
||||
function normalizeIssueGraphLivenessAutoRecoveryLookbackHours(raw: unknown) {
|
||||
const numeric = Math.floor(asNumber(raw, DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS));
|
||||
return Math.min(
|
||||
MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||||
Math.max(MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, numeric),
|
||||
);
|
||||
}
|
||||
|
||||
function livenessDependencyIssueKey(companyId: string, issueId: string) {
|
||||
return `${companyId}:${issueId}`;
|
||||
}
|
||||
|
||||
async function loadLivenessDependencyUpdatedAtByIssue(findings: IssueLivenessFinding[]) {
|
||||
const issueIds = [
|
||||
...new Set(
|
||||
findings.flatMap((finding) => finding.dependencyPath.map((entry) => entry.issueId)),
|
||||
),
|
||||
];
|
||||
if (issueIds.length === 0) return new Map<string, Date>();
|
||||
const rows = await db
|
||||
.select({ id: issues.id, updatedAt: issues.updatedAt })
|
||||
.select({ id: issues.id, companyId: issues.companyId, updatedAt: issues.updatedAt })
|
||||
.from(issues)
|
||||
.where(and(eq(issues.companyId, finding.companyId), inArray(issues.id, issueIds)));
|
||||
if (rows.length !== issueIds.length) return false;
|
||||
const latestUpdatedAt = rows.reduce((latest, row) =>
|
||||
row.updatedAt > latest ? row.updatedAt : latest,
|
||||
rows[0]!.updatedAt);
|
||||
return now.getTime() - latestUpdatedAt.getTime() >= ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_MIN_STALE_MS;
|
||||
.where(inArray(issues.id, issueIds));
|
||||
return new Map(rows.map((row) => [
|
||||
livenessDependencyIssueKey(row.companyId, row.id),
|
||||
row.updatedAt,
|
||||
]));
|
||||
}
|
||||
|
||||
function latestDependencyUpdatedAtForLivenessFinding(
|
||||
finding: IssueLivenessFinding,
|
||||
updatedAtByIssueKey: Map<string, Date>,
|
||||
) {
|
||||
const dependencyIssueIds = [...new Set(finding.dependencyPath.map((entry) => entry.issueId))];
|
||||
if (dependencyIssueIds.length === 0) return null;
|
||||
const timestamps = dependencyIssueIds.map((issueId) =>
|
||||
updatedAtByIssueKey.get(livenessDependencyIssueKey(finding.companyId, issueId)) ?? null
|
||||
);
|
||||
if (timestamps.some((timestamp) => !timestamp)) return null;
|
||||
const [firstTimestamp, ...remainingTimestamps] = timestamps as Date[];
|
||||
return remainingTimestamps.reduce((latest, updatedAt) =>
|
||||
updatedAt > latest ? updatedAt : latest,
|
||||
firstTimestamp!);
|
||||
}
|
||||
|
||||
function isLivenessFindingInsideAutoRecoveryLookback(
|
||||
finding: IssueLivenessFinding,
|
||||
cutoff: Date,
|
||||
updatedAtByIssueKey: Map<string, Date>,
|
||||
) {
|
||||
const latestUpdatedAt = latestDependencyUpdatedAtForLivenessFinding(finding, updatedAtByIssueKey);
|
||||
return Boolean(latestUpdatedAt && latestUpdatedAt >= cutoff);
|
||||
}
|
||||
|
||||
async function buildIssueGraphLivenessAutoRecoveryPreview(
|
||||
opts?: { lookbackHours?: number; now?: Date },
|
||||
): Promise<IssueGraphLivenessAutoRecoveryPreview> {
|
||||
const now = opts?.now ?? new Date();
|
||||
const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours(opts?.lookbackHours);
|
||||
const cutoff = new Date(now.getTime() - lookbackHours * 60 * 60 * 1000);
|
||||
const findings = await collectIssueGraphLivenessFindings();
|
||||
const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings);
|
||||
const issueIds = [...new Set(findings.map((finding) => finding.recoveryIssueId))];
|
||||
const recoveryRows = issueIds.length > 0
|
||||
? await db
|
||||
.select({ id: issues.id, identifier: issues.identifier, title: issues.title })
|
||||
.from(issues)
|
||||
.where(inArray(issues.id, issueIds))
|
||||
: [];
|
||||
const recoveryById = new Map(recoveryRows.map((row) => [row.id, row]));
|
||||
const items: IssueGraphLivenessAutoRecoveryPreviewItem[] = [];
|
||||
let skippedOutsideLookback = 0;
|
||||
|
||||
for (const finding of findings) {
|
||||
const latestDependencyUpdatedAt = latestDependencyUpdatedAtForLivenessFinding(
|
||||
finding,
|
||||
updatedAtByIssueKey,
|
||||
);
|
||||
if (!latestDependencyUpdatedAt || latestDependencyUpdatedAt < cutoff) {
|
||||
skippedOutsideLookback += 1;
|
||||
continue;
|
||||
}
|
||||
const recoveryIssue = recoveryById.get(finding.recoveryIssueId);
|
||||
items.push({
|
||||
issueId: finding.issueId,
|
||||
identifier: finding.identifier,
|
||||
title: finding.dependencyPath[0]?.title ?? finding.identifier ?? finding.issueId,
|
||||
state: finding.state,
|
||||
severity: finding.severity,
|
||||
reason: finding.reason,
|
||||
recoveryIssueId: finding.recoveryIssueId,
|
||||
recoveryIdentifier: recoveryIssue?.identifier ?? null,
|
||||
recoveryTitle: recoveryIssue?.title ?? null,
|
||||
recommendedOwnerAgentId: finding.recommendedOwnerAgentId,
|
||||
incidentKey: finding.incidentKey,
|
||||
latestDependencyUpdatedAt: latestDependencyUpdatedAt.toISOString(),
|
||||
dependencyPath: finding.dependencyPath,
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
lookbackHours,
|
||||
cutoff: cutoff.toISOString(),
|
||||
generatedAt: now.toISOString(),
|
||||
findings: findings.length,
|
||||
recoverableFindings: items.length,
|
||||
skippedOutsideLookback,
|
||||
items,
|
||||
};
|
||||
}
|
||||
|
||||
async function resolveEscalationOwnerAgentId(
|
||||
@@ -2131,22 +2234,34 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
||||
return { kind: "created" as const, escalationIssueId: escalation.id };
|
||||
}
|
||||
|
||||
async function reconcileIssueGraphLiveness(opts?: { runId?: string | null }) {
|
||||
async function reconcileIssueGraphLiveness(opts?: {
|
||||
runId?: string | null;
|
||||
force?: boolean;
|
||||
lookbackHours?: number;
|
||||
}) {
|
||||
const findings = await collectIssueGraphLivenessFindings();
|
||||
const experimentalSettings = await instanceSettings.getExperimental();
|
||||
const autoRecoveryEnabled = asBoolean(
|
||||
experimentalSettings.enableIssueGraphLivenessAutoRecovery,
|
||||
false,
|
||||
true,
|
||||
) || opts?.force === true;
|
||||
const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours(
|
||||
opts?.lookbackHours ?? experimentalSettings.issueGraphLivenessAutoRecoveryLookbackHours,
|
||||
);
|
||||
const now = new Date();
|
||||
const cutoff = new Date(now.getTime() - lookbackHours * 60 * 60 * 1000);
|
||||
const obsoleteRecoveryCleanup = await retireObsoleteLivenessRecoveryIssues(findings);
|
||||
const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings);
|
||||
const result = {
|
||||
findings: findings.length,
|
||||
autoRecoveryEnabled,
|
||||
lookbackHours,
|
||||
cutoff: cutoff.toISOString(),
|
||||
escalationsCreated: 0,
|
||||
existingEscalations: 0,
|
||||
skipped: 0,
|
||||
skippedAutoRecoveryDisabled: 0,
|
||||
skippedAutoRecoveryTooYoung: 0,
|
||||
skippedOutsideLookback: 0,
|
||||
obsoleteRecoveriesRetired: obsoleteRecoveryCleanup.retired,
|
||||
obsoleteRecoveriesActiveSkipped: obsoleteRecoveryCleanup.activeSkipped,
|
||||
obsoleteRecoveryBlockerRelationsRemoved: obsoleteRecoveryCleanup.blockerRelationsRemoved,
|
||||
@@ -2160,10 +2275,9 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
||||
return result;
|
||||
}
|
||||
|
||||
const now = new Date();
|
||||
for (const finding of findings) {
|
||||
if (!await isLivenessFindingOldEnoughForAutoRecovery(finding, now)) {
|
||||
result.skippedAutoRecoveryTooYoung += 1;
|
||||
if (!isLivenessFindingInsideAutoRecoveryLookback(finding, cutoff, updatedAtByIssueKey)) {
|
||||
result.skippedOutsideLookback += 1;
|
||||
result.skipped += 1;
|
||||
continue;
|
||||
}
|
||||
@@ -2197,6 +2311,7 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
||||
recordWatchdogDecision,
|
||||
scanSilentActiveRuns,
|
||||
reconcileStrandedAssignedIssues,
|
||||
buildIssueGraphLivenessAutoRecoveryPreview,
|
||||
reconcileIssueGraphLiveness,
|
||||
readRecoveryTimerIntervalMs,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user