forked from farhoodlabs/paperclip
911a1e8b0d
## Thinking Path > - Paperclip orchestrates AI agents for zero-human companies. > - The recovery subsystem is responsible for keeping assigned work moving when a live heartbeat run disappears or fails. > - `continuation_recovery` is the path that re-enqueues stranded `in_progress` issues after an interrupted continuation attempt. > - That path recently gained cause-aware retry classes and transient retry caps, but the streak counter was still aggregating mixed failure causes into one retry history. > - That meant a sequence like `timeout -> timeout -> adapter_failed -> adapter_failed` could escalate as a false `3x adapter_failed` streak even though the latest cause had only happened twice. > - This pull request makes continuation retry streaks count only consecutive failures whose `errorCode` matches the latest run and adds a regression test for the mixed-cause case. > - The benefit is that transient retry backoff and escalation now match the actual current failure cause instead of inheriting stale budget from unrelated failures. ## What Changed - Updated `summarizeRecentContinuationRetries(...)` to stop counting as soon as the continuation failure cause no longer matches the latest run's `errorCode`. - Wired the continuation recovery escalation/backoff path to pass the latest classified `errorCode` into the retry streak summarizer. - Added a regression test proving mixed-cause continuation failures do not consume the transient retry cap for a new failure cause. ## Verification - `pnpm exec vitest run server/src/__tests__/heartbeat-process-recovery.test.ts` ## Risks - Low risk. The behavioral change is intentionally narrow, but any future continuation retry modes that rely on `errorCode = null` will now be counted as a separate streak bucket and should be kept in mind when adding new retry classifications. ## Model Used - OpenAI Codex via Paperclip `codex_local` (GPT-5-based Codex coding agent; exact backend revision is not surfaced in the runtime), with tool use, shell execution, and patch application in the local repository. ## Checklist - [x] I have included a thinking path that traces from project context to this change - [x] I have specified the model used (with version and capability details) - [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work - [x] I have run tests locally and they pass - [x] I have added or updated tests where applicable - [ ] If this change affects the UI, I have included before/after screenshots - [ ] I have updated relevant documentation to reflect my changes - [x] I have considered and documented any risks above - [x] I will address all Greptile and reviewer comments before requesting merge --------- Co-authored-by: Paperclip <noreply@paperclip.ing>
3599 lines
131 KiB
TypeScript
3599 lines
131 KiB
TypeScript
import { and, asc, desc, eq, gt, gte, inArray, isNull, notInArray, sql } from "drizzle-orm";
|
||
import type { Db } from "@paperclipai/db";
|
||
import {
|
||
DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||
MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||
MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||
type IssueGraphLivenessAutoRecoveryPreview,
|
||
type IssueGraphLivenessAutoRecoveryPreviewItem,
|
||
} from "@paperclipai/shared";
|
||
import {
|
||
agents,
|
||
agentWakeupRequests,
|
||
approvals,
|
||
activityLog,
|
||
companies,
|
||
heartbeatRunEvents,
|
||
heartbeatRunWatchdogDecisions,
|
||
heartbeatRuns,
|
||
issueComments,
|
||
issueApprovals,
|
||
issueRecoveryActions,
|
||
issueRelations,
|
||
issueThreadInteractions,
|
||
issues,
|
||
} from "@paperclipai/db";
|
||
import { parseObject, asBoolean, asNumber } from "../../adapters/utils.js";
|
||
import { runningProcesses } from "../../adapters/index.js";
|
||
import { forbidden, notFound } from "../../errors.js";
|
||
import { logger } from "../../middleware/logger.js";
|
||
import { isPidAlive, isProcessGroupAlive, terminateLocalService } from "../local-service-supervisor.js";
|
||
import { redactCurrentUserText } from "../../log-redaction.js";
|
||
import { redactSensitiveText } from "../../redaction.js";
|
||
import { logActivity } from "../activity-log.js";
|
||
import { budgetService } from "../budgets.js";
|
||
import { instanceSettingsService } from "../instance-settings.js";
|
||
import { issueRecoveryActionService } from "../issue-recovery-actions.js";
|
||
import { issueTreeControlService } from "../issue-tree-control.js";
|
||
import { issueService } from "../issues.js";
|
||
import { getRunLogStore } from "../run-log-store.js";
|
||
import {
|
||
DEFAULT_MAX_SUCCESSFUL_RUN_HANDOFF_ATTEMPTS,
|
||
FINISH_SUCCESSFUL_RUN_HANDOFF_REASON,
|
||
SUCCESSFUL_RUN_MISSING_STATE_REASON,
|
||
buildSuccessfulRunHandoffExhaustedNotice,
|
||
noticeMetadataReferencesRecoveryAction,
|
||
type SuccessfulRunHandoffNotice,
|
||
} from "./successful-run-handoff.js";
|
||
import {
|
||
RECOVERY_ORIGIN_KINDS,
|
||
buildIssueGraphLivenessLeafKey,
|
||
isStrandedIssueRecoveryOriginKind,
|
||
parseIssueGraphLivenessIncidentKey,
|
||
} from "./origins.js";
|
||
import {
|
||
classifyIssueGraphLiveness,
|
||
type IssueLivenessFinding,
|
||
} from "./issue-graph-liveness.js";
|
||
import {
|
||
recoveryAssigneeAdapterOverrides,
|
||
withRecoveryModelProfileHint,
|
||
} from "./model-profile-hint.js";
|
||
import { isAutomaticRecoverySuppressedByPauseHold } from "./pause-hold-guard.js";
|
||
|
||
const EXECUTION_PATH_HEARTBEAT_RUN_STATUSES = ["queued", "running", "scheduled_retry"] as const;
|
||
const UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES = ["failed", "cancelled", "timed_out"] as const;
|
||
export const ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS = 60 * 60 * 1000;
|
||
export const ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS = 4 * 60 * 60 * 1000;
|
||
export const ACTIVE_RUN_OUTPUT_CONTINUE_REARM_MS = 30 * 60 * 1000;
|
||
const ACTIVE_RUN_OUTPUT_EVIDENCE_TAIL_BYTES = 8 * 1024;
|
||
const STRANDED_ISSUE_RECOVERY_ORIGIN_KIND = RECOVERY_ORIGIN_KINDS.strandedIssueRecovery;
|
||
const STALE_ACTIVE_RUN_EVALUATION_ORIGIN_KIND = RECOVERY_ORIGIN_KINDS.staleActiveRunEvaluation;
|
||
const DEFERRED_WAKE_CONTEXT_KEY = "_paperclipWakeContext";
|
||
const SESSIONED_LOCAL_ADAPTERS = new Set([
|
||
"claude_local",
|
||
"codex_local",
|
||
"cursor",
|
||
"gemini_local",
|
||
"hermes_local",
|
||
"opencode_local",
|
||
"pi_local",
|
||
]);
|
||
|
||
type RecoveryWakeupOptions = {
|
||
source?: "timer" | "assignment" | "on_demand" | "automation";
|
||
triggerDetail?: "manual" | "ping" | "callback" | "system";
|
||
reason?: string | null;
|
||
payload?: Record<string, unknown> | null;
|
||
idempotencyKey?: string | null;
|
||
requestedByActorType?: "user" | "agent" | "system";
|
||
requestedByActorId?: string | null;
|
||
contextSnapshot?: Record<string, unknown>;
|
||
};
|
||
|
||
type RecoveryWakeup = (
|
||
agentId: string,
|
||
opts?: RecoveryWakeupOptions,
|
||
) => Promise<typeof heartbeatRuns.$inferSelect | null>;
|
||
|
||
type LatestIssueRun = Pick<
|
||
typeof heartbeatRuns.$inferSelect,
|
||
"id" | "agentId" | "status" | "error" | "errorCode" | "contextSnapshot" | "livenessState"
|
||
> | null;
|
||
type SuccessfulLatestIssueRun = NonNullable<LatestIssueRun> & { status: "succeeded" };
|
||
|
||
type StrandedRecoveryCause = "stranded_assigned_issue" | typeof SUCCESSFUL_RUN_MISSING_STATE_REASON;
|
||
|
||
type SuccessfulRunHandoffRecoveryEvidence = {
|
||
sourceRunId: string | null;
|
||
correctiveRunId: string;
|
||
missingDisposition: string;
|
||
handoffAttempt: number;
|
||
maxHandoffAttempts: number;
|
||
};
|
||
|
||
type WatchdogDecisionActor =
|
||
| { type: "board"; userId?: string | null; runId?: string | null }
|
||
| { type: "agent"; agentId?: string | null; runId?: string | null }
|
||
| { type: "none" };
|
||
|
||
export type RunOutputSilenceSummary = {
|
||
lastOutputAt: Date | null;
|
||
lastOutputSeq: number;
|
||
lastOutputStream: "stdout" | "stderr" | null;
|
||
silenceStartedAt: Date | null;
|
||
silenceAgeMs: number | null;
|
||
level: "not_applicable" | "ok" | "suspicious" | "critical" | "snoozed";
|
||
suspicionThresholdMs: number;
|
||
criticalThresholdMs: number;
|
||
snoozedUntil: Date | null;
|
||
evaluationIssueId: string | null;
|
||
evaluationIssueIdentifier: string | null;
|
||
evaluationIssueAssigneeAgentId: string | null;
|
||
};
|
||
|
||
function readNonEmptyString(value: unknown): string | null {
|
||
return typeof value === "string" && value.trim().length > 0 ? value : null;
|
||
}
|
||
|
||
function summarizeRunFailureForIssueComment(run: LatestIssueRun) {
|
||
if (!run) return null;
|
||
|
||
if (readNonEmptyString(run.error) || readNonEmptyString(run.errorCode)) {
|
||
return " Latest retry failure details were withheld from the issue thread; inspect the linked run for evidence.";
|
||
}
|
||
return null;
|
||
}
|
||
|
||
function didAutomaticRecoveryFail(
|
||
latestRun: LatestIssueRun,
|
||
expectedRetryReason: "assignment_recovery" | "issue_continuation_needed",
|
||
) {
|
||
if (!latestRun) return false;
|
||
|
||
const latestContext = parseObject(latestRun.contextSnapshot);
|
||
const latestRetryReason = readNonEmptyString(latestContext.retryReason);
|
||
return latestRetryReason === expectedRetryReason &&
|
||
UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
|
||
latestRun.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
|
||
);
|
||
}
|
||
|
||
const TRANSIENT_INFRA_CONTINUATION_ERROR_CODES = new Set<string>([
|
||
"adapter_failed",
|
||
"codex_transient_upstream",
|
||
"claude_transient_upstream",
|
||
"timeout",
|
||
]);
|
||
|
||
const NON_RETRYABLE_CONTINUATION_ERROR_CODES = new Set<string>([
|
||
"agent_not_invokable",
|
||
"agent_not_found",
|
||
"budget_blocked",
|
||
"budget_exhausted",
|
||
"issue_paused",
|
||
"issue_dependencies_blocked",
|
||
]);
|
||
|
||
const CONTINUATION_RECOVERY_TRANSIENT_MAX_ATTEMPTS = 3;
|
||
const CONTINUATION_RECOVERY_DEFAULT_MAX_ATTEMPTS = 1;
|
||
const CONTINUATION_RECOVERY_TRANSIENT_BASE_BACKOFF_MS = 60_000;
|
||
|
||
type ContinuationRetryClassification = {
|
||
kind: "transient_infra" | "non_retryable" | "default";
|
||
maxAttempts: number;
|
||
baseBackoffMs: number;
|
||
errorCode: string | null;
|
||
};
|
||
|
||
function classifyContinuationFailure(latestRun: LatestIssueRun): ContinuationRetryClassification {
|
||
const errorCode = readNonEmptyString(latestRun?.errorCode);
|
||
if (errorCode && NON_RETRYABLE_CONTINUATION_ERROR_CODES.has(errorCode)) {
|
||
return { kind: "non_retryable", maxAttempts: 0, baseBackoffMs: 0, errorCode };
|
||
}
|
||
if (errorCode && TRANSIENT_INFRA_CONTINUATION_ERROR_CODES.has(errorCode)) {
|
||
return {
|
||
kind: "transient_infra",
|
||
maxAttempts: CONTINUATION_RECOVERY_TRANSIENT_MAX_ATTEMPTS,
|
||
baseBackoffMs: CONTINUATION_RECOVERY_TRANSIENT_BASE_BACKOFF_MS,
|
||
errorCode,
|
||
};
|
||
}
|
||
return {
|
||
kind: "default",
|
||
maxAttempts: CONTINUATION_RECOVERY_DEFAULT_MAX_ATTEMPTS,
|
||
baseBackoffMs: 0,
|
||
errorCode,
|
||
};
|
||
}
|
||
|
||
function successfulRunHandoffRecoveryEvidence(latestRun: LatestIssueRun): SuccessfulRunHandoffRecoveryEvidence | null {
|
||
if (!latestRun) return null;
|
||
|
||
const context = parseObject(latestRun.contextSnapshot);
|
||
const wakeReason = readNonEmptyString(context.wakeReason);
|
||
const handoffReason = readNonEmptyString(context.handoffReason);
|
||
const isSuccessfulRunHandoff =
|
||
wakeReason === FINISH_SUCCESSFUL_RUN_HANDOFF_REASON ||
|
||
handoffReason === SUCCESSFUL_RUN_MISSING_STATE_REASON ||
|
||
asBoolean(context.handoffRequired, false) === true;
|
||
if (!isSuccessfulRunHandoff) return null;
|
||
|
||
const handoffAttempt = asNumber(context.handoffAttempt, 1);
|
||
const maxHandoffAttempts = asNumber(
|
||
context.maxHandoffAttempts,
|
||
DEFAULT_MAX_SUCCESSFUL_RUN_HANDOFF_ATTEMPTS,
|
||
);
|
||
return {
|
||
sourceRunId: readNonEmptyString(context.sourceRunId) ?? readNonEmptyString(context.resumeFromRunId),
|
||
correctiveRunId: latestRun.id,
|
||
missingDisposition: readNonEmptyString(context.missingDisposition) ?? "clear_next_step",
|
||
handoffAttempt,
|
||
maxHandoffAttempts,
|
||
};
|
||
}
|
||
|
||
function isExhaustedSuccessfulRunHandoff(latestRun: LatestIssueRun) {
|
||
const evidence = successfulRunHandoffRecoveryEvidence(latestRun);
|
||
if (!evidence) return null;
|
||
if (evidence.handoffAttempt < evidence.maxHandoffAttempts) return { ...evidence, exhausted: false };
|
||
return { ...evidence, exhausted: true };
|
||
}
|
||
|
||
function issueIdFromRunContext(contextSnapshot: unknown) {
|
||
const context = parseObject(contextSnapshot);
|
||
return readNonEmptyString(context.issueId) ?? readNonEmptyString(context.taskId);
|
||
}
|
||
|
||
function issueIdFromWakePayload(payload: unknown) {
|
||
const parsed = parseObject(payload);
|
||
const nestedContext = parseObject(parsed[DEFERRED_WAKE_CONTEXT_KEY]);
|
||
return readNonEmptyString(parsed.issueId) ??
|
||
readNonEmptyString(nestedContext.issueId) ??
|
||
readNonEmptyString(nestedContext.taskId);
|
||
}
|
||
|
||
function issueUiLink(issue: { identifier: string | null; id: string }, prefix: string) {
|
||
const label = issue.identifier ?? issue.id;
|
||
return `[${label}](/${prefix}/issues/${label})`;
|
||
}
|
||
|
||
function runUiLink(run: { id: string; agentId: string }, prefix: string) {
|
||
return `[${run.id}](/${prefix}/agents/${run.agentId}/runs/${run.id})`;
|
||
}
|
||
|
||
function agentUiLink(agent: { id: string; name: string | null } | null, prefix: string) {
|
||
if (!agent) return "unknown";
|
||
return `[${agent.name ?? agent.id}](/${prefix}/agents/${agent.id})`;
|
||
}
|
||
|
||
function formatDuration(ms: number | null) {
|
||
if (ms === null) return "unknown";
|
||
const minutes = Math.floor(ms / 60_000);
|
||
if (minutes < 60) return `${minutes}m`;
|
||
const hours = Math.floor(minutes / 60);
|
||
const remainingMinutes = minutes % 60;
|
||
return remainingMinutes > 0 ? `${hours}h ${remainingMinutes}m` : `${hours}h`;
|
||
}
|
||
|
||
function formatIssueLinksForComment(relations: Array<{ identifier?: string | null }>) {
|
||
const identifiers = [
|
||
...new Set(
|
||
relations
|
||
.map((relation) => relation.identifier)
|
||
.filter((identifier): identifier is string => Boolean(identifier)),
|
||
),
|
||
];
|
||
if (identifiers.length === 0) return "another open issue";
|
||
return identifiers
|
||
.slice(0, 5)
|
||
.map((identifier) => {
|
||
const prefix = identifier.split("-")[0] || "PAP";
|
||
return `[${identifier}](/${prefix}/issues/${identifier})`;
|
||
})
|
||
.join(", ");
|
||
}
|
||
|
||
function unwrapDatabaseConflictError(error: unknown) {
|
||
if (!error || typeof error !== "object") return null;
|
||
|
||
const candidate = error as {
|
||
code?: string;
|
||
constraint?: string;
|
||
constraint_name?: string;
|
||
message?: string;
|
||
cause?: unknown;
|
||
};
|
||
|
||
if (
|
||
typeof candidate.code === "string" ||
|
||
typeof candidate.constraint === "string" ||
|
||
typeof candidate.constraint_name === "string"
|
||
) {
|
||
return candidate;
|
||
}
|
||
|
||
const cause = candidate.cause;
|
||
if (!cause || typeof cause !== "object") return candidate;
|
||
|
||
return cause as {
|
||
code?: string;
|
||
constraint?: string;
|
||
constraint_name?: string;
|
||
message?: string;
|
||
};
|
||
}
|
||
|
||
function isAgentInvokable(agent: typeof agents.$inferSelect | null | undefined) {
|
||
return Boolean(agent && !["paused", "terminated", "pending_approval"].includes(agent.status));
|
||
}
|
||
|
||
function isStrandedIssueRecoveryIssue(issue: Pick<typeof issues.$inferSelect, "originKind">) {
|
||
return isStrandedIssueRecoveryOriginKind(issue.originKind);
|
||
}
|
||
|
||
function isUnsuccessfulTerminalIssueRun(latestRun: LatestIssueRun) {
|
||
return Boolean(
|
||
latestRun &&
|
||
UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
|
||
latestRun.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
|
||
),
|
||
);
|
||
}
|
||
|
||
function isSuccessfulInProgressContinuationRun(latestRun: LatestIssueRun): latestRun is SuccessfulLatestIssueRun {
|
||
return latestRun?.status === "succeeded";
|
||
}
|
||
|
||
function isProductiveContinuationRun(latestRun: LatestIssueRun) {
|
||
return latestRun?.status === "succeeded" &&
|
||
(latestRun.livenessState === "advanced" ||
|
||
latestRun.livenessState === "completed" ||
|
||
latestRun.livenessState === "blocked" ||
|
||
latestRun.livenessState === "needs_followup");
|
||
}
|
||
|
||
function isRepeatedProductiveContinuationRecovery(latestRun: SuccessfulLatestIssueRun) {
|
||
const latestContext = parseObject(latestRun.contextSnapshot);
|
||
return readNonEmptyString(latestContext.retryReason) === "issue_continuation_needed" &&
|
||
readNonEmptyString(latestContext.source) === "issue.productive_terminal_continuation_recovery" &&
|
||
isProductiveContinuationRun(latestRun);
|
||
}
|
||
|
||
function parseLivenessIncidentKey(incidentKey: string | null | undefined) {
|
||
if (!incidentKey) return null;
|
||
return parseIssueGraphLivenessIncidentKey(incidentKey);
|
||
}
|
||
|
||
function livenessRecoveryLeafIssueId(finding: IssueLivenessFinding) {
|
||
return finding.recoveryIssueId;
|
||
}
|
||
|
||
function livenessRecoveryLeafFingerprint(finding: IssueLivenessFinding) {
|
||
return buildIssueGraphLivenessLeafKey({
|
||
companyId: finding.companyId,
|
||
state: finding.state,
|
||
leafIssueId: livenessRecoveryLeafIssueId(finding),
|
||
});
|
||
}
|
||
|
||
function livenessRecoveryLeafKey(companyId: string, state: string, leafIssueId: string) {
|
||
return buildIssueGraphLivenessLeafKey({ companyId, state, leafIssueId });
|
||
}
|
||
|
||
function isUniqueLivenessRecoveryConflict(error: unknown) {
|
||
if (!error || typeof error !== "object") return false;
|
||
const maybe = error as { code?: string; constraint?: string; message?: string };
|
||
return maybe.code === "23505" &&
|
||
(
|
||
maybe.constraint === "issues_active_liveness_recovery_incident_uq" ||
|
||
maybe.constraint === "issues_active_liveness_recovery_leaf_uq" ||
|
||
typeof maybe.message === "string" &&
|
||
(
|
||
maybe.message.includes("issues_active_liveness_recovery_incident_uq") ||
|
||
maybe.message.includes("issues_active_liveness_recovery_leaf_uq")
|
||
)
|
||
);
|
||
}
|
||
|
||
function formatDependencyPath(finding: IssueLivenessFinding) {
|
||
return finding.dependencyPath
|
||
.map((entry) => entry.identifier ?? entry.issueId)
|
||
.join(" -> ");
|
||
}
|
||
|
||
function buildLivenessEscalationDescription(finding: IssueLivenessFinding) {
|
||
const source = finding.dependencyPath[0];
|
||
const recovery = finding.dependencyPath.find((entry) => entry.issueId === finding.recoveryIssueId);
|
||
const selectedOwner = finding.recommendedOwnerAgentId ?? "none";
|
||
|
||
return [
|
||
"Paperclip detected a harness-level issue graph liveness incident.",
|
||
"",
|
||
"## Source",
|
||
"",
|
||
`- Source issue: ${source?.identifier ?? source?.issueId ?? finding.issueId}`,
|
||
`- Recovery target issue: ${recovery?.identifier ?? recovery?.issueId ?? finding.recoveryIssueId}`,
|
||
`- Incident key: \`${finding.incidentKey}\``,
|
||
`- Detected invariant: \`${finding.state}\``,
|
||
`- Dependency path: ${formatDependencyPath(finding)}`,
|
||
`- Reason: ${finding.reason}`,
|
||
"",
|
||
"## Ownership",
|
||
"",
|
||
`- Selected owner agent: \`${selectedOwner}\``,
|
||
`- Candidate owner agents: ${finding.recommendedOwnerCandidateAgentIds.length > 0 ? finding.recommendedOwnerCandidateAgentIds.map((id) => `\`${id}\``).join(", ") : "none"}`,
|
||
"",
|
||
"## Next Action",
|
||
"",
|
||
finding.recommendedAction,
|
||
"",
|
||
"Resolve the blocked chain, then mark this escalation issue done so the original issue can resume when all blockers are cleared.",
|
||
].join("\n");
|
||
}
|
||
|
||
function buildLivenessOriginalIssueComment(finding: IssueLivenessFinding, escalation: typeof issues.$inferSelect) {
|
||
return [
|
||
"Paperclip detected a harness-level liveness incident in this issue's dependency graph.",
|
||
"",
|
||
`- Escalation issue: ${escalation.identifier ?? escalation.id}`,
|
||
`- Incident key: \`${finding.incidentKey}\``,
|
||
`- Finding: \`${finding.state}\``,
|
||
`- Dependency path: ${formatDependencyPath(finding)}`,
|
||
`- Reason: ${finding.reason}`,
|
||
`- Manager action requested: ${finding.recommendedAction}`,
|
||
"",
|
||
"This issue now keeps its existing blockers and is also blocked by the escalation issue so dependency wakeups remain explicit.",
|
||
].join("\n");
|
||
}
|
||
|
||
export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup }) {
|
||
const issuesSvc = issueService(db);
|
||
const recoveryActionsSvc = issueRecoveryActionService(db);
|
||
const treeControlSvc = issueTreeControlService(db);
|
||
const budgets = budgetService(db);
|
||
const instanceSettings = instanceSettingsService(db);
|
||
const runLogStore = getRunLogStore();
|
||
|
||
const getCurrentUserRedactionOptions = async () => ({
|
||
enabled: (await instanceSettings.getGeneral()).censorUsernameInLogs,
|
||
});
|
||
|
||
async function getAgent(agentId: string) {
|
||
return db.select().from(agents).where(eq(agents.id, agentId)).then((rows) => rows[0] ?? null);
|
||
}
|
||
|
||
async function getLatestIssueRun(companyId: string, issueId: string): Promise<LatestIssueRun> {
|
||
return db
|
||
.select({
|
||
id: heartbeatRuns.id,
|
||
agentId: heartbeatRuns.agentId,
|
||
status: heartbeatRuns.status,
|
||
error: heartbeatRuns.error,
|
||
errorCode: heartbeatRuns.errorCode,
|
||
contextSnapshot: heartbeatRuns.contextSnapshot,
|
||
livenessState: heartbeatRuns.livenessState,
|
||
})
|
||
.from(heartbeatRuns)
|
||
.where(
|
||
and(
|
||
eq(heartbeatRuns.companyId, companyId),
|
||
sql`${heartbeatRuns.contextSnapshot} ->> 'issueId' = ${issueId}`,
|
||
),
|
||
)
|
||
.orderBy(desc(heartbeatRuns.createdAt), desc(heartbeatRuns.id))
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null);
|
||
}
|
||
|
||
async function summarizeRecentContinuationRetries(
|
||
companyId: string,
|
||
issueId: string,
|
||
errorCodeToMatch: string | null,
|
||
) {
|
||
const rows = await db
|
||
.select({
|
||
id: heartbeatRuns.id,
|
||
status: heartbeatRuns.status,
|
||
errorCode: heartbeatRuns.errorCode,
|
||
contextSnapshot: heartbeatRuns.contextSnapshot,
|
||
finishedAt: heartbeatRuns.finishedAt,
|
||
})
|
||
.from(heartbeatRuns)
|
||
.where(
|
||
and(
|
||
eq(heartbeatRuns.companyId, companyId),
|
||
sql`${heartbeatRuns.contextSnapshot} ->> 'issueId' = ${issueId}`,
|
||
),
|
||
)
|
||
.orderBy(desc(heartbeatRuns.createdAt), desc(heartbeatRuns.id))
|
||
.limit(10);
|
||
|
||
let consecutive = 0;
|
||
let latestFinishedAt: Date | null = null;
|
||
for (const row of rows) {
|
||
const ctx = parseObject(row.contextSnapshot);
|
||
const retryReason = readNonEmptyString(ctx.retryReason);
|
||
if (retryReason !== "issue_continuation_needed") break;
|
||
if (
|
||
!UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
|
||
row.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
|
||
)
|
||
) {
|
||
break;
|
||
}
|
||
|
||
const rowErrorCode = readNonEmptyString(row.errorCode);
|
||
if (errorCodeToMatch !== rowErrorCode) {
|
||
break;
|
||
}
|
||
|
||
consecutive += 1;
|
||
if (latestFinishedAt === null) latestFinishedAt = row.finishedAt ?? null;
|
||
}
|
||
return { consecutive, latestFinishedAt };
|
||
}
|
||
|
||
async function hasActiveExecutionPath(companyId: string, issueId: string) {
|
||
const [run, deferredWake] = await Promise.all([
|
||
db
|
||
.select({ id: heartbeatRuns.id })
|
||
.from(heartbeatRuns)
|
||
.where(
|
||
and(
|
||
eq(heartbeatRuns.companyId, companyId),
|
||
inArray(heartbeatRuns.status, [...EXECUTION_PATH_HEARTBEAT_RUN_STATUSES]),
|
||
sql`${heartbeatRuns.contextSnapshot} ->> 'issueId' = ${issueId}`,
|
||
),
|
||
)
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null),
|
||
db
|
||
.select({ id: agentWakeupRequests.id })
|
||
.from(agentWakeupRequests)
|
||
.where(
|
||
and(
|
||
eq(agentWakeupRequests.companyId, companyId),
|
||
eq(agentWakeupRequests.status, "deferred_issue_execution"),
|
||
sql`${agentWakeupRequests.payload} ->> 'issueId' = ${issueId}`,
|
||
),
|
||
)
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null),
|
||
]);
|
||
|
||
return Boolean(run || deferredWake);
|
||
}
|
||
|
||
async function hasQueuedIssueWake(companyId: string, issueId: string) {
|
||
return db
|
||
.select({ id: agentWakeupRequests.id })
|
||
.from(agentWakeupRequests)
|
||
.where(
|
||
and(
|
||
eq(agentWakeupRequests.companyId, companyId),
|
||
eq(agentWakeupRequests.status, "queued"),
|
||
sql`${agentWakeupRequests.payload} ->> 'issueId' = ${issueId}`,
|
||
),
|
||
)
|
||
.limit(1)
|
||
.then((rows) => Boolean(rows[0]));
|
||
}
|
||
|
||
async function enqueueStrandedIssueRecovery(input: {
|
||
issueId: string;
|
||
agentId: string;
|
||
reason: "issue_assignment_recovery" | "issue_continuation_needed";
|
||
retryReason: "assignment_recovery" | "issue_continuation_needed";
|
||
source: string;
|
||
retryOfRunId?: string | null;
|
||
}) {
|
||
const queued = await deps.enqueueWakeup(input.agentId, {
|
||
source: "automation",
|
||
triggerDetail: "system",
|
||
reason: input.reason,
|
||
payload: withRecoveryModelProfileHint({
|
||
issueId: input.issueId,
|
||
...(input.retryOfRunId ? { retryOfRunId: input.retryOfRunId } : {}),
|
||
}, "normal_model"),
|
||
requestedByActorType: "system",
|
||
requestedByActorId: null,
|
||
contextSnapshot: withRecoveryModelProfileHint({
|
||
issueId: input.issueId,
|
||
taskId: input.issueId,
|
||
wakeReason: input.reason,
|
||
retryReason: input.retryReason,
|
||
source: input.source,
|
||
...(input.retryOfRunId ? { retryOfRunId: input.retryOfRunId } : {}),
|
||
}, "normal_model"),
|
||
});
|
||
|
||
if (queued && input.retryOfRunId) {
|
||
return db
|
||
.update(heartbeatRuns)
|
||
.set({
|
||
retryOfRunId: input.retryOfRunId,
|
||
updatedAt: new Date(),
|
||
})
|
||
.where(eq(heartbeatRuns.id, queued.id))
|
||
.returning()
|
||
.then((rows) => rows[0] ?? queued);
|
||
}
|
||
|
||
return queued;
|
||
}
|
||
|
||
async function enqueueInitialAssignedTodoDispatch(issue: typeof issues.$inferSelect, agentId: string) {
|
||
return deps.enqueueWakeup(agentId, {
|
||
source: "assignment",
|
||
triggerDetail: "system",
|
||
reason: "issue_assigned",
|
||
payload: withRecoveryModelProfileHint({
|
||
issueId: issue.id,
|
||
mutation: "assigned_todo_liveness_dispatch",
|
||
}, "normal_model"),
|
||
requestedByActorType: "system",
|
||
requestedByActorId: null,
|
||
contextSnapshot: withRecoveryModelProfileHint({
|
||
issueId: issue.id,
|
||
taskId: issue.id,
|
||
wakeReason: "issue_assigned",
|
||
source: "issue.assigned_todo_liveness_dispatch",
|
||
}, "normal_model"),
|
||
});
|
||
}
|
||
|
||
async function isInvocationBudgetBlocked(issue: typeof issues.$inferSelect, agentId: string) {
|
||
const budgetBlock = await budgets.getInvocationBlock(issue.companyId, agentId, {
|
||
issueId: issue.id,
|
||
projectId: issue.projectId,
|
||
});
|
||
return Boolean(budgetBlock);
|
||
}
|
||
|
||
async function reconcileUnassignedBlockingIssues() {
|
||
const candidates = await db
|
||
.select({
|
||
id: issues.id,
|
||
companyId: issues.companyId,
|
||
identifier: issues.identifier,
|
||
status: issues.status,
|
||
createdByAgentId: issues.createdByAgentId,
|
||
})
|
||
.from(issueRelations)
|
||
.innerJoin(issues, eq(issueRelations.issueId, issues.id))
|
||
.where(
|
||
and(
|
||
eq(issueRelations.type, "blocks"),
|
||
inArray(issues.status, ["todo", "blocked"]),
|
||
isNull(issues.assigneeAgentId),
|
||
isNull(issues.assigneeUserId),
|
||
sql`${issues.createdByAgentId} is not null`,
|
||
sql`exists (
|
||
select 1
|
||
from issues blocked_issue
|
||
where blocked_issue.id = ${issueRelations.relatedIssueId}
|
||
and blocked_issue.company_id = ${issues.companyId}
|
||
and blocked_issue.status not in ('done', 'cancelled')
|
||
)`,
|
||
),
|
||
);
|
||
|
||
let assigned = 0;
|
||
let skipped = 0;
|
||
const issueIds: string[] = [];
|
||
const seen = new Set<string>();
|
||
|
||
for (const candidate of candidates) {
|
||
if (seen.has(candidate.id)) continue;
|
||
seen.add(candidate.id);
|
||
|
||
const creatorAgentId = candidate.createdByAgentId;
|
||
if (!creatorAgentId) {
|
||
skipped += 1;
|
||
continue;
|
||
}
|
||
const creatorAgent = await getAgent(creatorAgentId);
|
||
if (!creatorAgent || creatorAgent.companyId !== candidate.companyId || !isAgentInvokable(creatorAgent)) {
|
||
skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const relations = await issuesSvc.getRelationSummaries(candidate.id);
|
||
const blockingLinks = formatIssueLinksForComment(relations.blocks);
|
||
const updated = await issuesSvc.update(candidate.id, {
|
||
assigneeAgentId: creatorAgent.id,
|
||
assigneeUserId: null,
|
||
});
|
||
if (!updated) {
|
||
skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
await issuesSvc.addComment(
|
||
candidate.id,
|
||
[
|
||
"## Assigned Orphan Blocker",
|
||
"",
|
||
`Paperclip found this issue is blocking ${blockingLinks} but had no assignee, so no heartbeat could pick it up.`,
|
||
"",
|
||
"- Assigned it back to the agent that created the blocker.",
|
||
"- Next action: resolve this blocker or reassign it to the right owner.",
|
||
].join("\n"),
|
||
{},
|
||
);
|
||
|
||
await logActivity(db, {
|
||
companyId: candidate.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: null,
|
||
runId: null,
|
||
action: "issue.updated",
|
||
entityType: "issue",
|
||
entityId: candidate.id,
|
||
details: {
|
||
identifier: candidate.identifier,
|
||
assigneeAgentId: creatorAgent.id,
|
||
source: "recovery.reconcile_unassigned_blocking_issue",
|
||
},
|
||
});
|
||
|
||
const queued = await deps.enqueueWakeup(creatorAgent.id, {
|
||
source: "automation",
|
||
triggerDetail: "system",
|
||
reason: "issue_assigned",
|
||
payload: withRecoveryModelProfileHint({
|
||
issueId: candidate.id,
|
||
mutation: "unassigned_blocker_recovery",
|
||
}, "normal_model"),
|
||
requestedByActorType: "system",
|
||
requestedByActorId: null,
|
||
contextSnapshot: withRecoveryModelProfileHint({
|
||
issueId: candidate.id,
|
||
taskId: candidate.id,
|
||
wakeReason: "issue_assigned",
|
||
source: "issue.unassigned_blocker_recovery",
|
||
}, "normal_model"),
|
||
});
|
||
|
||
if (queued) {
|
||
assigned += 1;
|
||
issueIds.push(candidate.id);
|
||
} else {
|
||
skipped += 1;
|
||
}
|
||
}
|
||
|
||
return { assigned, skipped, issueIds };
|
||
}
|
||
|
||
async function getCompanyIssuePrefix(companyId: string) {
|
||
return db
|
||
.select({ issuePrefix: companies.issuePrefix })
|
||
.from(companies)
|
||
.where(eq(companies.id, companyId))
|
||
.then((rows) => rows[0]?.issuePrefix ?? "PAP");
|
||
}
|
||
|
||
function staleActiveRunOriginFingerprint(companyId: string, runId: string) {
|
||
return `stale_active_run:${companyId}:${runId}`;
|
||
}
|
||
|
||
function isTerminalIssueStatus(status: string | null | undefined) {
|
||
return status === "done" || status === "cancelled";
|
||
}
|
||
|
||
function isRecoveryOriginIssue(issue: typeof issues.$inferSelect) {
|
||
return Object.values(RECOVERY_ORIGIN_KINDS).includes(
|
||
issue.originKind as typeof RECOVERY_ORIGIN_KINDS[keyof typeof RECOVERY_ORIGIN_KINDS],
|
||
);
|
||
}
|
||
|
||
function silenceStartedAtForRun(run: Pick<typeof heartbeatRuns.$inferSelect, "lastOutputAt" | "processStartedAt" | "startedAt" | "createdAt">) {
|
||
return run.lastOutputAt ?? run.processStartedAt ?? run.startedAt ?? run.createdAt ?? null;
|
||
}
|
||
|
||
function silenceAgeMsForRun(run: Pick<typeof heartbeatRuns.$inferSelect, "lastOutputAt" | "processStartedAt" | "startedAt" | "createdAt">, now = new Date()) {
|
||
const startedAt = silenceStartedAtForRun(run);
|
||
return startedAt ? Math.max(0, now.getTime() - startedAt.getTime()) : null;
|
||
}
|
||
|
||
async function latestActiveOutputQuietUntilDecision(companyId: string, runId: string, now = new Date()) {
|
||
const [row] = await db
|
||
.select()
|
||
.from(heartbeatRunWatchdogDecisions)
|
||
.where(
|
||
and(
|
||
eq(heartbeatRunWatchdogDecisions.companyId, companyId),
|
||
eq(heartbeatRunWatchdogDecisions.runId, runId),
|
||
inArray(heartbeatRunWatchdogDecisions.decision, ["snooze", "continue"]),
|
||
gt(heartbeatRunWatchdogDecisions.snoozedUntil, now),
|
||
),
|
||
)
|
||
.orderBy(desc(heartbeatRunWatchdogDecisions.createdAt))
|
||
.limit(1);
|
||
return row ?? null;
|
||
}
|
||
|
||
async function findOpenStaleRunEvaluation(companyId: string, runId: string) {
|
||
const [row] = await db
|
||
.select({
|
||
id: issues.id,
|
||
identifier: issues.identifier,
|
||
status: issues.status,
|
||
priority: issues.priority,
|
||
assigneeAgentId: issues.assigneeAgentId,
|
||
updatedAt: issues.updatedAt,
|
||
})
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
eq(issues.companyId, companyId),
|
||
eq(issues.originKind, STALE_ACTIVE_RUN_EVALUATION_ORIGIN_KIND),
|
||
eq(issues.originId, runId),
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
)
|
||
.limit(1);
|
||
return row ?? null;
|
||
}
|
||
|
||
async function buildRunOutputSilence(
|
||
run: Pick<
|
||
typeof heartbeatRuns.$inferSelect,
|
||
"id" | "companyId" | "status" | "lastOutputAt" | "lastOutputSeq" | "lastOutputStream" | "processStartedAt" | "startedAt" | "createdAt"
|
||
>,
|
||
now = new Date(),
|
||
): Promise<RunOutputSilenceSummary> {
|
||
const [quietUntilDecision, evaluation] = await Promise.all([
|
||
latestActiveOutputQuietUntilDecision(run.companyId, run.id, now),
|
||
findOpenStaleRunEvaluation(run.companyId, run.id),
|
||
]);
|
||
const silenceStartedAt = silenceStartedAtForRun(run);
|
||
const silenceAgeMs = run.status === "running" ? silenceAgeMsForRun(run, now) : null;
|
||
const level = run.status !== "running"
|
||
? "not_applicable"
|
||
: quietUntilDecision
|
||
? "snoozed"
|
||
: (silenceAgeMs ?? 0) >= ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS
|
||
? "critical"
|
||
: (silenceAgeMs ?? 0) >= ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS
|
||
? "suspicious"
|
||
: "ok";
|
||
return {
|
||
lastOutputAt: run.lastOutputAt ?? null,
|
||
lastOutputSeq: run.lastOutputSeq ?? 0,
|
||
lastOutputStream: (run.lastOutputStream === "stdout" || run.lastOutputStream === "stderr")
|
||
? run.lastOutputStream
|
||
: null,
|
||
silenceStartedAt,
|
||
silenceAgeMs,
|
||
level,
|
||
suspicionThresholdMs: ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS,
|
||
criticalThresholdMs: ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS,
|
||
snoozedUntil: quietUntilDecision?.snoozedUntil ?? null,
|
||
evaluationIssueId: evaluation?.id ?? null,
|
||
evaluationIssueIdentifier: evaluation?.identifier ?? null,
|
||
evaluationIssueAssigneeAgentId: evaluation?.assigneeAgentId ?? null,
|
||
};
|
||
}
|
||
|
||
function redactWatchdogEvidenceText(value: string, currentUserRedactionOptions: Awaited<ReturnType<typeof getCurrentUserRedactionOptions>>) {
|
||
return redactSensitiveText(redactCurrentUserText(value, currentUserRedactionOptions));
|
||
}
|
||
|
||
function truncateEvidenceText(value: string, maxChars = 4000) {
|
||
if (value.length <= maxChars) return value;
|
||
return `${value.slice(value.length - maxChars)}\n[truncated earlier evidence]`;
|
||
}
|
||
|
||
async function readRunLogTailForEvidence(run: typeof heartbeatRuns.$inferSelect) {
|
||
if (!run.logStore || !run.logRef || !run.logBytes) return "";
|
||
try {
|
||
const offset = Math.max(0, run.logBytes - ACTIVE_RUN_OUTPUT_EVIDENCE_TAIL_BYTES);
|
||
const result = await runLogStore.read(
|
||
{ store: run.logStore as "local_file", logRef: run.logRef },
|
||
{ offset, limitBytes: ACTIVE_RUN_OUTPUT_EVIDENCE_TAIL_BYTES },
|
||
);
|
||
return result.content;
|
||
} catch (err) {
|
||
logger.warn({ err, runId: run.id }, "failed to read stale-run watchdog evidence tail");
|
||
return "";
|
||
}
|
||
}
|
||
|
||
async function resolveStaleRunSourceIssue(run: typeof heartbeatRuns.$inferSelect) {
|
||
const issueId = issueIdFromRunContext(run.contextSnapshot);
|
||
if (!issueId) return null;
|
||
const [issue] = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(and(eq(issues.companyId, run.companyId), eq(issues.id, issueId), isNull(issues.hiddenAt)))
|
||
.limit(1);
|
||
return issue ?? null;
|
||
}
|
||
|
||
async function latestSameRunSourceTerminalEvidence(input: {
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
sourceIssue: typeof issues.$inferSelect;
|
||
evidenceAfter: Date | null;
|
||
}) {
|
||
if (!isTerminalIssueStatus(input.sourceIssue.status)) return null;
|
||
const after = input.evidenceAfter ?? input.run.startedAt ?? input.run.createdAt ?? null;
|
||
const activityPredicates = [
|
||
eq(activityLog.companyId, input.run.companyId),
|
||
eq(activityLog.runId, input.run.id),
|
||
eq(activityLog.action, "issue.updated"),
|
||
eq(activityLog.entityType, "issue"),
|
||
eq(activityLog.entityId, input.sourceIssue.id),
|
||
sql`${activityLog.details} ->> 'status' = ${input.sourceIssue.status}`,
|
||
];
|
||
if (after) {
|
||
activityPredicates.push(gte(activityLog.createdAt, after));
|
||
}
|
||
|
||
const activity = await db
|
||
.select({
|
||
id: activityLog.id,
|
||
createdAt: activityLog.createdAt,
|
||
action: activityLog.action,
|
||
})
|
||
.from(activityLog)
|
||
.where(and(...activityPredicates))
|
||
.orderBy(desc(activityLog.createdAt))
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null);
|
||
|
||
if (activity) {
|
||
return {
|
||
kind: "activity" as const,
|
||
id: activity.id,
|
||
createdAt: activity.createdAt,
|
||
action: activity.action,
|
||
};
|
||
}
|
||
return null;
|
||
}
|
||
|
||
async function nextRunEventSeq(runId: string) {
|
||
const [row] = await db
|
||
.select({ maxSeq: sql<number | null>`max(${heartbeatRunEvents.seq})` })
|
||
.from(heartbeatRunEvents)
|
||
.where(eq(heartbeatRunEvents.runId, runId));
|
||
return Number(row?.maxSeq ?? 0) + 1;
|
||
}
|
||
|
||
async function appendRecoveryRunEvent(
|
||
run: typeof heartbeatRuns.$inferSelect,
|
||
event: {
|
||
level: "info" | "warn" | "error";
|
||
message: string;
|
||
payload?: Record<string, unknown>;
|
||
},
|
||
) {
|
||
await db.insert(heartbeatRunEvents).values({
|
||
companyId: run.companyId,
|
||
runId: run.id,
|
||
agentId: run.agentId,
|
||
seq: await nextRunEventSeq(run.id),
|
||
eventType: "lifecycle",
|
||
stream: "system",
|
||
level: event.level,
|
||
message: event.message,
|
||
payload: event.payload ?? null,
|
||
});
|
||
}
|
||
|
||
async function cleanupSourceResolvedRunProcess(input: {
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
runningAgent: typeof agents.$inferSelect;
|
||
}) {
|
||
if (!SESSIONED_LOCAL_ADAPTERS.has(input.runningAgent.adapterType)) {
|
||
return {
|
||
attempted: false,
|
||
outcome: "skipped_non_local_adapter",
|
||
adapterType: input.runningAgent.adapterType,
|
||
};
|
||
}
|
||
|
||
const running = runningProcesses.get(input.run.id);
|
||
const pid = running?.child.pid ?? input.run.processPid ?? null;
|
||
const processGroupId = running?.processGroupId ?? input.run.processGroupId ?? null;
|
||
if (typeof pid !== "number" && typeof processGroupId !== "number") {
|
||
return {
|
||
attempted: false,
|
||
outcome: "no_process_metadata",
|
||
adapterType: input.runningAgent.adapterType,
|
||
};
|
||
}
|
||
|
||
const wasAlive =
|
||
(typeof pid === "number" && isPidAlive(pid)) ||
|
||
(typeof processGroupId === "number" && isProcessGroupAlive(processGroupId));
|
||
if (!wasAlive) {
|
||
runningProcesses.delete(input.run.id);
|
||
return {
|
||
attempted: false,
|
||
outcome: "not_running",
|
||
adapterType: input.runningAgent.adapterType,
|
||
pid,
|
||
processGroupId,
|
||
};
|
||
}
|
||
|
||
try {
|
||
await terminateLocalService(
|
||
{
|
||
pid: typeof pid === "number" && Number.isInteger(pid) && pid > 0
|
||
? pid
|
||
: (processGroupId ?? 0),
|
||
processGroupId: typeof processGroupId === "number" && Number.isInteger(processGroupId) && processGroupId > 0
|
||
? processGroupId
|
||
: null,
|
||
},
|
||
running ? { forceAfterMs: Math.max(1, running.graceSec) * 1000 } : undefined,
|
||
);
|
||
runningProcesses.delete(input.run.id);
|
||
const stillAlive =
|
||
(typeof pid === "number" && isPidAlive(pid)) ||
|
||
(typeof processGroupId === "number" && isProcessGroupAlive(processGroupId));
|
||
return {
|
||
attempted: true,
|
||
outcome: stillAlive ? "termination_sent_still_running" : "terminated",
|
||
adapterType: input.runningAgent.adapterType,
|
||
pid,
|
||
processGroupId,
|
||
};
|
||
} catch (error) {
|
||
return {
|
||
attempted: true,
|
||
outcome: "failed",
|
||
adapterType: input.runningAgent.adapterType,
|
||
pid,
|
||
processGroupId,
|
||
error: error instanceof Error ? error.message : String(error),
|
||
};
|
||
}
|
||
}
|
||
|
||
async function finalizeAgentAfterSourceResolvedRun(run: typeof heartbeatRuns.$inferSelect, status: "succeeded" | "cancelled") {
|
||
const [runningCountRow] = await db
|
||
.select({ count: sql<number>`count(*)::int` })
|
||
.from(heartbeatRuns)
|
||
.where(and(eq(heartbeatRuns.agentId, run.agentId), eq(heartbeatRuns.status, "running")));
|
||
const runningCount = Number(runningCountRow?.count ?? 0);
|
||
const nextStatus = runningCount > 0 ? "running" : status === "succeeded" || status === "cancelled" ? "idle" : "error";
|
||
await db
|
||
.update(agents)
|
||
.set({
|
||
status: nextStatus,
|
||
lastHeartbeatAt: new Date(),
|
||
updatedAt: new Date(),
|
||
})
|
||
.where(and(eq(agents.id, run.agentId), notInArray(agents.status, ["paused", "terminated"])));
|
||
}
|
||
|
||
async function foldSourceResolvedStaleRun(input: {
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
runningAgent: typeof agents.$inferSelect;
|
||
sourceIssue: typeof issues.$inferSelect;
|
||
evidence: Awaited<ReturnType<typeof latestSameRunSourceTerminalEvidence>>;
|
||
existingEvaluation: Awaited<ReturnType<typeof findOpenStaleRunEvaluation>>;
|
||
silenceStartedAt: Date | null;
|
||
silenceAgeMs: number | null;
|
||
now: Date;
|
||
}) {
|
||
if (!input.evidence) return { kind: "skipped" as const };
|
||
const cleanup = await cleanupSourceResolvedRunProcess({ run: input.run, runningAgent: input.runningAgent });
|
||
const finalRunStatus = input.sourceIssue.status === "cancelled" ? "cancelled" : "succeeded";
|
||
const resultJson = {
|
||
...parseObject(input.run.resultJson),
|
||
sourceResolvedWatchdogFold: {
|
||
sourceIssueId: input.sourceIssue.id,
|
||
sourceIssueIdentifier: input.sourceIssue.identifier,
|
||
sourceIssueStatus: input.sourceIssue.status,
|
||
sameRunEvidenceKind: input.evidence.kind,
|
||
sameRunEvidenceId: input.evidence.id,
|
||
sameRunEvidenceAt: input.evidence.createdAt.toISOString(),
|
||
silenceStartedAt: input.silenceStartedAt?.toISOString() ?? null,
|
||
silenceAgeMs: input.silenceAgeMs,
|
||
evaluationIssueId: input.existingEvaluation?.id ?? null,
|
||
evaluationIssueIdentifier: input.existingEvaluation?.identifier ?? null,
|
||
cleanup,
|
||
},
|
||
};
|
||
const finalizedRun = await db.transaction(async (tx) => {
|
||
const [updatedRun] = await tx
|
||
.update(heartbeatRuns)
|
||
.set({
|
||
status: finalRunStatus,
|
||
finishedAt: input.now,
|
||
error: null,
|
||
errorCode: null,
|
||
resultJson,
|
||
updatedAt: input.now,
|
||
})
|
||
.where(and(eq(heartbeatRuns.id, input.run.id), eq(heartbeatRuns.companyId, input.run.companyId), eq(heartbeatRuns.status, "running")))
|
||
.returning();
|
||
if (!updatedRun) return null;
|
||
|
||
if (input.run.wakeupRequestId) {
|
||
await tx
|
||
.update(agentWakeupRequests)
|
||
.set({
|
||
status: finalRunStatus === "succeeded" ? "completed" : "cancelled",
|
||
finishedAt: input.now,
|
||
error: null,
|
||
updatedAt: input.now,
|
||
})
|
||
.where(and(eq(agentWakeupRequests.id, input.run.wakeupRequestId), eq(agentWakeupRequests.companyId, input.run.companyId)));
|
||
}
|
||
|
||
await tx
|
||
.update(issues)
|
||
.set({
|
||
executionRunId: null,
|
||
executionAgentNameKey: null,
|
||
executionLockedAt: null,
|
||
updatedAt: input.now,
|
||
})
|
||
.where(
|
||
and(
|
||
eq(issues.id, input.sourceIssue.id),
|
||
eq(issues.companyId, input.run.companyId),
|
||
eq(issues.executionRunId, input.run.id),
|
||
),
|
||
);
|
||
|
||
return updatedRun;
|
||
});
|
||
if (!finalizedRun) return { kind: "skipped" as const };
|
||
|
||
if (input.existingEvaluation && !isTerminalIssueStatus(input.existingEvaluation.status)) {
|
||
await issuesSvc.update(input.existingEvaluation.id, { status: "done" });
|
||
await issuesSvc.addComment(input.existingEvaluation.id, [
|
||
"Source-resolved watchdog fold.",
|
||
"",
|
||
`- Source issue: ${input.sourceIssue.identifier ?? input.sourceIssue.id}`,
|
||
`- Run: \`${input.run.id}\``,
|
||
`- Same-run evidence: \`${input.evidence.kind}:${input.evidence.id}\` at ${input.evidence.createdAt.toISOString()}`,
|
||
"- Outcome: false positive; the source issue already reached a terminal disposition from this run.",
|
||
].join("\n"), { runId: input.run.id });
|
||
}
|
||
|
||
const activeRecoveryAction = await recoveryActionsSvc.getActiveForIssue(input.run.companyId, input.sourceIssue.id);
|
||
if (activeRecoveryAction?.kind === "active_run_watchdog") {
|
||
await recoveryActionsSvc.resolveActiveForIssue({
|
||
companyId: input.run.companyId,
|
||
sourceIssueId: input.sourceIssue.id,
|
||
actionId: activeRecoveryAction.id,
|
||
status: "resolved",
|
||
outcome: "false_positive",
|
||
resolutionNote: "Source issue reached a terminal disposition through durable same-run activity; watchdog folded as source-resolved.",
|
||
});
|
||
}
|
||
|
||
const [decision] = await db
|
||
.insert(heartbeatRunWatchdogDecisions)
|
||
.values({
|
||
companyId: input.run.companyId,
|
||
runId: input.run.id,
|
||
evaluationIssueId: input.existingEvaluation?.id ?? null,
|
||
decision: "dismissed_false_positive",
|
||
reason: "Source issue already reached a terminal disposition through durable same-run activity.",
|
||
createdByRunId: input.run.id,
|
||
})
|
||
.returning();
|
||
|
||
await appendRecoveryRunEvent(finalizedRun, {
|
||
level: cleanup.outcome === "failed" ? "warn" : "info",
|
||
message: "Source-resolved watchdog fold finalized stale active run",
|
||
payload: resultJson.sourceResolvedWatchdogFold,
|
||
});
|
||
await logActivity(db, {
|
||
companyId: input.run.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: input.run.agentId,
|
||
runId: input.run.id,
|
||
action: "heartbeat.output_stale_source_resolved",
|
||
entityType: "heartbeat_run",
|
||
entityId: input.run.id,
|
||
details: {
|
||
source: "recovery.scan_silent_active_runs",
|
||
sourceIssueId: input.sourceIssue.id,
|
||
sourceIssueIdentifier: input.sourceIssue.identifier,
|
||
sourceIssueStatus: input.sourceIssue.status,
|
||
evaluationIssueId: input.existingEvaluation?.id ?? null,
|
||
watchdogDecisionId: decision.id,
|
||
sameRunEvidenceKind: input.evidence.kind,
|
||
sameRunEvidenceId: input.evidence.id,
|
||
sameRunEvidenceAt: input.evidence.createdAt.toISOString(),
|
||
cleanup,
|
||
},
|
||
});
|
||
await finalizeAgentAfterSourceResolvedRun(finalizedRun, finalRunStatus);
|
||
return { kind: "folded" as const, evaluationIssueId: input.existingEvaluation?.id ?? null };
|
||
}
|
||
|
||
async function resolveStaleRunOwnerAgentId(input: {
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
runningAgent: typeof agents.$inferSelect;
|
||
sourceIssue: typeof issues.$inferSelect | null;
|
||
}) {
|
||
const candidateIds: string[] = [];
|
||
if (input.sourceIssue?.assigneeAgentId) {
|
||
const sourceAssignee = await getAgent(input.sourceIssue.assigneeAgentId);
|
||
if (sourceAssignee?.reportsTo) candidateIds.push(sourceAssignee.reportsTo);
|
||
}
|
||
if (input.runningAgent.reportsTo) candidateIds.push(input.runningAgent.reportsTo);
|
||
const roleCandidates = await db
|
||
.select()
|
||
.from(agents)
|
||
.where(and(eq(agents.companyId, input.run.companyId), inArray(agents.role, ["cto", "ceo"])))
|
||
.orderBy(sql`case when ${agents.role} = 'cto' then 0 else 1 end`, asc(agents.createdAt));
|
||
candidateIds.push(...roleCandidates.map((agent) => agent.id));
|
||
|
||
const seen = new Set<string>();
|
||
for (const agentId of candidateIds) {
|
||
if (seen.has(agentId)) continue;
|
||
seen.add(agentId);
|
||
const candidate = await getAgent(agentId);
|
||
if (!candidate || candidate.companyId !== input.run.companyId) continue;
|
||
const budgetBlock = await budgets.getInvocationBlock(input.run.companyId, candidate.id, {
|
||
issueId: input.sourceIssue?.id ?? null,
|
||
projectId: input.sourceIssue?.projectId ?? null,
|
||
});
|
||
if (isAgentInvokable(candidate) && !budgetBlock) return candidate.id;
|
||
}
|
||
|
||
return null;
|
||
}
|
||
|
||
async function collectStaleRunEvidence(input: {
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
runningAgent: typeof agents.$inferSelect;
|
||
sourceIssue: typeof issues.$inferSelect | null;
|
||
prefix: string;
|
||
now: Date;
|
||
}) {
|
||
const [tail, recentEvents, childIssues, blockers] = await Promise.all([
|
||
readRunLogTailForEvidence(input.run),
|
||
db
|
||
.select({
|
||
eventType: heartbeatRunEvents.eventType,
|
||
level: heartbeatRunEvents.level,
|
||
message: heartbeatRunEvents.message,
|
||
createdAt: heartbeatRunEvents.createdAt,
|
||
})
|
||
.from(heartbeatRunEvents)
|
||
.where(and(eq(heartbeatRunEvents.companyId, input.run.companyId), eq(heartbeatRunEvents.runId, input.run.id)))
|
||
.orderBy(desc(heartbeatRunEvents.id))
|
||
.limit(8),
|
||
input.sourceIssue
|
||
? db
|
||
.select({ id: issues.id, identifier: issues.identifier, title: issues.title, status: issues.status })
|
||
.from(issues)
|
||
.where(and(eq(issues.companyId, input.run.companyId), eq(issues.parentId, input.sourceIssue.id), isNull(issues.hiddenAt)))
|
||
.orderBy(desc(issues.updatedAt))
|
||
.limit(8)
|
||
: Promise.resolve([]),
|
||
input.sourceIssue
|
||
? db
|
||
.select({ id: issues.id, identifier: issues.identifier, title: issues.title, status: issues.status })
|
||
.from(issueRelations)
|
||
.innerJoin(issues, eq(issueRelations.issueId, issues.id))
|
||
.where(
|
||
and(
|
||
eq(issueRelations.companyId, input.run.companyId),
|
||
eq(issueRelations.relatedIssueId, input.sourceIssue.id),
|
||
eq(issueRelations.type, "blocks"),
|
||
),
|
||
)
|
||
.limit(8)
|
||
: Promise.resolve([]),
|
||
]);
|
||
const currentUserRedactionOptions = await getCurrentUserRedactionOptions();
|
||
const safeTail = truncateEvidenceText(redactWatchdogEvidenceText(tail, currentUserRedactionOptions));
|
||
const silenceAgeMs = silenceAgeMsForRun(input.run, input.now);
|
||
return {
|
||
safeTail,
|
||
silenceAgeMs,
|
||
recentEvents: recentEvents.reverse().map((event) => ({
|
||
eventType: event.eventType,
|
||
level: event.level,
|
||
createdAt: event.createdAt.toISOString(),
|
||
message: event.message ? truncateEvidenceText(redactWatchdogEvidenceText(event.message, currentUserRedactionOptions), 300) : null,
|
||
})),
|
||
childIssues,
|
||
blockers,
|
||
};
|
||
}
|
||
|
||
function buildStaleRunEvaluationDescription(input: {
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
runningAgent: typeof agents.$inferSelect;
|
||
sourceIssue: typeof issues.$inferSelect | null;
|
||
prefix: string;
|
||
evidence: Awaited<ReturnType<typeof collectStaleRunEvidence>>;
|
||
level: "suspicious" | "critical";
|
||
now: Date;
|
||
}) {
|
||
const sourceIssue = input.sourceIssue
|
||
? issueUiLink({ identifier: input.sourceIssue.identifier, id: input.sourceIssue.id }, input.prefix)
|
||
: "none";
|
||
const recentEvents = input.evidence.recentEvents.length > 0
|
||
? input.evidence.recentEvents.map((event) =>
|
||
`- ${event.createdAt} \`${event.eventType}\`${event.level ? ` ${event.level}` : ""}: ${event.message ?? "(no message)"}`,
|
||
).join("\n")
|
||
: "- none";
|
||
const childIssues = input.evidence.childIssues.length > 0
|
||
? input.evidence.childIssues.map((issue) =>
|
||
`- ${issueUiLink({ identifier: issue.identifier, id: issue.id }, input.prefix)} \`${issue.status}\`: ${issue.title}`,
|
||
).join("\n")
|
||
: "- none detected";
|
||
const blockers = input.evidence.blockers.length > 0
|
||
? input.evidence.blockers.map((issue) =>
|
||
`- ${issueUiLink({ identifier: issue.identifier, id: issue.id }, input.prefix)} \`${issue.status}\`: ${issue.title}`,
|
||
).join("\n")
|
||
: "- none detected";
|
||
return [
|
||
`Paperclip detected ${input.level} output silence on an active heartbeat run.`,
|
||
"",
|
||
"## Run",
|
||
"",
|
||
`- Run: ${runUiLink(input.run, input.prefix)}`,
|
||
`- Agent: ${input.runningAgent.name} (${input.runningAgent.adapterType})`,
|
||
`- Invocation: ${input.run.invocationSource}${input.run.triggerDetail ? ` / ${input.run.triggerDetail}` : ""}`,
|
||
`- Source issue: ${sourceIssue}`,
|
||
`- Started at: ${input.run.startedAt?.toISOString() ?? "unknown"}`,
|
||
`- Process started at: ${input.run.processStartedAt?.toISOString() ?? "unknown"}`,
|
||
`- Last output at: ${input.run.lastOutputAt?.toISOString() ?? "none recorded"}`,
|
||
`- Last output sequence: ${input.run.lastOutputSeq ?? 0}`,
|
||
`- Silent for: ${formatDuration(input.evidence.silenceAgeMs)}`,
|
||
`- Thresholds: suspicious after ${formatDuration(ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS)}, critical after ${formatDuration(ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS)}`,
|
||
`- Process metadata: pid \`${input.run.processPid ?? "unknown"}\`, process group \`${input.run.processGroupId ?? "unknown"}\`, in-memory handle \`${runningProcesses.has(input.run.id) ? "yes" : "no"}\``,
|
||
"",
|
||
"## Last Output Excerpt",
|
||
"",
|
||
input.evidence.safeTail ? `\`\`\`text\n${input.evidence.safeTail}\n\`\`\`` : "_No run-log tail was available._",
|
||
"",
|
||
"## Recent Run Events",
|
||
"",
|
||
recentEvents,
|
||
"",
|
||
"## Related Work",
|
||
"",
|
||
"Active child issues:",
|
||
childIssues,
|
||
"",
|
||
"Current source blockers:",
|
||
blockers,
|
||
"",
|
||
"## Decision Checklist",
|
||
"",
|
||
"- Continue or snooze if the run is intentionally quiet.",
|
||
"- Ask the run owner for context if work may be delegated outside the transcript.",
|
||
"- Preserve artifacts, branch state, and useful output before cancellation.",
|
||
"- Cancel or recover through the explicit run recovery controls when authorized.",
|
||
"- Close this issue as a false positive only after recording the reason.",
|
||
].join("\n");
|
||
}
|
||
|
||
function isUniqueStaleRunEvaluationConflict(error: unknown) {
|
||
const maybe = unwrapDatabaseConflictError(error);
|
||
if (!maybe) return false;
|
||
return maybe.code === "23505" &&
|
||
(
|
||
maybe.constraint === "issues_active_stale_run_evaluation_uq" ||
|
||
maybe.constraint_name === "issues_active_stale_run_evaluation_uq" ||
|
||
typeof maybe.message === "string" && maybe.message.includes("issues_active_stale_run_evaluation_uq")
|
||
);
|
||
}
|
||
|
||
function isUniqueStrandedIssueRecoveryConflict(error: unknown) {
|
||
const maybe = unwrapDatabaseConflictError(error);
|
||
if (!maybe) return false;
|
||
return maybe.code === "23505" &&
|
||
(
|
||
maybe.constraint === "issues_active_stranded_issue_recovery_uq" ||
|
||
maybe.constraint_name === "issues_active_stranded_issue_recovery_uq" ||
|
||
typeof maybe.message === "string" && maybe.message.includes("issues_active_stranded_issue_recovery_uq")
|
||
);
|
||
}
|
||
|
||
async function ensureSourceIssueBlockedByStaleEvaluation(input: {
|
||
sourceIssue: typeof issues.$inferSelect | null;
|
||
evaluationIssue: { id: string; identifier: string | null };
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
}) {
|
||
if (!input.sourceIssue || ["done", "cancelled"].includes(input.sourceIssue.status)) return false;
|
||
const blockerIds = await existingBlockerIssueIds(input.sourceIssue.companyId, input.sourceIssue.id);
|
||
if (blockerIds.includes(input.evaluationIssue.id)) return false;
|
||
const nextBlockerIds = [...blockerIds, input.evaluationIssue.id];
|
||
await issuesSvc.update(input.sourceIssue.id, {
|
||
...(input.sourceIssue.status === "blocked" ? {} : { status: "blocked" }),
|
||
blockedByIssueIds: nextBlockerIds,
|
||
});
|
||
await issuesSvc.addComment(input.sourceIssue.id, [
|
||
"Paperclip detected critical output silence on this issue's active run.",
|
||
"",
|
||
`- Evaluation issue: ${input.evaluationIssue.identifier ?? input.evaluationIssue.id}`,
|
||
`- Run: \`${input.run.id}\``,
|
||
"",
|
||
"This blocks the source issue on the explicit review task without cancelling the active process.",
|
||
].join("\n"), { runId: input.run.id });
|
||
await logActivity(db, {
|
||
companyId: input.sourceIssue.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: null,
|
||
runId: input.run.id,
|
||
action: "heartbeat.output_stale_escalated",
|
||
entityType: "issue",
|
||
entityId: input.sourceIssue.id,
|
||
details: {
|
||
source: "recovery.scan_silent_active_runs",
|
||
evaluationIssueId: input.evaluationIssue.id,
|
||
blockerIssueIds: nextBlockerIds,
|
||
},
|
||
});
|
||
return true;
|
||
}
|
||
|
||
async function createOrUpdateStaleRunEvaluation(input: {
|
||
run: typeof heartbeatRuns.$inferSelect;
|
||
now: Date;
|
||
}) {
|
||
const runningAgent = await getAgent(input.run.agentId);
|
||
if (!runningAgent || runningAgent.companyId !== input.run.companyId) return { kind: "skipped" as const };
|
||
const sourceIssue = await resolveStaleRunSourceIssue(input.run);
|
||
const existing = await findOpenStaleRunEvaluation(input.run.companyId, input.run.id);
|
||
if (sourceIssue && isRecoveryOriginIssue(sourceIssue)) {
|
||
await logActivity(db, {
|
||
companyId: input.run.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: input.run.agentId,
|
||
runId: input.run.id,
|
||
action: "heartbeat.output_stale_recovery_recursion_refused",
|
||
entityType: "heartbeat_run",
|
||
entityId: input.run.id,
|
||
details: {
|
||
source: "recovery.scan_silent_active_runs",
|
||
sourceIssueId: sourceIssue.id,
|
||
sourceIssueIdentifier: sourceIssue.identifier,
|
||
sourceIssueOriginKind: sourceIssue.originKind,
|
||
existingEvaluationIssueId: existing?.id ?? null,
|
||
},
|
||
});
|
||
return { kind: "skipped" as const };
|
||
}
|
||
const silenceStartedAt = silenceStartedAtForRun(input.run);
|
||
if (sourceIssue && isTerminalIssueStatus(sourceIssue.status)) {
|
||
const terminalEvidence = await latestSameRunSourceTerminalEvidence({
|
||
run: input.run,
|
||
sourceIssue,
|
||
evidenceAfter: silenceStartedAt,
|
||
});
|
||
if (terminalEvidence) {
|
||
return foldSourceResolvedStaleRun({
|
||
run: input.run,
|
||
runningAgent,
|
||
sourceIssue,
|
||
evidence: terminalEvidence,
|
||
existingEvaluation: existing,
|
||
silenceStartedAt,
|
||
silenceAgeMs: silenceAgeMsForRun(input.run, input.now),
|
||
now: input.now,
|
||
});
|
||
}
|
||
}
|
||
const prefix = await getCompanyIssuePrefix(input.run.companyId);
|
||
const evidence = await collectStaleRunEvidence({
|
||
run: input.run,
|
||
runningAgent,
|
||
sourceIssue,
|
||
prefix,
|
||
now: input.now,
|
||
});
|
||
const level = (evidence.silenceAgeMs ?? 0) >= ACTIVE_RUN_OUTPUT_CRITICAL_THRESHOLD_MS ? "critical" : "suspicious";
|
||
if (existing) {
|
||
if (level === "critical" && existing.priority !== "high") {
|
||
await issuesSvc.update(existing.id, {
|
||
priority: "high",
|
||
});
|
||
await issuesSvc.addComment(existing.id, [
|
||
"Critical output silence threshold crossed.",
|
||
"",
|
||
`- Run: \`${input.run.id}\``,
|
||
`- Silent for: ${formatDuration(evidence.silenceAgeMs)}`,
|
||
`- Last output at: ${input.run.lastOutputAt?.toISOString() ?? "none recorded"}`,
|
||
].join("\n"), { runId: input.run.id });
|
||
await ensureSourceIssueBlockedByStaleEvaluation({
|
||
sourceIssue,
|
||
evaluationIssue: existing,
|
||
run: input.run,
|
||
});
|
||
return { kind: "escalated" as const, evaluationIssueId: existing.id };
|
||
}
|
||
if (level === "critical") {
|
||
await ensureSourceIssueBlockedByStaleEvaluation({
|
||
sourceIssue,
|
||
evaluationIssue: existing,
|
||
run: input.run,
|
||
});
|
||
}
|
||
return { kind: "existing" as const, evaluationIssueId: existing.id };
|
||
}
|
||
|
||
const ownerAgentId = await resolveStaleRunOwnerAgentId({ run: input.run, runningAgent, sourceIssue });
|
||
const description = buildStaleRunEvaluationDescription({
|
||
run: input.run,
|
||
runningAgent,
|
||
sourceIssue,
|
||
prefix,
|
||
evidence,
|
||
level,
|
||
now: input.now,
|
||
});
|
||
let evaluation: Awaited<ReturnType<typeof issuesSvc.create>>;
|
||
try {
|
||
evaluation = await issuesSvc.create(input.run.companyId, {
|
||
title: `Review silent active run for ${runningAgent.name}`,
|
||
description,
|
||
status: "todo",
|
||
priority: level === "critical" ? "high" : "medium",
|
||
parentId: sourceIssue && !["done", "cancelled"].includes(sourceIssue.status) ? sourceIssue.id : null,
|
||
projectId: sourceIssue?.projectId ?? null,
|
||
goalId: sourceIssue?.goalId ?? null,
|
||
billingCode: sourceIssue?.billingCode ?? null,
|
||
assigneeAgentId: ownerAgentId,
|
||
assigneeAdapterOverrides: recoveryAssigneeAdapterOverrides("status_only"),
|
||
originKind: STALE_ACTIVE_RUN_EVALUATION_ORIGIN_KIND,
|
||
originId: input.run.id,
|
||
originRunId: input.run.id,
|
||
originFingerprint: staleActiveRunOriginFingerprint(input.run.companyId, input.run.id),
|
||
});
|
||
} catch (error) {
|
||
if (!isUniqueStaleRunEvaluationConflict(error)) throw error;
|
||
const raced = await findOpenStaleRunEvaluation(input.run.companyId, input.run.id);
|
||
if (!raced) throw error;
|
||
return { kind: "existing" as const, evaluationIssueId: raced.id };
|
||
}
|
||
|
||
await logActivity(db, {
|
||
companyId: input.run.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: ownerAgentId,
|
||
runId: input.run.id,
|
||
action: "heartbeat.output_stale_detected",
|
||
entityType: "issue",
|
||
entityId: evaluation.id,
|
||
details: {
|
||
source: "recovery.scan_silent_active_runs",
|
||
level,
|
||
sourceIssueId: sourceIssue?.id ?? null,
|
||
silenceAgeMs: evidence.silenceAgeMs,
|
||
lastOutputAt: input.run.lastOutputAt?.toISOString() ?? null,
|
||
},
|
||
});
|
||
if (level === "critical") {
|
||
await ensureSourceIssueBlockedByStaleEvaluation({
|
||
sourceIssue,
|
||
evaluationIssue: evaluation,
|
||
run: input.run,
|
||
});
|
||
}
|
||
if (ownerAgentId) {
|
||
await deps.enqueueWakeup(ownerAgentId, {
|
||
source: "assignment",
|
||
triggerDetail: "system",
|
||
reason: "issue_assigned",
|
||
payload: withRecoveryModelProfileHint({
|
||
issueId: evaluation.id,
|
||
staleRunId: input.run.id,
|
||
sourceIssueId: sourceIssue?.id ?? null,
|
||
}, "status_only"),
|
||
requestedByActorType: "system",
|
||
requestedByActorId: null,
|
||
contextSnapshot: withRecoveryModelProfileHint({
|
||
issueId: evaluation.id,
|
||
taskId: evaluation.id,
|
||
wakeReason: "issue_assigned",
|
||
source: STALE_ACTIVE_RUN_EVALUATION_ORIGIN_KIND,
|
||
staleRunId: input.run.id,
|
||
sourceIssueId: sourceIssue?.id ?? null,
|
||
}, "status_only"),
|
||
});
|
||
}
|
||
return { kind: "created" as const, evaluationIssueId: evaluation.id };
|
||
}
|
||
|
||
async function scanSilentActiveRuns(opts?: { now?: Date; companyId?: string }) {
|
||
const now = opts?.now ?? new Date();
|
||
const suspicionBefore = new Date(now.getTime() - ACTIVE_RUN_OUTPUT_SUSPICION_THRESHOLD_MS);
|
||
const candidates = await db
|
||
.select()
|
||
.from(heartbeatRuns)
|
||
.where(
|
||
and(
|
||
opts?.companyId ? eq(heartbeatRuns.companyId, opts.companyId) : undefined,
|
||
eq(heartbeatRuns.status, "running"),
|
||
sql`coalesce(${heartbeatRuns.lastOutputAt}, ${heartbeatRuns.processStartedAt}, ${heartbeatRuns.startedAt}, ${heartbeatRuns.createdAt}) <= ${suspicionBefore.toISOString()}::timestamptz`,
|
||
),
|
||
)
|
||
.orderBy(asc(heartbeatRuns.createdAt))
|
||
.limit(100);
|
||
|
||
const result = {
|
||
scanned: candidates.length,
|
||
created: 0,
|
||
existing: 0,
|
||
escalated: 0,
|
||
folded: 0,
|
||
snoozed: 0,
|
||
skipped: 0,
|
||
evaluationIssueIds: [] as string[],
|
||
};
|
||
|
||
for (const run of candidates) {
|
||
if (await latestActiveOutputQuietUntilDecision(run.companyId, run.id, now)) {
|
||
result.snoozed += 1;
|
||
continue;
|
||
}
|
||
const outcome = await createOrUpdateStaleRunEvaluation({ run, now });
|
||
if (outcome.kind === "created") result.created += 1;
|
||
else if (outcome.kind === "existing") result.existing += 1;
|
||
else if (outcome.kind === "escalated") result.escalated += 1;
|
||
else if (outcome.kind === "folded") result.folded += 1;
|
||
else result.skipped += 1;
|
||
if ("evaluationIssueId" in outcome && outcome.evaluationIssueId) {
|
||
result.evaluationIssueIds.push(outcome.evaluationIssueId);
|
||
}
|
||
}
|
||
|
||
return result;
|
||
}
|
||
|
||
async function recordWatchdogDecision(input: {
|
||
runId: string;
|
||
actor: WatchdogDecisionActor;
|
||
decision: "snooze" | "continue" | "dismissed_false_positive";
|
||
evaluationIssueId?: string | null;
|
||
reason?: string | null;
|
||
snoozedUntil?: Date | null;
|
||
createdByRunId?: string | null;
|
||
now?: Date;
|
||
}) {
|
||
const [run] = await db
|
||
.select()
|
||
.from(heartbeatRuns)
|
||
.where(eq(heartbeatRuns.id, input.runId))
|
||
.limit(1);
|
||
if (!run) throw notFound("Heartbeat run not found");
|
||
|
||
let evaluationIssue: {
|
||
id: string;
|
||
assigneeAgentId: string | null;
|
||
companyId: string;
|
||
originKind: string;
|
||
originId: string | null;
|
||
hiddenAt: Date | null;
|
||
status: string;
|
||
} | null = null;
|
||
if (input.evaluationIssueId) {
|
||
evaluationIssue = await db
|
||
.select({
|
||
id: issues.id,
|
||
assigneeAgentId: issues.assigneeAgentId,
|
||
companyId: issues.companyId,
|
||
originKind: issues.originKind,
|
||
originId: issues.originId,
|
||
hiddenAt: issues.hiddenAt,
|
||
status: issues.status,
|
||
})
|
||
.from(issues)
|
||
.where(and(eq(issues.id, input.evaluationIssueId), eq(issues.companyId, run.companyId)))
|
||
.then((rows) => rows[0] ?? null);
|
||
if (!evaluationIssue) throw notFound("Evaluation issue not found");
|
||
}
|
||
|
||
const boardActor = input.actor.type === "board";
|
||
const assignedRecoveryOwner =
|
||
input.actor.type === "agent" &&
|
||
Boolean(input.actor.agentId) &&
|
||
evaluationIssue !== null &&
|
||
evaluationIssue.originKind === STALE_ACTIVE_RUN_EVALUATION_ORIGIN_KIND &&
|
||
evaluationIssue.originId === run.id &&
|
||
evaluationIssue.hiddenAt === null &&
|
||
!["done", "cancelled"].includes(evaluationIssue.status) &&
|
||
evaluationIssue?.assigneeAgentId === input.actor.agentId;
|
||
if (!boardActor && !assignedRecoveryOwner) {
|
||
throw forbidden("Only the board or the assigned recovery owner can record watchdog decisions");
|
||
}
|
||
|
||
if (evaluationIssue && (
|
||
evaluationIssue.originKind !== STALE_ACTIVE_RUN_EVALUATION_ORIGIN_KIND ||
|
||
evaluationIssue.originId !== run.id
|
||
)) {
|
||
throw forbidden("Watchdog decision evaluation issue is not bound to the target run");
|
||
}
|
||
|
||
if (input.actor.type === "agent" && !evaluationIssue) {
|
||
throw forbidden("Agent watchdog decisions require the target evaluation issue");
|
||
}
|
||
|
||
const createdByRunId = input.actor.type === "agent"
|
||
? input.actor.runId ?? input.createdByRunId ?? null
|
||
: input.actor.type === "board"
|
||
? input.actor.runId ?? input.createdByRunId ?? null
|
||
: null;
|
||
if (createdByRunId) {
|
||
const [creatorRun] = await db
|
||
.select({ id: heartbeatRuns.id, companyId: heartbeatRuns.companyId, agentId: heartbeatRuns.agentId })
|
||
.from(heartbeatRuns)
|
||
.where(eq(heartbeatRuns.id, createdByRunId))
|
||
.limit(1);
|
||
const sameCompany = creatorRun?.companyId === run.companyId;
|
||
const sameAgent = input.actor.type !== "agent" || creatorRun?.agentId === input.actor.agentId;
|
||
if (!creatorRun || !sameCompany || !sameAgent) {
|
||
throw forbidden("createdByRunId is not valid for this watchdog decision actor");
|
||
}
|
||
}
|
||
|
||
const decisionNow = input.now ?? new Date();
|
||
const effectiveSnoozedUntil = input.decision === "snooze"
|
||
? input.snoozedUntil ?? null
|
||
: input.decision === "continue"
|
||
? input.snoozedUntil && input.snoozedUntil > decisionNow
|
||
? input.snoozedUntil
|
||
: new Date(decisionNow.getTime() + ACTIVE_RUN_OUTPUT_CONTINUE_REARM_MS)
|
||
: null;
|
||
|
||
const [row] = await db
|
||
.insert(heartbeatRunWatchdogDecisions)
|
||
.values({
|
||
companyId: run.companyId,
|
||
runId: run.id,
|
||
evaluationIssueId: input.evaluationIssueId ?? null,
|
||
decision: input.decision,
|
||
snoozedUntil: effectiveSnoozedUntil,
|
||
reason: input.reason ?? null,
|
||
createdByAgentId: input.actor.type === "agent" ? input.actor.agentId ?? null : null,
|
||
createdByUserId: input.actor.type === "board" ? input.actor.userId ?? null : null,
|
||
createdByRunId,
|
||
})
|
||
.returning();
|
||
|
||
await logActivity(db, {
|
||
companyId: run.companyId,
|
||
actorType: input.actor.type === "agent" ? "agent" : "user",
|
||
actorId: input.actor.type === "agent"
|
||
? input.actor.agentId ?? "agent"
|
||
: input.actor.type === "board"
|
||
? input.actor.userId ?? "board"
|
||
: "unknown",
|
||
agentId: input.actor.type === "agent" ? input.actor.agentId ?? null : null,
|
||
runId: run.id,
|
||
action: input.decision === "snooze" ? "heartbeat.watchdog_snoozed" : "heartbeat.watchdog_decision_recorded",
|
||
entityType: "heartbeat_run",
|
||
entityId: run.id,
|
||
details: {
|
||
source: "recovery.record_watchdog_decision",
|
||
decision: input.decision,
|
||
evaluationIssueId: input.evaluationIssueId ?? null,
|
||
snoozedUntil: effectiveSnoozedUntil?.toISOString() ?? null,
|
||
reason: input.reason ?? null,
|
||
},
|
||
});
|
||
|
||
return row;
|
||
}
|
||
|
||
async function findOpenStrandedIssueRecoveryIssue(companyId: string, sourceIssueId: string) {
|
||
return db
|
||
.select()
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
eq(issues.companyId, companyId),
|
||
eq(issues.originKind, STRANDED_ISSUE_RECOVERY_ORIGIN_KIND),
|
||
eq(issues.originId, sourceIssueId),
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
)
|
||
.orderBy(desc(issues.createdAt))
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null);
|
||
}
|
||
|
||
function isStrandedIssueRecoveryIssue(issue: typeof issues.$inferSelect) {
|
||
return issue.originKind === STRANDED_ISSUE_RECOVERY_ORIGIN_KIND;
|
||
}
|
||
|
||
async function buildNestedStrandedRecoveryLine(issue: typeof issues.$inferSelect, prefix: string) {
|
||
const sourceIssueId = readNonEmptyString(issue.originId);
|
||
const sourceIssue = sourceIssueId
|
||
? await db
|
||
.select({ id: issues.id, identifier: issues.identifier })
|
||
.from(issues)
|
||
.where(and(eq(issues.companyId, issue.companyId), eq(issues.id, sourceIssueId)))
|
||
.then((rows) => rows[0] ?? null)
|
||
: null;
|
||
const sourceLine = sourceIssue
|
||
? `- Original source issue: ${issueUiLink(sourceIssue, prefix)}`
|
||
: sourceIssueId
|
||
? `- Original source issue: \`${sourceIssueId}\``
|
||
: "- Original source issue: unknown";
|
||
|
||
return [
|
||
"",
|
||
"- Nested recovery: suppressed because this issue is already a `stranded_issue_recovery` issue.",
|
||
sourceLine,
|
||
"- Next action: the assigned recovery owner or board operator should fix the runtime/adapter problem, resolve or reassign the original source issue, then mark this recovery issue done or cancelled.",
|
||
].join("\n");
|
||
}
|
||
|
||
async function resolveStrandedIssueRecoveryOwnerAgentId(issue: typeof issues.$inferSelect) {
|
||
const candidateIds: string[] = [];
|
||
if (issue.assigneeAgentId) {
|
||
const assignee = await getAgent(issue.assigneeAgentId);
|
||
if (assignee?.reportsTo) candidateIds.push(assignee.reportsTo);
|
||
}
|
||
if (issue.createdByAgentId) {
|
||
const creator = await getAgent(issue.createdByAgentId);
|
||
if (creator?.reportsTo) candidateIds.push(creator.reportsTo);
|
||
candidateIds.push(issue.createdByAgentId);
|
||
}
|
||
|
||
const roleCandidates = await db
|
||
.select()
|
||
.from(agents)
|
||
.where(and(eq(agents.companyId, issue.companyId), inArray(agents.role, ["cto", "ceo"])))
|
||
.orderBy(sql`case when ${agents.role} = 'cto' then 0 else 1 end`, asc(agents.createdAt));
|
||
candidateIds.push(...roleCandidates.map((agent) => agent.id));
|
||
if (issue.assigneeAgentId) candidateIds.push(issue.assigneeAgentId);
|
||
|
||
const seen = new Set<string>();
|
||
for (const agentId of candidateIds) {
|
||
if (seen.has(agentId)) continue;
|
||
seen.add(agentId);
|
||
const candidate = await getAgent(agentId);
|
||
if (!candidate || candidate.companyId !== issue.companyId) continue;
|
||
const budgetBlock = await budgets.getInvocationBlock(issue.companyId, candidate.id, {
|
||
issueId: issue.id,
|
||
projectId: issue.projectId,
|
||
});
|
||
if (isAgentInvokable(candidate) && !budgetBlock) return candidate.id;
|
||
}
|
||
|
||
return null;
|
||
}
|
||
|
||
function buildStrandedIssueRecoveryDescription(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
latestRun: LatestIssueRun;
|
||
previousStatus: "todo" | "in_progress";
|
||
prefix: string;
|
||
recoveryCause?: StrandedRecoveryCause;
|
||
successfulRunHandoffEvidence?: SuccessfulRunHandoffRecoveryEvidence | null;
|
||
sourceAssignee?: Pick<typeof agents.$inferSelect, "id" | "name"> | null;
|
||
}) {
|
||
const sourceIssue = issueUiLink({ identifier: input.issue.identifier, id: input.issue.id }, input.prefix);
|
||
const runLink = input.latestRun
|
||
? `[\`${input.latestRun.id}\`](/${input.prefix}/agents/${input.latestRun.agentId}/runs/${input.latestRun.id})`
|
||
: "none";
|
||
if (input.recoveryCause === SUCCESSFUL_RUN_MISSING_STATE_REASON) {
|
||
const sourceRunId = input.successfulRunHandoffEvidence?.sourceRunId;
|
||
const sourceRunLink = sourceRunId && input.latestRun
|
||
? `[\`${sourceRunId}\`](/${input.prefix}/agents/${input.latestRun.agentId}/runs/${sourceRunId})`
|
||
: "unknown";
|
||
const missingDisposition = input.successfulRunHandoffEvidence?.missingDisposition ?? "clear_next_step";
|
||
return [
|
||
"Paperclip exhausted the bounded corrective handoff for a successful run that still has no valid issue disposition.",
|
||
"",
|
||
"This is not a runtime/adapter crash report. The source run succeeded; the remaining problem is the missing `done`, `in_review`, `blocked`, delegated follow-up, or explicit continuation path.",
|
||
"",
|
||
"## Safe Evidence",
|
||
"",
|
||
`- Source issue: ${sourceIssue}`,
|
||
`- Source run: ${sourceRunLink}`,
|
||
`- Corrective handoff run: ${runLink}`,
|
||
`- Source assignee: ${agentUiLink(input.sourceAssignee ?? null, input.prefix)}`,
|
||
`- Latest issue status: \`${input.issue.status}\``,
|
||
`- Latest handoff run status: \`${input.latestRun?.status ?? "unknown"}\``,
|
||
`- Normalized cause: \`${SUCCESSFUL_RUN_MISSING_STATE_REASON}\``,
|
||
`- Missing disposition: \`${missingDisposition}\``,
|
||
"- Suggested manager action: choose and record a valid issue disposition without copying transcript content.",
|
||
"",
|
||
"## Required Action",
|
||
"",
|
||
"- Inspect the source issue and run metadata, not raw transcript excerpts.",
|
||
"- Choose a valid issue disposition: `done`/`cancelled`, `in_review` with an owner, `blocked` with first-class blockers, delegated follow-up work, or an explicit continuation path.",
|
||
"- When the source issue has a clear owner and disposition, mark this recovery issue done.",
|
||
].join("\n");
|
||
}
|
||
|
||
const retryReason = readNonEmptyString(parseObject(input.latestRun?.contextSnapshot)?.retryReason) ?? "unknown";
|
||
const failureSummary = summarizeRunFailureForIssueComment(input.latestRun);
|
||
|
||
return [
|
||
"Paperclip exhausted automatic recovery for an assigned issue and created this explicit recovery task.",
|
||
"",
|
||
"## Source",
|
||
"",
|
||
`- Source issue: ${sourceIssue}`,
|
||
`- Previous source status: \`${input.previousStatus}\``,
|
||
`- Latest retry run: ${runLink}`,
|
||
`- Latest retry status: \`${input.latestRun?.status ?? "unknown"}\``,
|
||
`- Detected invariant: \`stranded_assigned_issue\``,
|
||
`- Retry reason: \`${retryReason}\``,
|
||
failureSummary ? `- Failure: ${failureSummary.trim()}` : "- Failure: none recorded",
|
||
"",
|
||
"## Ownership",
|
||
"",
|
||
"- Selected owner: the first invokable manager/creator/executive candidate with budget available.",
|
||
"",
|
||
"## Required Action",
|
||
"",
|
||
"- Inspect the latest run and source issue state.",
|
||
"- Fix the runtime/adapter problem, reassign the source issue, or convert the source issue into a clear manual-review state.",
|
||
"- When the source issue has a live execution path or has been intentionally resolved, mark this recovery issue done.",
|
||
].join("\n");
|
||
}
|
||
|
||
async function ensureStrandedIssueRecoveryIssue(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
latestRun: LatestIssueRun;
|
||
previousStatus: "todo" | "in_progress";
|
||
recoveryCause?: StrandedRecoveryCause;
|
||
successfulRunHandoffEvidence?: SuccessfulRunHandoffRecoveryEvidence | null;
|
||
}) {
|
||
if (isStrandedIssueRecoveryIssue(input.issue)) return null;
|
||
|
||
const existing = await findOpenStrandedIssueRecoveryIssue(input.issue.companyId, input.issue.id);
|
||
if (existing) return existing;
|
||
|
||
const ownerAgentId = await resolveStrandedIssueRecoveryOwnerAgentId(input.issue);
|
||
if (!ownerAgentId) return null;
|
||
|
||
const prefix = await getCompanyIssuePrefix(input.issue.companyId);
|
||
const sourceAssignee = input.issue.assigneeAgentId ? await getAgent(input.issue.assigneeAgentId) : null;
|
||
const recoveryCause = input.recoveryCause ?? "stranded_assigned_issue";
|
||
let recovery: Awaited<ReturnType<typeof issuesSvc.create>>;
|
||
try {
|
||
recovery = await issuesSvc.create(input.issue.companyId, {
|
||
title: recoveryCause === SUCCESSFUL_RUN_MISSING_STATE_REASON
|
||
? `Recover missing next step ${input.issue.identifier ?? input.issue.title}`
|
||
: `Recover stalled issue ${input.issue.identifier ?? input.issue.title}`,
|
||
description: buildStrandedIssueRecoveryDescription({
|
||
issue: input.issue,
|
||
latestRun: input.latestRun,
|
||
previousStatus: input.previousStatus,
|
||
prefix,
|
||
recoveryCause,
|
||
successfulRunHandoffEvidence: input.successfulRunHandoffEvidence,
|
||
sourceAssignee,
|
||
}),
|
||
status: "todo",
|
||
priority: input.issue.priority,
|
||
parentId: input.issue.id,
|
||
projectId: input.issue.projectId,
|
||
goalId: input.issue.goalId,
|
||
assigneeAgentId: ownerAgentId,
|
||
assigneeAdapterOverrides: recoveryAssigneeAdapterOverrides("status_only"),
|
||
originKind: STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||
originId: input.issue.id,
|
||
originRunId: input.latestRun?.id ?? null,
|
||
originFingerprint: [
|
||
STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||
input.issue.companyId,
|
||
input.issue.id,
|
||
recoveryCause,
|
||
input.latestRun?.id ?? "no-run",
|
||
].join(":"),
|
||
billingCode: input.issue.billingCode,
|
||
inheritExecutionWorkspaceFromIssueId: input.issue.id,
|
||
});
|
||
} catch (error) {
|
||
if (!isUniqueStrandedIssueRecoveryConflict(error)) throw error;
|
||
const raced = await findOpenStrandedIssueRecoveryIssue(input.issue.companyId, input.issue.id);
|
||
if (!raced) throw error;
|
||
return raced;
|
||
}
|
||
|
||
await deps.enqueueWakeup(ownerAgentId, {
|
||
source: "assignment",
|
||
triggerDetail: "system",
|
||
reason: "issue_assigned",
|
||
payload: withRecoveryModelProfileHint({
|
||
issueId: recovery.id,
|
||
sourceIssueId: input.issue.id,
|
||
strandedRunId: input.latestRun?.id ?? null,
|
||
recoveryCause,
|
||
}, "status_only"),
|
||
requestedByActorType: "system",
|
||
requestedByActorId: null,
|
||
contextSnapshot: withRecoveryModelProfileHint({
|
||
issueId: recovery.id,
|
||
taskId: recovery.id,
|
||
wakeReason: "issue_assigned",
|
||
source: STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||
sourceIssueId: input.issue.id,
|
||
strandedRunId: input.latestRun?.id ?? null,
|
||
recoveryCause,
|
||
}, "status_only"),
|
||
});
|
||
|
||
return recovery;
|
||
}
|
||
|
||
function strandedRecoveryActionKind(cause: StrandedRecoveryCause) {
|
||
return cause === SUCCESSFUL_RUN_MISSING_STATE_REASON
|
||
? "missing_disposition" as const
|
||
: "stranded_assigned_issue" as const;
|
||
}
|
||
|
||
function strandedRecoveryActionFingerprint(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
recoveryCause: StrandedRecoveryCause;
|
||
}) {
|
||
return [
|
||
"source_scoped_recovery",
|
||
input.issue.companyId,
|
||
input.issue.id,
|
||
input.recoveryCause,
|
||
].join(":");
|
||
}
|
||
|
||
function buildStrandedRecoveryActionEvidence(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
latestRun: LatestIssueRun;
|
||
previousStatus: "todo" | "in_progress";
|
||
recoveryCause: StrandedRecoveryCause;
|
||
successfulRunHandoffEvidence?: SuccessfulRunHandoffRecoveryEvidence | null;
|
||
}) {
|
||
const context = parseObject(input.latestRun?.contextSnapshot);
|
||
return {
|
||
sourceIssueId: input.issue.id,
|
||
sourceIdentifier: input.issue.identifier,
|
||
previousStatus: input.previousStatus,
|
||
latestIssueStatus: input.issue.status,
|
||
latestRunId: input.latestRun?.id ?? null,
|
||
latestRunStatus: input.latestRun?.status ?? null,
|
||
latestRunErrorCode: input.latestRun?.errorCode ?? null,
|
||
retryReason: readNonEmptyString(context.retryReason) ?? null,
|
||
recoveryCause: input.recoveryCause,
|
||
sourceRunId: input.successfulRunHandoffEvidence?.sourceRunId ?? null,
|
||
correctiveRunId: input.successfulRunHandoffEvidence?.correctiveRunId ?? null,
|
||
missingDisposition: input.successfulRunHandoffEvidence?.missingDisposition ?? null,
|
||
handoffAttempt: input.successfulRunHandoffEvidence?.handoffAttempt ?? null,
|
||
maxHandoffAttempts: input.successfulRunHandoffEvidence?.maxHandoffAttempts ?? null,
|
||
};
|
||
}
|
||
|
||
async function ensureSourceScopedStrandedRecoveryAction(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
latestRun: LatestIssueRun;
|
||
previousStatus: "todo" | "in_progress";
|
||
recoveryCause?: StrandedRecoveryCause;
|
||
successfulRunHandoffEvidence?: SuccessfulRunHandoffRecoveryEvidence | null;
|
||
}) {
|
||
const recoveryCause = input.recoveryCause ?? "stranded_assigned_issue";
|
||
const ownerAgentId = await resolveStrandedIssueRecoveryOwnerAgentId(input.issue);
|
||
const now = new Date();
|
||
const action = await recoveryActionsSvc.upsertSourceScoped({
|
||
companyId: input.issue.companyId,
|
||
sourceIssueId: input.issue.id,
|
||
kind: strandedRecoveryActionKind(recoveryCause),
|
||
ownerType: ownerAgentId ? "agent" : "board",
|
||
ownerAgentId,
|
||
previousOwnerAgentId: input.issue.assigneeAgentId,
|
||
returnOwnerAgentId: input.issue.assigneeAgentId,
|
||
cause: recoveryCause,
|
||
fingerprint: strandedRecoveryActionFingerprint({
|
||
issue: input.issue,
|
||
recoveryCause,
|
||
}),
|
||
evidence: buildStrandedRecoveryActionEvidence({
|
||
issue: input.issue,
|
||
latestRun: input.latestRun,
|
||
previousStatus: input.previousStatus,
|
||
recoveryCause,
|
||
successfulRunHandoffEvidence: input.successfulRunHandoffEvidence,
|
||
}),
|
||
nextAction: recoveryCause === SUCCESSFUL_RUN_MISSING_STATE_REASON
|
||
? "Choose and record a valid issue disposition without copying transcript content."
|
||
: "Restore a live execution path, fix the runtime/adapter failure, or record an intentional manual resolution.",
|
||
wakePolicy: ownerAgentId
|
||
? {
|
||
type: "wake_owner",
|
||
reason: "source_scoped_recovery_action",
|
||
ownerAgentId,
|
||
}
|
||
: {
|
||
type: "board_escalation",
|
||
reason: "no_invokable_recovery_owner",
|
||
},
|
||
monitorPolicy: null,
|
||
maxAttempts: null,
|
||
lastAttemptAt: now,
|
||
});
|
||
|
||
return action;
|
||
}
|
||
|
||
async function enqueueSourceScopedStrandedRecoveryWake(input: {
|
||
action: Awaited<ReturnType<typeof recoveryActionsSvc.upsertSourceScoped>>;
|
||
issue: typeof issues.$inferSelect;
|
||
latestRun: LatestIssueRun;
|
||
recoveryCause: StrandedRecoveryCause;
|
||
}) {
|
||
if (!input.action.ownerAgentId) return;
|
||
await deps.enqueueWakeup(input.action.ownerAgentId, {
|
||
source: "assignment",
|
||
triggerDetail: "system",
|
||
reason: "source_scoped_recovery_action",
|
||
idempotencyKey: `source_scoped_recovery_action:${input.action.id}:${input.action.attemptCount}`,
|
||
payload: withRecoveryModelProfileHint({
|
||
issueId: input.issue.id,
|
||
sourceIssueId: input.issue.id,
|
||
recoveryActionId: input.action.id,
|
||
strandedRunId: input.latestRun?.id ?? null,
|
||
recoveryCause: input.recoveryCause,
|
||
}, "status_only"),
|
||
requestedByActorType: "system",
|
||
requestedByActorId: null,
|
||
contextSnapshot: withRecoveryModelProfileHint({
|
||
issueId: input.issue.id,
|
||
taskId: input.issue.id,
|
||
wakeReason: "source_scoped_recovery_action",
|
||
skipIssueComment: true,
|
||
source: "issue_recovery_action",
|
||
recoveryActionId: input.action.id,
|
||
sourceIssueId: input.issue.id,
|
||
strandedRunId: input.latestRun?.id ?? null,
|
||
recoveryCause: input.recoveryCause,
|
||
}, "status_only"),
|
||
});
|
||
}
|
||
|
||
function buildRecoveryIssueInPlaceEscalationComment(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
previousStatus: "todo" | "in_progress";
|
||
latestRun: LatestIssueRun;
|
||
prefix: string;
|
||
}) {
|
||
const runLink = input.latestRun
|
||
? runUiLink({ id: input.latestRun.id, agentId: input.latestRun.agentId }, input.prefix)
|
||
: "none";
|
||
const retryReason = readNonEmptyString(parseObject(input.latestRun?.contextSnapshot)?.retryReason) ?? "none";
|
||
const failureSummary = summarizeRunFailureForIssueComment(input.latestRun);
|
||
|
||
return [
|
||
"Paperclip stopped automatic stranded-work recovery for this recovery issue.",
|
||
"",
|
||
`- Recovery issue: ${issueUiLink({ identifier: input.issue.identifier, id: input.issue.id }, input.prefix)}`,
|
||
`- Previous status: \`${input.previousStatus}\``,
|
||
`- Latest run: ${runLink}`,
|
||
`- Latest run status: \`${input.latestRun?.status ?? "unknown"}\``,
|
||
`- Retry reason: \`${retryReason}\``,
|
||
failureSummary ? `- Failure: ${failureSummary.trim()}` : "- Failure: none recorded",
|
||
"- Guard: recovery issues do not create nested `stranded_issue_recovery` issues.",
|
||
"",
|
||
"Next action: the current recovery owner should inspect the failed run evidence, restore a live execution path or record the manual resolution, then move this recovery issue out of `blocked`.",
|
||
].join("\n");
|
||
}
|
||
|
||
async function escalateStrandedRecoveryIssueInPlace(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
previousStatus: "todo" | "in_progress";
|
||
latestRun: LatestIssueRun;
|
||
}) {
|
||
const updated = await issuesSvc.update(input.issue.id, { status: "blocked" });
|
||
if (!updated) return null;
|
||
|
||
const prefix = await getCompanyIssuePrefix(input.issue.companyId);
|
||
await issuesSvc.addComment(
|
||
input.issue.id,
|
||
buildRecoveryIssueInPlaceEscalationComment({
|
||
issue: input.issue,
|
||
previousStatus: input.previousStatus,
|
||
latestRun: input.latestRun,
|
||
prefix,
|
||
}),
|
||
{},
|
||
);
|
||
|
||
await logActivity(db, {
|
||
companyId: input.issue.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: null,
|
||
runId: null,
|
||
action: "issue.updated",
|
||
entityType: "issue",
|
||
entityId: input.issue.id,
|
||
details: {
|
||
identifier: input.issue.identifier,
|
||
status: "blocked",
|
||
previousStatus: input.previousStatus,
|
||
source: "recovery.reconcile_stranded_recovery_issue",
|
||
latestRunId: input.latestRun?.id ?? null,
|
||
latestRunStatus: input.latestRun?.status ?? null,
|
||
latestRunErrorCode: input.latestRun?.errorCode ?? null,
|
||
originKind: input.issue.originKind,
|
||
originId: input.issue.originId,
|
||
},
|
||
});
|
||
|
||
return updated;
|
||
}
|
||
|
||
async function existingBlockerIssueIds(companyId: string, issueId: string) {
|
||
return db
|
||
.select({ blockerIssueId: issueRelations.issueId })
|
||
.from(issueRelations)
|
||
.where(
|
||
and(
|
||
eq(issueRelations.companyId, companyId),
|
||
eq(issueRelations.relatedIssueId, issueId),
|
||
eq(issueRelations.type, "blocks"),
|
||
),
|
||
)
|
||
.then((rows) => rows.map((row) => row.blockerIssueId));
|
||
}
|
||
|
||
async function existingUnresolvedBlockerIssueIds(companyId: string, issueId: string) {
|
||
return db
|
||
.select({ blockerIssueId: issueRelations.issueId })
|
||
.from(issueRelations)
|
||
.innerJoin(
|
||
issues,
|
||
and(
|
||
eq(issues.companyId, issueRelations.companyId),
|
||
eq(issues.id, issueRelations.issueId),
|
||
),
|
||
)
|
||
.where(
|
||
and(
|
||
eq(issueRelations.companyId, companyId),
|
||
eq(issueRelations.relatedIssueId, issueId),
|
||
eq(issueRelations.type, "blocks"),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
)
|
||
.then((rows) => rows.map((row) => row.blockerIssueId));
|
||
}
|
||
|
||
async function escalateStrandedAssignedIssue(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
previousStatus: "todo" | "in_progress";
|
||
latestRun: LatestIssueRun;
|
||
comment?: string;
|
||
recoveryCause?: StrandedRecoveryCause;
|
||
successfulRunHandoffEvidence?: SuccessfulRunHandoffRecoveryEvidence | null;
|
||
}) {
|
||
if (isStrandedIssueRecoveryIssue(input.issue)) {
|
||
return escalateStrandedRecoveryIssueInPlace({
|
||
issue: input.issue,
|
||
previousStatus: input.previousStatus,
|
||
latestRun: input.latestRun,
|
||
});
|
||
}
|
||
|
||
const recoveryCause = input.recoveryCause ?? "stranded_assigned_issue";
|
||
const recoveryAction = await ensureSourceScopedStrandedRecoveryAction({
|
||
issue: input.issue,
|
||
previousStatus: input.previousStatus,
|
||
latestRun: input.latestRun,
|
||
recoveryCause,
|
||
successfulRunHandoffEvidence: input.successfulRunHandoffEvidence,
|
||
});
|
||
const blockerIds = await existingUnresolvedBlockerIssueIds(input.issue.companyId, input.issue.id);
|
||
const updated = await issuesSvc.update(input.issue.id, {
|
||
status: "blocked",
|
||
blockedByIssueIds: blockerIds,
|
||
assigneeAgentId: recoveryAction.ownerAgentId ?? input.issue.assigneeAgentId,
|
||
});
|
||
if (!updated) return null;
|
||
|
||
const prefix = await getCompanyIssuePrefix(input.issue.companyId);
|
||
const recoveryOwner = recoveryAction.ownerAgentId ? await getAgent(recoveryAction.ownerAgentId) : null;
|
||
const sourceAssignee = input.issue.assigneeAgentId ? await getAgent(input.issue.assigneeAgentId) : null;
|
||
let notice: SuccessfulRunHandoffNotice | null = null;
|
||
if (input.recoveryCause === SUCCESSFUL_RUN_MISSING_STATE_REASON && input.successfulRunHandoffEvidence) {
|
||
notice = buildSuccessfulRunHandoffExhaustedNotice({
|
||
issue: input.issue,
|
||
sourceRun: input.successfulRunHandoffEvidence.sourceRunId
|
||
? { id: input.successfulRunHandoffEvidence.sourceRunId, status: "succeeded" }
|
||
: null,
|
||
correctiveRun: input.latestRun ? { id: input.latestRun.id, status: input.latestRun.status } : null,
|
||
sourceAssignee,
|
||
recoveryIssue: null,
|
||
recoveryActionId: recoveryAction.id,
|
||
recoveryOwner,
|
||
latestIssueStatus: input.issue.status,
|
||
latestHandoffRunStatus: input.latestRun?.status ?? "unknown",
|
||
missingDisposition: input.successfulRunHandoffEvidence.missingDisposition,
|
||
});
|
||
}
|
||
const recoveryLine = recoveryAction.ownerAgentId
|
||
? [
|
||
"",
|
||
`- Recovery action: \`${recoveryAction.id}\``,
|
||
`- Recovery owner: ${agentUiLink(recoveryOwner, prefix)}`,
|
||
"- Next action: the recovery owner should either restore a live execution path or record the manual resolution on the source issue.",
|
||
].join("\n")
|
||
: [
|
||
"",
|
||
`- Recovery action: \`${recoveryAction.id}\``,
|
||
"- Recovery owner: board escalation, because Paperclip could not find an invokable manager, creator, or executive owner with budget available.",
|
||
"- Next action: a board operator should assign an invokable recovery owner, fix the agent/runtime state, or record an intentional manual resolution.",
|
||
].join("\n");
|
||
|
||
if (recoveryAction.attemptCount === 1) {
|
||
const escalationCommentMarker = `Recovery action: \`${recoveryAction.id}\``;
|
||
|
||
const hasEscalationComment = await db
|
||
.select({ id: issueComments.id, body: issueComments.body, metadata: issueComments.metadata })
|
||
.from(issueComments)
|
||
.where(
|
||
and(
|
||
eq(issueComments.issueId, input.issue.id),
|
||
eq(issueComments.authorType, "system"),
|
||
),
|
||
)
|
||
.orderBy(desc(issueComments.createdAt))
|
||
.limit(50)
|
||
.then((rows) => rows.some((row) =>
|
||
(row.body ?? "").includes(escalationCommentMarker) ||
|
||
noticeMetadataReferencesRecoveryAction(row.metadata, recoveryAction.id),
|
||
));
|
||
|
||
if (!hasEscalationComment) {
|
||
if (notice) {
|
||
await issuesSvc.addComment(input.issue.id, notice.body, {}, {
|
||
authorType: "system",
|
||
presentation: notice.presentation,
|
||
metadata: notice.metadata,
|
||
});
|
||
} else {
|
||
await issuesSvc.addComment(input.issue.id, `${input.comment ?? ""}${recoveryLine}`, {}, {
|
||
authorType: "system",
|
||
});
|
||
}
|
||
}
|
||
}
|
||
|
||
await logActivity(db, {
|
||
companyId: input.issue.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: null,
|
||
runId: null,
|
||
action: input.recoveryCause === SUCCESSFUL_RUN_MISSING_STATE_REASON
|
||
? "issue.successful_run_handoff_escalated"
|
||
: "issue.updated",
|
||
entityType: "issue",
|
||
entityId: input.issue.id,
|
||
details: {
|
||
identifier: input.issue.identifier,
|
||
status: "blocked",
|
||
previousStatus: input.previousStatus,
|
||
source: input.recoveryCause === SUCCESSFUL_RUN_MISSING_STATE_REASON
|
||
? "recovery.reconcile_successful_run_handoff_missing_state"
|
||
: "recovery.reconcile_stranded_assigned_issue",
|
||
recoveryCause: input.recoveryCause ?? "stranded_assigned_issue",
|
||
latestRunId: input.latestRun?.id ?? null,
|
||
latestRunStatus: input.latestRun?.status ?? null,
|
||
latestRunErrorCode: input.latestRun?.errorCode ?? null,
|
||
recoveryActionId: recoveryAction.id,
|
||
recoveryOwnerAgentId: recoveryAction.ownerAgentId,
|
||
previousOwnerAgentId: recoveryAction.previousOwnerAgentId,
|
||
returnOwnerAgentId: recoveryAction.returnOwnerAgentId,
|
||
blockerIssueIds: blockerIds,
|
||
},
|
||
});
|
||
|
||
await enqueueSourceScopedStrandedRecoveryWake({
|
||
action: recoveryAction,
|
||
issue: input.issue,
|
||
latestRun: input.latestRun,
|
||
recoveryCause,
|
||
});
|
||
|
||
if (recoveryAction.ownerAgentId && recoveryAction.ownerAgentId === input.issue.assigneeAgentId) {
|
||
const [currentIssue] = await db
|
||
.select({
|
||
status: issues.status,
|
||
assigneeAgentId: issues.assigneeAgentId,
|
||
})
|
||
.from(issues)
|
||
.where(eq(issues.id, input.issue.id))
|
||
.limit(1);
|
||
if (
|
||
currentIssue &&
|
||
(currentIssue.status !== "blocked" ||
|
||
currentIssue.assigneeAgentId !== recoveryAction.ownerAgentId)
|
||
) {
|
||
const reblocked = await issuesSvc.update(input.issue.id, {
|
||
status: "blocked",
|
||
blockedByIssueIds: blockerIds,
|
||
assigneeAgentId: recoveryAction.ownerAgentId,
|
||
});
|
||
if (reblocked) return reblocked;
|
||
}
|
||
}
|
||
|
||
return updated;
|
||
}
|
||
|
||
async function reconcileStrandedAssignedIssues() {
|
||
const candidates = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
isNull(issues.assigneeUserId),
|
||
inArray(issues.status, ["todo", "in_progress"]),
|
||
sql`${issues.assigneeAgentId} is not null`,
|
||
),
|
||
);
|
||
|
||
const result = {
|
||
assignmentDispatched: 0,
|
||
dispatchRequeued: 0,
|
||
continuationRequeued: 0,
|
||
productiveContinuationObserved: 0,
|
||
successfulContinuationObserved: 0,
|
||
orphanBlockersAssigned: 0,
|
||
successfulRunHandoffEscalated: 0,
|
||
escalated: 0,
|
||
skipped: 0,
|
||
issueIds: [] as string[],
|
||
};
|
||
|
||
for (const issue of candidates) {
|
||
const agentId = issue.assigneeAgentId;
|
||
if (!agentId) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const agent = await getAgent(agentId);
|
||
if (!agent || agent.companyId !== issue.companyId || !isAgentInvokable(agent)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
if (await hasActiveExecutionPath(issue.companyId, issue.id)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
if (await isAutomaticRecoverySuppressedByPauseHold(db, issue.companyId, issue.id, treeControlSvc)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const latestRun = await getLatestIssueRun(issue.companyId, issue.id);
|
||
if (isStrandedIssueRecoveryIssue(issue) && isUnsuccessfulTerminalIssueRun(latestRun)) {
|
||
const updated = await escalateStrandedRecoveryIssueInPlace({
|
||
issue,
|
||
previousStatus: issue.status as "todo" | "in_progress",
|
||
latestRun,
|
||
});
|
||
if (updated) {
|
||
result.escalated += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (issue.status === "todo") {
|
||
if (!latestRun) {
|
||
if (await hasQueuedIssueWake(issue.companyId, issue.id)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
if (await isInvocationBudgetBlocked(issue, agentId)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const queued = await enqueueInitialAssignedTodoDispatch(issue, agentId);
|
||
if (queued) {
|
||
result.assignmentDispatched += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (latestRun.status === "succeeded") {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
if (didAutomaticRecoveryFail(latestRun, "assignment_recovery")) {
|
||
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
|
||
const updated = await escalateStrandedAssignedIssue({
|
||
issue,
|
||
previousStatus: "todo",
|
||
latestRun,
|
||
comment:
|
||
"Paperclip automatically retried dispatch for this assigned `todo` issue after a lost wake/run, " +
|
||
`but it still has no live execution path.${failureSummary ?? ""} ` +
|
||
"Moving it to `blocked` so it is visible for intervention.",
|
||
});
|
||
if (updated) {
|
||
result.escalated += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (await isInvocationBudgetBlocked(issue, agentId)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const queued = await enqueueStrandedIssueRecovery({
|
||
issueId: issue.id,
|
||
agentId,
|
||
reason: "issue_assignment_recovery",
|
||
retryReason: "assignment_recovery",
|
||
source: "issue.assignment_recovery",
|
||
retryOfRunId: latestRun.id,
|
||
});
|
||
if (queued) {
|
||
result.dispatchRequeued += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (!latestRun && !issue.checkoutRunId && !issue.executionRunId) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
const handoffEvidence = isExhaustedSuccessfulRunHandoff(latestRun);
|
||
if (handoffEvidence) {
|
||
if (!handoffEvidence.exhausted) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const updated = await escalateStrandedAssignedIssue({
|
||
issue,
|
||
previousStatus: "in_progress",
|
||
latestRun,
|
||
recoveryCause: SUCCESSFUL_RUN_MISSING_STATE_REASON,
|
||
successfulRunHandoffEvidence: handoffEvidence,
|
||
});
|
||
if (updated) {
|
||
result.successfulRunHandoffEscalated += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
if (isSuccessfulInProgressContinuationRun(latestRun)) {
|
||
const successfulRun = latestRun;
|
||
|
||
if (!isProductiveContinuationRun(successfulRun)) {
|
||
result.successfulContinuationObserved += 1;
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
if (isRepeatedProductiveContinuationRecovery(successfulRun)) {
|
||
const updated = await escalateStrandedAssignedIssue({
|
||
issue,
|
||
previousStatus: "in_progress",
|
||
latestRun: successfulRun,
|
||
comment:
|
||
"Paperclip automatically retried continuation for this assigned `in_progress` issue and the retry " +
|
||
"made progress, but it still has no live execution path. Moving it to `blocked` so it is visible for intervention.",
|
||
});
|
||
if (updated) {
|
||
result.escalated += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (await isInvocationBudgetBlocked(issue, agentId)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const queued = await enqueueStrandedIssueRecovery({
|
||
issueId: issue.id,
|
||
agentId,
|
||
reason: "issue_continuation_needed",
|
||
retryReason: "issue_continuation_needed",
|
||
source: "issue.productive_terminal_continuation_recovery",
|
||
retryOfRunId: successfulRun.id,
|
||
});
|
||
if (queued) {
|
||
result.continuationRequeued += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
if (isUnsuccessfulTerminalIssueRun(latestRun)) {
|
||
const classification = classifyContinuationFailure(latestRun);
|
||
|
||
if (classification.kind === "non_retryable") {
|
||
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
|
||
const updated = await escalateStrandedAssignedIssue({
|
||
issue,
|
||
previousStatus: "in_progress",
|
||
latestRun,
|
||
comment:
|
||
"Paperclip detected a non-retryable failure on this issue's continuation run " +
|
||
`(\`${classification.errorCode}\`). Skipping automatic retries and moving it to \`blocked\` ` +
|
||
`so it is visible for intervention.${failureSummary ?? ""}`,
|
||
});
|
||
if (updated) {
|
||
result.escalated += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (didAutomaticRecoveryFail(latestRun, "issue_continuation_needed")) {
|
||
const { consecutive, latestFinishedAt } = await summarizeRecentContinuationRetries(
|
||
issue.companyId,
|
||
issue.id,
|
||
classification.errorCode,
|
||
);
|
||
if (consecutive >= classification.maxAttempts) {
|
||
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
|
||
const attemptCopy = consecutive <= 1 ? "" : ` (${consecutive}× attempts)`;
|
||
const causeCopy = classification.errorCode
|
||
? ` Latest cause: \`${classification.errorCode}\`.`
|
||
: "";
|
||
const updated = await escalateStrandedAssignedIssue({
|
||
issue,
|
||
previousStatus: "in_progress",
|
||
latestRun,
|
||
comment:
|
||
"Paperclip automatically retried continuation for this assigned `in_progress` issue after its live " +
|
||
`execution disappeared, but it still has no live execution path${attemptCopy}.${causeCopy}${failureSummary ?? ""} ` +
|
||
"Moving it to `blocked` so it is visible for intervention.",
|
||
});
|
||
if (updated) {
|
||
result.escalated += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
continue;
|
||
}
|
||
|
||
if (classification.baseBackoffMs > 0 && latestFinishedAt) {
|
||
const elapsed = Date.now() - latestFinishedAt.getTime();
|
||
const requiredDelay = classification.baseBackoffMs *
|
||
Math.pow(2, Math.max(0, consecutive - 1));
|
||
if (elapsed < requiredDelay) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if (await isInvocationBudgetBlocked(issue, agentId)) {
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
|
||
const queued = await enqueueStrandedIssueRecovery({
|
||
issueId: issue.id,
|
||
agentId,
|
||
reason: "issue_continuation_needed",
|
||
retryReason: "issue_continuation_needed",
|
||
source: "issue.continuation_recovery",
|
||
retryOfRunId: latestRun?.id ?? issue.checkoutRunId ?? null,
|
||
});
|
||
if (queued) {
|
||
result.continuationRequeued += 1;
|
||
result.issueIds.push(issue.id);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
}
|
||
|
||
const orphanBlockerRecovery = await reconcileUnassignedBlockingIssues();
|
||
result.orphanBlockersAssigned = orphanBlockerRecovery.assigned;
|
||
result.skipped += orphanBlockerRecovery.skipped;
|
||
result.issueIds.push(...orphanBlockerRecovery.issueIds);
|
||
|
||
return result;
|
||
}
|
||
|
||
async function collectIssueGraphLivenessFindings() {
|
||
const issueRowsPromise = Promise.resolve(db
|
||
.select({
|
||
id: issues.id,
|
||
companyId: issues.companyId,
|
||
identifier: issues.identifier,
|
||
title: issues.title,
|
||
status: issues.status,
|
||
projectId: issues.projectId,
|
||
goalId: issues.goalId,
|
||
parentId: issues.parentId,
|
||
assigneeAgentId: issues.assigneeAgentId,
|
||
assigneeUserId: issues.assigneeUserId,
|
||
createdByAgentId: issues.createdByAgentId,
|
||
createdByUserId: issues.createdByUserId,
|
||
executionPolicy: issues.executionPolicy,
|
||
executionState: issues.executionState,
|
||
monitorNextCheckAt: issues.monitorNextCheckAt,
|
||
monitorAttemptCount: issues.monitorAttemptCount,
|
||
})
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.originKind, [RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation]),
|
||
),
|
||
));
|
||
|
||
const [
|
||
issueRows,
|
||
relationRows,
|
||
agentRows,
|
||
activeRunRows,
|
||
activeIssueRunRows,
|
||
wakeRows,
|
||
interactionRows,
|
||
approvalRows,
|
||
recoveryIssueRows,
|
||
recoveryActionRows,
|
||
] = await Promise.all([
|
||
issueRowsPromise,
|
||
db
|
||
.select({
|
||
companyId: issueRelations.companyId,
|
||
blockerIssueId: issueRelations.issueId,
|
||
blockedIssueId: issueRelations.relatedIssueId,
|
||
})
|
||
.from(issueRelations)
|
||
.where(eq(issueRelations.type, "blocks")),
|
||
db
|
||
.select({
|
||
id: agents.id,
|
||
companyId: agents.companyId,
|
||
name: agents.name,
|
||
role: agents.role,
|
||
title: agents.title,
|
||
status: agents.status,
|
||
reportsTo: agents.reportsTo,
|
||
})
|
||
.from(agents),
|
||
db
|
||
.select({
|
||
companyId: heartbeatRuns.companyId,
|
||
agentId: heartbeatRuns.agentId,
|
||
status: heartbeatRuns.status,
|
||
contextSnapshot: heartbeatRuns.contextSnapshot,
|
||
})
|
||
.from(heartbeatRuns)
|
||
.where(inArray(heartbeatRuns.status, [...EXECUTION_PATH_HEARTBEAT_RUN_STATUSES])),
|
||
db
|
||
.select({
|
||
companyId: issues.companyId,
|
||
agentId: heartbeatRuns.agentId,
|
||
status: heartbeatRuns.status,
|
||
issueId: issues.id,
|
||
})
|
||
.from(issues)
|
||
.innerJoin(heartbeatRuns, eq(issues.executionRunId, heartbeatRuns.id))
|
||
.where(
|
||
and(
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.originKind, [RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation]),
|
||
inArray(heartbeatRuns.status, [...EXECUTION_PATH_HEARTBEAT_RUN_STATUSES]),
|
||
),
|
||
),
|
||
db
|
||
.select({
|
||
companyId: agentWakeupRequests.companyId,
|
||
agentId: agentWakeupRequests.agentId,
|
||
status: agentWakeupRequests.status,
|
||
payload: agentWakeupRequests.payload,
|
||
})
|
||
.from(agentWakeupRequests)
|
||
.where(inArray(agentWakeupRequests.status, ["queued", "deferred_issue_execution"])),
|
||
db
|
||
.select({
|
||
companyId: issueThreadInteractions.companyId,
|
||
issueId: issueThreadInteractions.issueId,
|
||
status: issueThreadInteractions.status,
|
||
})
|
||
.from(issueThreadInteractions)
|
||
.where(eq(issueThreadInteractions.status, "pending")),
|
||
db
|
||
.select({
|
||
companyId: issueApprovals.companyId,
|
||
issueId: issueApprovals.issueId,
|
||
status: approvals.status,
|
||
})
|
||
.from(issueApprovals)
|
||
.innerJoin(approvals, eq(issueApprovals.approvalId, approvals.id))
|
||
.where(inArray(approvals.status, ["pending", "revision_requested"])),
|
||
db
|
||
.select({
|
||
companyId: issues.companyId,
|
||
id: issues.id,
|
||
status: issues.status,
|
||
originKind: issues.originKind,
|
||
originId: issues.originId,
|
||
})
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
isNull(issues.hiddenAt),
|
||
inArray(issues.originKind, [
|
||
STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||
RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation,
|
||
]),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
),
|
||
issueRowsPromise.then((rows) => {
|
||
const issueIdsUnderAnalysis = rows.map((row) => row.id);
|
||
return issueIdsUnderAnalysis.length === 0
|
||
? []
|
||
: db
|
||
.select({
|
||
companyId: issueRecoveryActions.companyId,
|
||
issueId: issueRecoveryActions.sourceIssueId,
|
||
status: issueRecoveryActions.status,
|
||
})
|
||
.from(issueRecoveryActions)
|
||
.where(
|
||
and(
|
||
inArray(issueRecoveryActions.status, ["active", "escalated"]),
|
||
inArray(issueRecoveryActions.sourceIssueId, issueIdsUnderAnalysis),
|
||
),
|
||
);
|
||
}),
|
||
]);
|
||
|
||
const openRecoveryIssues = recoveryIssueRows.flatMap((row) => {
|
||
if (row.originKind === RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation) {
|
||
const parsed = parseIssueGraphLivenessIncidentKey(row.originId);
|
||
if (!parsed || parsed.companyId !== row.companyId) return [];
|
||
return [
|
||
{
|
||
companyId: row.companyId,
|
||
issueId: parsed.issueId,
|
||
status: row.status,
|
||
},
|
||
{
|
||
companyId: row.companyId,
|
||
issueId: parsed.leafIssueId,
|
||
status: row.status,
|
||
},
|
||
];
|
||
}
|
||
|
||
const issueId = readNonEmptyString(row.originId);
|
||
if (!issueId) return [];
|
||
return [{
|
||
companyId: row.companyId,
|
||
issueId,
|
||
status: row.status,
|
||
}];
|
||
});
|
||
|
||
return classifyIssueGraphLiveness({
|
||
issues: issueRows,
|
||
relations: relationRows,
|
||
agents: agentRows,
|
||
activeRuns: activeRunRows.map((row) => ({
|
||
companyId: row.companyId,
|
||
agentId: row.agentId,
|
||
status: row.status,
|
||
issueId: issueIdFromRunContext(row.contextSnapshot),
|
||
})).concat(activeIssueRunRows.map((row) => ({
|
||
companyId: row.companyId,
|
||
agentId: row.agentId,
|
||
status: row.status,
|
||
issueId: row.issueId,
|
||
}))),
|
||
queuedWakeRequests: wakeRows.map((row) => ({
|
||
companyId: row.companyId,
|
||
agentId: row.agentId,
|
||
status: row.status,
|
||
issueId: issueIdFromWakePayload(row.payload),
|
||
})),
|
||
pendingInteractions: interactionRows,
|
||
pendingApprovals: approvalRows,
|
||
openRecoveryIssues: openRecoveryIssues.concat(recoveryActionRows),
|
||
now: new Date(),
|
||
});
|
||
}
|
||
|
||
async function findOpenLivenessEscalation(companyId: string, incidentKey: string) {
|
||
return db
|
||
.select()
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
eq(issues.companyId, companyId),
|
||
eq(issues.originKind, RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation),
|
||
eq(issues.originId, incidentKey),
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
)
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null);
|
||
}
|
||
|
||
async function findOpenLivenessRecoveryIssueForLeaf(finding: IssueLivenessFinding) {
|
||
const byFingerprint = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
eq(issues.companyId, finding.companyId),
|
||
eq(issues.originKind, RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation),
|
||
eq(issues.originFingerprint, livenessRecoveryLeafFingerprint(finding)),
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
)
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null);
|
||
if (byFingerprint) return byFingerprint;
|
||
|
||
const leafIssueId = livenessRecoveryLeafIssueId(finding);
|
||
const openRecoveries = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
eq(issues.companyId, finding.companyId),
|
||
eq(issues.originKind, RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation),
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
);
|
||
return openRecoveries.find((row) => {
|
||
const parsed = parseLivenessIncidentKey(row.originId);
|
||
return parsed?.state === finding.state && parsed.leafIssueId === leafIssueId;
|
||
}) ?? null;
|
||
}
|
||
|
||
async function removeRecoveryBlockerFromSource(recovery: typeof issues.$inferSelect) {
|
||
const parsed = parseLivenessIncidentKey(recovery.originId);
|
||
if (!parsed) return false;
|
||
const sourceIssue = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(and(eq(issues.companyId, recovery.companyId), eq(issues.id, parsed.issueId)))
|
||
.then((rows) => rows[0] ?? null);
|
||
if (!sourceIssue) return false;
|
||
|
||
const blockerIds = await existingBlockerIssueIds(sourceIssue.companyId, sourceIssue.id);
|
||
if (!blockerIds.includes(recovery.id)) return false;
|
||
await issuesSvc.update(sourceIssue.id, {
|
||
blockedByIssueIds: blockerIds.filter((blockerId) => blockerId !== recovery.id),
|
||
});
|
||
return true;
|
||
}
|
||
|
||
async function hasActiveRunForIssueId(companyId: string, issueId: string) {
|
||
const [contextRun, issueRun] = await Promise.all([
|
||
db
|
||
.select({ id: heartbeatRuns.id })
|
||
.from(heartbeatRuns)
|
||
.where(
|
||
and(
|
||
eq(heartbeatRuns.companyId, companyId),
|
||
inArray(heartbeatRuns.status, [...EXECUTION_PATH_HEARTBEAT_RUN_STATUSES]),
|
||
sql`(${heartbeatRuns.contextSnapshot}->>'issueId' = ${issueId}
|
||
OR ${heartbeatRuns.contextSnapshot}->>'taskId' = ${issueId})`,
|
||
),
|
||
)
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null),
|
||
db
|
||
.select({ id: heartbeatRuns.id })
|
||
.from(issues)
|
||
.innerJoin(heartbeatRuns, eq(issues.executionRunId, heartbeatRuns.id))
|
||
.where(
|
||
and(
|
||
eq(issues.companyId, companyId),
|
||
eq(issues.id, issueId),
|
||
inArray(heartbeatRuns.status, [...EXECUTION_PATH_HEARTBEAT_RUN_STATUSES]),
|
||
),
|
||
)
|
||
.limit(1)
|
||
.then((rows) => rows[0] ?? null),
|
||
]);
|
||
return Boolean(contextRun || issueRun);
|
||
}
|
||
|
||
async function retireObsoleteLivenessRecoveryIssues(findings: IssueLivenessFinding[]) {
|
||
const currentIncidentKeys = new Set(findings.map((finding) => finding.incidentKey));
|
||
const currentLeafKeys = new Set(
|
||
findings.map((finding) =>
|
||
livenessRecoveryLeafKey(
|
||
finding.companyId,
|
||
finding.state,
|
||
livenessRecoveryLeafIssueId(finding),
|
||
),
|
||
),
|
||
);
|
||
const openRecoveries = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
eq(issues.originKind, RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation),
|
||
isNull(issues.hiddenAt),
|
||
notInArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
);
|
||
const result = {
|
||
retired: 0,
|
||
activeSkipped: 0,
|
||
blockerRelationsRemoved: 0,
|
||
retiredIssueIds: [] as string[],
|
||
};
|
||
|
||
for (const recovery of openRecoveries) {
|
||
if (recovery.originId && currentIncidentKeys.has(recovery.originId)) continue;
|
||
const parsed = parseLivenessIncidentKey(recovery.originId);
|
||
if (!parsed) continue;
|
||
if (
|
||
currentLeafKeys.has(
|
||
livenessRecoveryLeafKey(parsed.companyId, parsed.state, parsed.leafIssueId),
|
||
)
|
||
) {
|
||
continue;
|
||
}
|
||
const sourceIssue = await db
|
||
.select({
|
||
id: issues.id,
|
||
status: issues.status,
|
||
})
|
||
.from(issues)
|
||
.where(and(eq(issues.companyId, parsed.companyId), eq(issues.id, parsed.issueId)))
|
||
.then((rows) => rows[0] ?? null);
|
||
if (sourceIssue && !["done", "cancelled"].includes(sourceIssue.status)) {
|
||
const blockerIds = await existingBlockerIssueIds(parsed.companyId, sourceIssue.id);
|
||
if (blockerIds.includes(recovery.id)) {
|
||
result.activeSkipped += 1;
|
||
continue;
|
||
}
|
||
}
|
||
if (await removeRecoveryBlockerFromSource(recovery)) {
|
||
result.blockerRelationsRemoved += 1;
|
||
}
|
||
if (await hasActiveRunForIssueId(recovery.companyId, recovery.id)) {
|
||
result.activeSkipped += 1;
|
||
continue;
|
||
}
|
||
await issuesSvc.update(recovery.id, { status: "cancelled" });
|
||
result.retired += 1;
|
||
result.retiredIssueIds.push(recovery.id);
|
||
}
|
||
|
||
return result;
|
||
}
|
||
|
||
async function retireDoneLivenessRecoveryBlockers() {
|
||
const closedRecoveries = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(
|
||
and(
|
||
eq(issues.originKind, RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation),
|
||
isNull(issues.hiddenAt),
|
||
inArray(issues.status, ["done", "cancelled"]),
|
||
),
|
||
);
|
||
|
||
let blockerRelationsRemoved = 0;
|
||
for (const recovery of closedRecoveries) {
|
||
if (await removeRecoveryBlockerFromSource(recovery)) {
|
||
blockerRelationsRemoved += 1;
|
||
}
|
||
}
|
||
|
||
return { blockerRelationsRemoved };
|
||
}
|
||
|
||
function normalizeIssueGraphLivenessAutoRecoveryLookbackHours(raw: unknown) {
|
||
const numeric = Math.floor(asNumber(raw, DEFAULT_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS));
|
||
return Math.min(
|
||
MAX_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS,
|
||
Math.max(MIN_ISSUE_GRAPH_LIVENESS_AUTO_RECOVERY_LOOKBACK_HOURS, numeric),
|
||
);
|
||
}
|
||
|
||
function livenessDependencyIssueKey(companyId: string, issueId: string) {
|
||
return `${companyId}:${issueId}`;
|
||
}
|
||
|
||
async function loadLivenessDependencyUpdatedAtByIssue(findings: IssueLivenessFinding[]) {
|
||
const issueIds = [
|
||
...new Set(
|
||
findings.flatMap((finding) => finding.dependencyPath.map((entry) => entry.issueId)),
|
||
),
|
||
];
|
||
if (issueIds.length === 0) return new Map<string, Date>();
|
||
const rows = await db
|
||
.select({ id: issues.id, companyId: issues.companyId, updatedAt: issues.updatedAt })
|
||
.from(issues)
|
||
.where(inArray(issues.id, issueIds));
|
||
return new Map(rows.map((row) => [
|
||
livenessDependencyIssueKey(row.companyId, row.id),
|
||
row.updatedAt,
|
||
]));
|
||
}
|
||
|
||
function latestDependencyUpdatedAtForLivenessFinding(
|
||
finding: IssueLivenessFinding,
|
||
updatedAtByIssueKey: Map<string, Date>,
|
||
) {
|
||
const dependencyIssueIds = [...new Set(finding.dependencyPath.map((entry) => entry.issueId))];
|
||
if (dependencyIssueIds.length === 0) return null;
|
||
const timestamps = dependencyIssueIds.map((issueId) =>
|
||
updatedAtByIssueKey.get(livenessDependencyIssueKey(finding.companyId, issueId)) ?? null
|
||
);
|
||
if (timestamps.some((timestamp) => !timestamp)) return null;
|
||
const [firstTimestamp, ...remainingTimestamps] = timestamps as Date[];
|
||
return remainingTimestamps.reduce((latest, updatedAt) =>
|
||
updatedAt > latest ? updatedAt : latest,
|
||
firstTimestamp!);
|
||
}
|
||
|
||
function isLivenessFindingInsideAutoRecoveryLookback(
|
||
finding: IssueLivenessFinding,
|
||
cutoff: Date,
|
||
updatedAtByIssueKey: Map<string, Date>,
|
||
) {
|
||
const latestUpdatedAt = latestDependencyUpdatedAtForLivenessFinding(finding, updatedAtByIssueKey);
|
||
return Boolean(latestUpdatedAt && latestUpdatedAt >= cutoff);
|
||
}
|
||
|
||
async function buildIssueGraphLivenessAutoRecoveryPreview(
|
||
opts?: { lookbackHours?: number; now?: Date },
|
||
): Promise<IssueGraphLivenessAutoRecoveryPreview> {
|
||
const now = opts?.now ?? new Date();
|
||
const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours(opts?.lookbackHours);
|
||
const cutoff = new Date(now.getTime() - lookbackHours * 60 * 60 * 1000);
|
||
const findings = await collectIssueGraphLivenessFindings();
|
||
const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings);
|
||
const issueIds = [...new Set(findings.map((finding) => finding.recoveryIssueId))];
|
||
const recoveryRows = issueIds.length > 0
|
||
? await db
|
||
.select({ id: issues.id, identifier: issues.identifier, title: issues.title })
|
||
.from(issues)
|
||
.where(inArray(issues.id, issueIds))
|
||
: [];
|
||
const recoveryById = new Map(recoveryRows.map((row) => [row.id, row]));
|
||
const items: IssueGraphLivenessAutoRecoveryPreviewItem[] = [];
|
||
let skippedOutsideLookback = 0;
|
||
|
||
for (const finding of findings) {
|
||
const latestDependencyUpdatedAt = latestDependencyUpdatedAtForLivenessFinding(
|
||
finding,
|
||
updatedAtByIssueKey,
|
||
);
|
||
if (!latestDependencyUpdatedAt || latestDependencyUpdatedAt < cutoff) {
|
||
skippedOutsideLookback += 1;
|
||
continue;
|
||
}
|
||
const recoveryIssue = recoveryById.get(finding.recoveryIssueId);
|
||
items.push({
|
||
issueId: finding.issueId,
|
||
identifier: finding.identifier,
|
||
title: finding.dependencyPath[0]?.title ?? finding.identifier ?? finding.issueId,
|
||
state: finding.state,
|
||
severity: finding.severity,
|
||
reason: finding.reason,
|
||
recoveryIssueId: finding.recoveryIssueId,
|
||
recoveryIdentifier: recoveryIssue?.identifier ?? null,
|
||
recoveryTitle: recoveryIssue?.title ?? null,
|
||
recommendedOwnerAgentId: finding.recommendedOwnerAgentId,
|
||
incidentKey: finding.incidentKey,
|
||
latestDependencyUpdatedAt: latestDependencyUpdatedAt.toISOString(),
|
||
dependencyPath: finding.dependencyPath,
|
||
});
|
||
}
|
||
|
||
return {
|
||
lookbackHours,
|
||
cutoff: cutoff.toISOString(),
|
||
generatedAt: now.toISOString(),
|
||
findings: findings.length,
|
||
recoverableFindings: items.length,
|
||
skippedOutsideLookback,
|
||
items,
|
||
};
|
||
}
|
||
|
||
async function resolveEscalationOwnerAgentId(
|
||
finding: IssueLivenessFinding,
|
||
issue: typeof issues.$inferSelect,
|
||
) {
|
||
const detailedCandidates = finding.recommendedOwnerCandidates.length > 0
|
||
? finding.recommendedOwnerCandidates
|
||
: finding.recommendedOwnerCandidateAgentIds.map((agentId) => ({
|
||
agentId,
|
||
reason: "ordered_invokable_fallback" as const,
|
||
sourceIssueId: finding.recoveryIssueId,
|
||
}));
|
||
const seenCandidates = new Set<string>();
|
||
const candidates = detailedCandidates.filter((candidate) => {
|
||
if (seenCandidates.has(candidate.agentId)) return false;
|
||
seenCandidates.add(candidate.agentId);
|
||
return true;
|
||
});
|
||
const budgetBlockedCandidateAgentIds: string[] = [];
|
||
|
||
for (const candidate of candidates) {
|
||
const budgetBlock = await budgets.getInvocationBlock(issue.companyId, candidate.agentId, {
|
||
issueId: issue.id,
|
||
projectId: issue.projectId,
|
||
});
|
||
if (!budgetBlock) {
|
||
return {
|
||
agentId: candidate.agentId,
|
||
reason: candidate.reason,
|
||
sourceIssueId: candidate.sourceIssueId,
|
||
candidateAgentIds: candidates.map((entry) => entry.agentId),
|
||
candidateReasons: candidates.map((entry) => ({
|
||
agentId: entry.agentId,
|
||
reason: entry.reason,
|
||
sourceIssueId: entry.sourceIssueId,
|
||
})),
|
||
budgetBlockedCandidateAgentIds,
|
||
};
|
||
}
|
||
budgetBlockedCandidateAgentIds.push(candidate.agentId);
|
||
}
|
||
|
||
return null;
|
||
}
|
||
|
||
function shouldReuseRecoveryExecutionWorkspace(input: {
|
||
finding: IssueLivenessFinding;
|
||
recoveryIssue: typeof issues.$inferSelect;
|
||
ownerAgentId: string;
|
||
}) {
|
||
if (input.finding.recoveryIssueId === input.finding.issueId) return false;
|
||
return input.recoveryIssue.assigneeAgentId === input.ownerAgentId;
|
||
}
|
||
|
||
async function ensureIssueBlockedByEscalation(input: {
|
||
issue: typeof issues.$inferSelect;
|
||
escalationIssueId: string;
|
||
finding: IssueLivenessFinding;
|
||
runId?: string | null;
|
||
}) {
|
||
const blockerIds = await existingBlockerIssueIds(input.issue.companyId, input.issue.id);
|
||
const nextBlockerIds = [...new Set([...blockerIds, input.escalationIssueId])];
|
||
const isAlreadyBlockedByEscalation = blockerIds.includes(input.escalationIssueId);
|
||
const isAlreadyBlocked = input.issue.status === "blocked";
|
||
if (isAlreadyBlockedByEscalation && isAlreadyBlocked) {
|
||
return input.issue;
|
||
}
|
||
|
||
const update: Partial<typeof issues.$inferInsert> & { blockedByIssueIds: string[] } = {
|
||
blockedByIssueIds: nextBlockerIds,
|
||
};
|
||
if (!isAlreadyBlocked) {
|
||
update.status = "blocked";
|
||
}
|
||
|
||
const updated = await issuesSvc.update(input.issue.id, update);
|
||
if (!updated) return null;
|
||
|
||
await logActivity(db, {
|
||
companyId: input.issue.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: null,
|
||
runId: input.runId ?? null,
|
||
action: "issue.blockers.updated",
|
||
entityType: "issue",
|
||
entityId: input.issue.id,
|
||
details: {
|
||
source: "recovery.reconcile_issue_graph_liveness",
|
||
incidentKey: input.finding.incidentKey,
|
||
findingState: input.finding.state,
|
||
blockerIssueIds: nextBlockerIds,
|
||
escalationIssueId: input.escalationIssueId,
|
||
status: update.status ?? input.issue.status,
|
||
previousStatus: input.issue.status,
|
||
},
|
||
});
|
||
|
||
return updated;
|
||
}
|
||
|
||
async function createIssueGraphLivenessEscalation(input: {
|
||
finding: IssueLivenessFinding;
|
||
runId?: string | null;
|
||
}) {
|
||
const issue = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(eq(issues.id, input.finding.issueId))
|
||
.then((rows) => rows[0] ?? null);
|
||
if (!issue || issue.companyId !== input.finding.companyId) return { kind: "skipped" as const };
|
||
if (await isAutomaticRecoverySuppressedByPauseHold(db, issue.companyId, issue.id, treeControlSvc)) {
|
||
return { kind: "skipped" as const };
|
||
}
|
||
|
||
const recoveryIssue = await db
|
||
.select()
|
||
.from(issues)
|
||
.where(and(eq(issues.id, input.finding.recoveryIssueId), eq(issues.companyId, issue.companyId)))
|
||
.then((rows) => rows[0] ?? null);
|
||
if (!recoveryIssue) return { kind: "skipped" as const };
|
||
|
||
const existing =
|
||
await findOpenLivenessEscalation(issue.companyId, input.finding.incidentKey) ??
|
||
await findOpenLivenessRecoveryIssueForLeaf(input.finding);
|
||
if (existing) {
|
||
await ensureIssueBlockedByEscalation({
|
||
issue,
|
||
escalationIssueId: existing.id,
|
||
finding: input.finding,
|
||
runId: input.runId ?? null,
|
||
});
|
||
return { kind: "existing" as const, escalationIssueId: existing.id };
|
||
}
|
||
|
||
const ownerSelection = await resolveEscalationOwnerAgentId(input.finding, recoveryIssue);
|
||
if (!ownerSelection) return { kind: "skipped" as const };
|
||
const reuseRecoveryExecutionWorkspace = shouldReuseRecoveryExecutionWorkspace({
|
||
finding: input.finding,
|
||
recoveryIssue,
|
||
ownerAgentId: ownerSelection.agentId,
|
||
});
|
||
|
||
let escalation: Awaited<ReturnType<typeof issuesSvc.create>>;
|
||
try {
|
||
escalation = await issuesSvc.create(issue.companyId, {
|
||
title: `Unblock liveness incident for ${issue.identifier ?? issue.id}`,
|
||
description: buildLivenessEscalationDescription(input.finding),
|
||
status: "todo",
|
||
priority: "high",
|
||
parentId: recoveryIssue.id,
|
||
projectId: recoveryIssue.projectId,
|
||
goalId: recoveryIssue.goalId,
|
||
assigneeAgentId: ownerSelection.agentId,
|
||
assigneeAdapterOverrides: recoveryAssigneeAdapterOverrides("status_only"),
|
||
originKind: RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation,
|
||
originId: input.finding.incidentKey,
|
||
originFingerprint: livenessRecoveryLeafFingerprint(input.finding),
|
||
billingCode: recoveryIssue.billingCode,
|
||
...(reuseRecoveryExecutionWorkspace
|
||
? { inheritExecutionWorkspaceFromIssueId: recoveryIssue.id }
|
||
: {
|
||
executionWorkspaceId: null,
|
||
executionWorkspacePreference: null,
|
||
executionWorkspaceSettings: null,
|
||
}),
|
||
});
|
||
} catch (error) {
|
||
if (!isUniqueLivenessRecoveryConflict(error)) throw error;
|
||
const raced =
|
||
await findOpenLivenessEscalation(issue.companyId, input.finding.incidentKey) ??
|
||
await findOpenLivenessRecoveryIssueForLeaf(input.finding);
|
||
if (!raced) throw error;
|
||
await ensureIssueBlockedByEscalation({
|
||
issue,
|
||
escalationIssueId: raced.id,
|
||
finding: input.finding,
|
||
runId: input.runId ?? null,
|
||
});
|
||
return { kind: "existing" as const, escalationIssueId: raced.id };
|
||
}
|
||
|
||
await ensureIssueBlockedByEscalation({
|
||
issue,
|
||
escalationIssueId: escalation.id,
|
||
finding: input.finding,
|
||
runId: input.runId ?? null,
|
||
});
|
||
|
||
await issuesSvc.addComment(
|
||
issue.id,
|
||
buildLivenessOriginalIssueComment(input.finding, escalation),
|
||
{ runId: input.runId ?? null },
|
||
);
|
||
|
||
await logActivity(db, {
|
||
companyId: issue.companyId,
|
||
actorType: "system",
|
||
actorId: "system",
|
||
agentId: ownerSelection.agentId,
|
||
runId: input.runId ?? null,
|
||
action: "issue.harness_liveness_escalation_created",
|
||
entityType: "issue",
|
||
entityId: escalation.id,
|
||
details: {
|
||
source: "recovery.reconcile_issue_graph_liveness",
|
||
incidentKey: input.finding.incidentKey,
|
||
findingState: input.finding.state,
|
||
sourceIssueId: issue.id,
|
||
sourceIdentifier: issue.identifier,
|
||
recoveryIssueId: recoveryIssue.id,
|
||
recoveryIdentifier: recoveryIssue.identifier,
|
||
escalationIssueId: escalation.id,
|
||
escalationIdentifier: escalation.identifier,
|
||
dependencyPath: input.finding.dependencyPath,
|
||
ownerSelection: {
|
||
selectedAgentId: ownerSelection.agentId,
|
||
selectedReason: ownerSelection.reason,
|
||
selectedSourceIssueId: ownerSelection.sourceIssueId,
|
||
candidateAgentIds: ownerSelection.candidateAgentIds,
|
||
candidateReasons: ownerSelection.candidateReasons,
|
||
budgetBlockedCandidateAgentIds: ownerSelection.budgetBlockedCandidateAgentIds,
|
||
},
|
||
workspaceSelection: {
|
||
reuseRecoveryExecutionWorkspace,
|
||
inheritedExecutionWorkspaceFromIssueId: reuseRecoveryExecutionWorkspace ? recoveryIssue.id : null,
|
||
projectWorkspaceSourceIssueId: recoveryIssue.id,
|
||
},
|
||
},
|
||
});
|
||
|
||
const wake = await deps.enqueueWakeup(ownerSelection.agentId, {
|
||
source: "assignment",
|
||
triggerDetail: "system",
|
||
reason: "issue_assigned",
|
||
payload: withRecoveryModelProfileHint({
|
||
issueId: escalation.id,
|
||
sourceIssueId: issue.id,
|
||
recoveryIssueId: recoveryIssue.id,
|
||
incidentKey: input.finding.incidentKey,
|
||
}, "status_only"),
|
||
requestedByActorType: "system",
|
||
requestedByActorId: null,
|
||
contextSnapshot: withRecoveryModelProfileHint({
|
||
issueId: escalation.id,
|
||
taskId: escalation.id,
|
||
wakeReason: "issue_assigned",
|
||
source: RECOVERY_ORIGIN_KINDS.issueGraphLivenessEscalation,
|
||
sourceIssueId: issue.id,
|
||
recoveryIssueId: recoveryIssue.id,
|
||
incidentKey: input.finding.incidentKey,
|
||
}, "status_only"),
|
||
});
|
||
|
||
logger.warn({
|
||
incidentKey: input.finding.incidentKey,
|
||
findingState: input.finding.state,
|
||
sourceIssueId: issue.id,
|
||
recoveryIssueId: recoveryIssue.id,
|
||
escalationIssueId: escalation.id,
|
||
ownerAgentId: ownerSelection.agentId,
|
||
ownerSelectionReason: ownerSelection.reason,
|
||
wakeupRunId: wake?.id ?? null,
|
||
}, "created issue graph liveness escalation");
|
||
|
||
return { kind: "created" as const, escalationIssueId: escalation.id };
|
||
}
|
||
|
||
async function reconcileIssueGraphLiveness(opts?: {
|
||
runId?: string | null;
|
||
force?: boolean;
|
||
lookbackHours?: number;
|
||
}) {
|
||
const findings = await collectIssueGraphLivenessFindings();
|
||
const experimentalSettings = await instanceSettings.getExperimental();
|
||
const autoRecoveryEnabled = asBoolean(
|
||
experimentalSettings.enableIssueGraphLivenessAutoRecovery,
|
||
true,
|
||
) || opts?.force === true;
|
||
const lookbackHours = normalizeIssueGraphLivenessAutoRecoveryLookbackHours(
|
||
opts?.lookbackHours ?? experimentalSettings.issueGraphLivenessAutoRecoveryLookbackHours,
|
||
);
|
||
const now = new Date();
|
||
const cutoff = new Date(now.getTime() - lookbackHours * 60 * 60 * 1000);
|
||
const obsoleteRecoveryCleanup = await retireObsoleteLivenessRecoveryIssues(findings);
|
||
const doneRecoveryBlockerCleanup = await retireDoneLivenessRecoveryBlockers();
|
||
const updatedAtByIssueKey = await loadLivenessDependencyUpdatedAtByIssue(findings);
|
||
const result = {
|
||
findings: findings.length,
|
||
autoRecoveryEnabled,
|
||
lookbackHours,
|
||
cutoff: cutoff.toISOString(),
|
||
escalationsCreated: 0,
|
||
existingEscalations: 0,
|
||
skipped: 0,
|
||
skippedAutoRecoveryDisabled: 0,
|
||
skippedOutsideLookback: 0,
|
||
obsoleteRecoveriesRetired: obsoleteRecoveryCleanup.retired,
|
||
obsoleteRecoveriesActiveSkipped: obsoleteRecoveryCleanup.activeSkipped,
|
||
obsoleteRecoveryBlockerRelationsRemoved: obsoleteRecoveryCleanup.blockerRelationsRemoved,
|
||
doneRecoveryBlockerRelationsRemoved: doneRecoveryBlockerCleanup.blockerRelationsRemoved,
|
||
issueIds: [] as string[],
|
||
escalationIssueIds: [] as string[],
|
||
retiredRecoveryIssueIds: obsoleteRecoveryCleanup.retiredIssueIds,
|
||
};
|
||
|
||
if (!autoRecoveryEnabled) {
|
||
result.skippedAutoRecoveryDisabled = findings.length;
|
||
return result;
|
||
}
|
||
|
||
for (const finding of findings) {
|
||
if (!isLivenessFindingInsideAutoRecoveryLookback(finding, cutoff, updatedAtByIssueKey)) {
|
||
result.skippedOutsideLookback += 1;
|
||
result.skipped += 1;
|
||
continue;
|
||
}
|
||
const escalation = await createIssueGraphLivenessEscalation({
|
||
finding,
|
||
runId: opts?.runId ?? null,
|
||
});
|
||
if (escalation.kind === "created") {
|
||
result.escalationsCreated += 1;
|
||
result.issueIds.push(finding.issueId);
|
||
result.escalationIssueIds.push(escalation.escalationIssueId);
|
||
} else if (escalation.kind === "existing") {
|
||
result.existingEscalations += 1;
|
||
result.issueIds.push(finding.issueId);
|
||
result.escalationIssueIds.push(escalation.escalationIssueId);
|
||
} else {
|
||
result.skipped += 1;
|
||
}
|
||
}
|
||
|
||
return result;
|
||
}
|
||
|
||
function readRecoveryTimerIntervalMs(raw: unknown, fallback: number) {
|
||
return Math.max(1, Math.floor(asNumber(raw, fallback)));
|
||
}
|
||
|
||
return {
|
||
buildRunOutputSilence,
|
||
escalateStrandedRecoveryIssueInPlace,
|
||
escalateStrandedAssignedIssue,
|
||
recordWatchdogDecision,
|
||
scanSilentActiveRuns,
|
||
reconcileStrandedAssignedIssues,
|
||
buildIssueGraphLivenessAutoRecoveryPreview,
|
||
reconcileIssueGraphLiveness,
|
||
readRecoveryTimerIntervalMs,
|
||
};
|
||
}
|