Files
paperclip/packages/adapters/codex-local/src/server/parse.ts
T
Dotta 09d0678840 [codex] Harden heartbeat scheduling and runtime controls (#4223)
## Thinking Path

> - Paperclip orchestrates AI agents through issue checkout, heartbeat
runs, routines, and auditable control-plane state
> - The runtime path has to recover from lost local processes, transient
adapter failures, blocked dependencies, and routine coalescing without
stranding work
> - The existing branch carried several reliability fixes across
heartbeat scheduling, issue runtime controls, routine dispatch, and
operator-facing run state
> - These changes belong together because they share backend contracts,
migrations, and runtime status semantics
> - This pull request groups the control-plane/runtime slice so it can
merge independently from board UI polish and adapter sandbox work
> - The benefit is safer heartbeat recovery, clearer runtime controls,
and more predictable recurring execution behavior

## What Changed

- Adds bounded heartbeat retry scheduling, scheduled retry state, and
Codex transient failure recovery handling.
- Tightens heartbeat process recovery, blocker wake behavior, issue
comment wake handling, routine dispatch coalescing, and
activity/dashboard bounds.
- Adds runtime-control MCP tools and Paperclip skill docs for issue
workspace runtime management.
- Adds migrations `0061_lively_thor_girl.sql` and
`0062_routine_run_dispatch_fingerprint.sql`.
- Surfaces retry state in run ledger/agent UI and keeps related shared
types synchronized.

## Verification

- `pnpm exec vitest run
server/src/__tests__/heartbeat-retry-scheduling.test.ts
server/src/__tests__/heartbeat-process-recovery.test.ts
server/src/__tests__/routines-service.test.ts`
- `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server`

## Risks

- Medium risk: this touches heartbeat recovery and routine dispatch,
which are central execution paths.
- Migration order matters if split branches land out of order: merge
this PR before branches that assume the new runtime/routine fields.
- Runtime retry behavior should be watched in CI and in local operator
smoke tests because it changes how transient failures are resumed.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and
discuss it in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected — check the roadmap
first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use
enabled. Exact hosted model build and context window are not exposed in
this Paperclip heartbeat environment.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge
2026-04-21 12:24:11 -05:00

100 lines
3.1 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { asString, asNumber, parseObject, parseJson } from "@paperclipai/adapter-utils/server-utils";
const CODEX_TRANSIENT_UPSTREAM_RE =
/(?:we(?:'|)re\s+currently\s+experiencing\s+high\s+demand|temporary\s+errors|rate[-\s]?limit(?:ed)?|too\s+many\s+requests|\b429\b|server\s+overloaded|service\s+unavailable|try\s+again\s+later)/i;
const CODEX_REMOTE_COMPACTION_RE = /remote\s+compact\s+task/i;
export function parseCodexJsonl(stdout: string) {
let sessionId: string | null = null;
let finalMessage: string | null = null;
let errorMessage: string | null = null;
const usage = {
inputTokens: 0,
cachedInputTokens: 0,
outputTokens: 0,
};
for (const rawLine of stdout.split(/\r?\n/)) {
const line = rawLine.trim();
if (!line) continue;
const event = parseJson(line);
if (!event) continue;
const type = asString(event.type, "");
if (type === "thread.started") {
sessionId = asString(event.thread_id, sessionId ?? "") || sessionId;
continue;
}
if (type === "error") {
const msg = asString(event.message, "").trim();
if (msg) errorMessage = msg;
continue;
}
if (type === "item.completed") {
const item = parseObject(event.item);
if (asString(item.type, "") === "agent_message") {
const text = asString(item.text, "");
if (text) finalMessage = text;
}
continue;
}
if (type === "turn.completed") {
const usageObj = parseObject(event.usage);
usage.inputTokens = asNumber(usageObj.input_tokens, usage.inputTokens);
usage.cachedInputTokens = asNumber(usageObj.cached_input_tokens, usage.cachedInputTokens);
usage.outputTokens = asNumber(usageObj.output_tokens, usage.outputTokens);
continue;
}
if (type === "turn.failed") {
const err = parseObject(event.error);
const msg = asString(err.message, "").trim();
if (msg) errorMessage = msg;
}
}
return {
sessionId,
summary: finalMessage?.trim() ?? "",
usage,
errorMessage,
};
}
export function isCodexUnknownSessionError(stdout: string, stderr: string): boolean {
const haystack = `${stdout}\n${stderr}`
.split(/\r?\n/)
.map((line) => line.trim())
.filter(Boolean)
.join("\n");
return /unknown (session|thread)|session .* not found|thread .* not found|conversation .* not found|missing rollout path for thread|state db missing rollout path|no rollout found for thread id/i.test(
haystack,
);
}
export function isCodexTransientUpstreamError(input: {
stdout?: string | null;
stderr?: string | null;
errorMessage?: string | null;
}): boolean {
const haystack = [
input.errorMessage ?? "",
input.stdout ?? "",
input.stderr ?? "",
]
.join("\n")
.split(/\r?\n/)
.map((line) => line.trim())
.filter(Boolean)
.join("\n");
if (!CODEX_TRANSIENT_UPSTREAM_RE.test(haystack)) return false;
// Keep automatic retries scoped to the observed remote-compaction/high-demand
// failure shape; broader 429s may be caused by user or account limits.
return CODEX_REMOTE_COMPACTION_RE.test(haystack) || /high\s+demand|temporary\s+errors/i.test(haystack);
}