Files
paperclip/scripts/smoke/terminal-bench-loop-skill-smoke.mjs
Dotta 685ee84e4a [codex] Document terminal bench dispatch config (#4961)
## Thinking Path

> - Paperclip agents rely on skills for repeatable operating procedures
> - The Terminal-Bench loop skill needs to preserve enough dispatch
configuration to reproduce real heartbeat behavior
> - A bare benchmark command can create unassigned work with no
heartbeat-enabled agent, which is a harness setup failure rather than
product evidence
> - The Paperclip heartbeat skill also needs to keep escalation biased
toward agent-owned follow-through
> - This pull request documents dispatch runner config requirements and
strengthens the agent follow-through rule
> - The benefit is fewer misleading benchmark loops and clearer agent
operating guidance

## What Changed

- Documented `PAPERCLIP_HARBOR_RUNNER_CONFIG` / runner dispatch config
as required Terminal-Bench loop input.
- Updated the Terminal-Bench loop smoke check to require the dispatch
config mention.
- Added stronger Paperclip skill guidance to avoid asking humans for
work an agent can perform.

## Verification

- `pnpm smoke:terminal-bench-loop-skill`

## Risks

- Low risk: documentation and smoke expectation changes only. The
stricter smoke assertion is intentional so future edits do not drop the
dispatch config requirement.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and
discuss it in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected — check the roadmap
first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5 coding agent, tool use and local command
execution. Exact context window was not exposed in the runtime.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
2026-05-01 12:00:47 -05:00

362 lines
12 KiB
JavaScript
Executable File

#!/usr/bin/env node
import { readFile } from "node:fs/promises";
import { join } from "node:path";
import { fileURLToPath } from "node:url";
const repoRoot = join(fileURLToPath(new URL(".", import.meta.url)), "..", "..");
function parseArgs(argv) {
const parsed = {
keep: false,
sourceIssueId: process.env.PAPERCLIP_TASK_ID ?? null,
projectId: process.env.PAPERCLIP_PROJECT_ID ?? null,
goalId: process.env.PAPERCLIP_GOAL_ID ?? null,
runKey: null,
};
for (let index = 0; index < argv.length; index += 1) {
const arg = argv[index];
if (arg === "--keep") {
parsed.keep = true;
continue;
}
if (arg === "--source-issue-id") {
parsed.sourceIssueId = argv[++index] ?? null;
continue;
}
if (arg === "--project-id") {
parsed.projectId = argv[++index] ?? null;
continue;
}
if (arg === "--goal-id") {
parsed.goalId = argv[++index] ?? null;
continue;
}
if (arg === "--run-key") {
parsed.runKey = argv[++index] ?? null;
continue;
}
if (arg === "--help" || arg === "-h") {
printUsage();
process.exit(0);
}
throw new Error(`Unknown argument: ${arg}`);
}
return parsed;
}
function printUsage() {
console.log(`
Usage:
PAPERCLIP_API_URL=http://localhost:3100 \\
PAPERCLIP_API_KEY=... \\
PAPERCLIP_COMPANY_ID=... \\
pnpm smoke:terminal-bench-loop-skill
Options:
--source-issue-id <uuid> Attach smoke issues under an existing Paperclip issue.
--project-id <uuid> Override inferred project id.
--goal-id <uuid> Override inferred goal id.
--run-key <string> Stable key used in smoke titles and mocked artifact paths.
--keep Leave smoke issues in their verified blocked/in_review posture.
`);
}
function requireEnv(name) {
const value = process.env[name];
if (!value) {
throw new Error(`${name} is required. Run against a local Paperclip server with an agent or board API token.`);
}
return value;
}
function slugify(value) {
return value.replace(/[^a-zA-Z0-9_.-]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 80);
}
function assert(condition, message) {
if (!condition) {
throw new Error(message);
}
}
async function assertLocalSkillPackage() {
const skillPath = join(repoRoot, "skills", "terminal-bench-loop", "SKILL.md");
const markdown = await readFile(skillPath, "utf8");
for (const expected of [
"name: terminal-bench-loop",
"request_confirmation",
"diagnosis",
"blockedByIssueIds",
"PAPERCLIPAI_CMD",
"PAPERCLIP_HARBOR_RUNNER_CONFIG",
]) {
assert(markdown.includes(expected), `Skill smoke expected ${skillPath} to mention ${expected}`);
}
}
function createApiClient({ apiUrl, apiKey, runId }) {
const baseUrl = apiUrl.replace(/\/+$/, "");
return async function api(method, path, { body, ok } = {}) {
const expectedStatuses = ok ?? (method === "POST" || method === "PUT" ? [200, 201] : [200]);
const headers = {
Authorization: `Bearer ${apiKey}`,
Accept: "application/json",
};
if (body !== undefined) {
headers["Content-Type"] = "application/json";
}
if (runId && method !== "GET") {
headers["X-Paperclip-Run-Id"] = runId;
}
const response = await fetch(`${baseUrl}${path}`, {
method,
headers,
body: body === undefined ? undefined : JSON.stringify(body),
});
const text = await response.text();
const data = text ? JSON.parse(text) : null;
if (!expectedStatuses.includes(response.status)) {
throw new Error(`${method} ${path} returned ${response.status}: ${text}`);
}
return data;
};
}
async function main() {
const args = parseArgs(process.argv.slice(2));
const apiUrl = requireEnv("PAPERCLIP_API_URL");
const apiKey = requireEnv("PAPERCLIP_API_KEY");
const companyId = requireEnv("PAPERCLIP_COMPANY_ID");
const runId = process.env.PAPERCLIP_RUN_ID ?? null;
const api = createApiClient({ apiUrl, apiKey, runId });
await assertLocalSkillPackage();
const sourceIssue = args.sourceIssueId
? await api("GET", `/api/issues/${args.sourceIssueId}`)
: null;
const projectId = args.projectId ?? sourceIssue?.projectId ?? null;
const goalId = args.goalId ?? sourceIssue?.goalId ?? null;
const runKey = slugify(args.runKey ?? runId ?? `local-${new Date().toISOString()}`);
const artifactRoot = `mock://terminal-bench-loop-smoke/${runKey}`;
const titlePrefix = `[smoke:${runKey}]`;
const commonIssueFields = {
...(projectId ? { projectId } : {}),
...(goalId ? { goalId } : {}),
priority: "low",
};
const loop = await api("POST", `/api/companies/${companyId}/issues`, {
body: {
...commonIssueFields,
...(sourceIssue ? { parentId: sourceIssue.id } : {}),
title: `${titlePrefix} Terminal-Bench loop skill smoke`,
status: "todo",
description: [
"Deterministic smoke for the /terminal-bench-loop skill.",
"",
"- Task: terminal-bench/fix-git",
"- Iteration budget: 1",
"- Benchmark command: mocked; no Terminal-Bench, Harbor, model, or provider process is started.",
`- Artifact root: ${artifactRoot}`,
].join("\n"),
},
});
const iteration = await api("POST", `/api/companies/${companyId}/issues`, {
body: {
...commonIssueFields,
parentId: loop.id,
title: `${titlePrefix} Iteration 1: terminal-bench/fix-git`,
status: "todo",
description: [
"Smoke iteration child created by the deterministic terminal-bench-loop skill smoke.",
"",
"This issue records mocked run artifacts, diagnosis, and the pending confirmation path.",
].join("\n"),
},
});
const runDocument = await api("PUT", `/api/issues/${iteration.id}/documents/run`, {
body: {
title: "Mocked benchmark run",
format: "markdown",
body: [
"# Mocked benchmark run",
"",
"- Label: smoke / non-comparable",
"- Terminal-Bench task: terminal-bench/fix-git",
"- Stop reason: verifier_failed",
`- Manifest: ${artifactRoot}/manifest.json`,
`- Results JSONL: ${artifactRoot}/results.jsonl`,
`- Harbor raw job folder: ${artifactRoot}/harbor/raw-job`,
"- Dispatch config: PAPERCLIP_HARBOR_RUNNER_CONFIG=<omitted - harness/setup no-dispatch smoke>",
"- Heartbeat-enabled agents: 0 (harness/setup no-dispatch; not a product signal)",
"",
"No benchmark process, Harbor job, model call, or provider call was started.",
].join("\n"),
changeSummary: "Record deterministic mocked benchmark artifact paths.",
},
});
const diagnosisDocument = await api("PUT", `/api/issues/${iteration.id}/documents/diagnosis`, {
body: {
title: "Smoke diagnosis",
format: "markdown",
body: [
"# Smoke diagnosis",
"",
`Exact stop point: ${iteration.identifier ?? iteration.id} is waiting on a product-fix confirmation after a mocked verifier failure.`,
"",
"Next-action owner: board/user must accept or reject the confirmation before implementation subtasks exist.",
"",
"Failure taxonomy: Paperclip product gap, mocked for smoke coverage.",
"",
"Invariant check:",
"",
"- Productive work continues: acceptance wakes the assignee and would create the implementation path.",
"- Only real blockers stop work: the loop parent is blocked by this iteration child while the confirmation is pending.",
"- No infinite loops: iteration budget is 1 and the smoke does not start a rerun.",
].join("\n"),
changeSummary: "Record exact stop point and next-action owner.",
},
});
const planDocument = await api("PUT", `/api/issues/${iteration.id}/documents/plan`, {
body: {
title: "Smoke fix proposal",
format: "markdown",
body: [
"# Smoke fix proposal",
"",
"Proposed product rule: a Terminal-Bench loop iteration that identifies a product gap must create a request_confirmation interaction before implementation subtasks exist.",
"",
`Evidence: mocked run document ${runDocument.id}; diagnosis document ${diagnosisDocument.id}.`,
].join("\n"),
changeSummary: "Record smoke proposal for confirmation target.",
},
});
const confirmation = await api("POST", `/api/issues/${iteration.id}/interactions`, {
body: {
kind: "request_confirmation",
idempotencyKey: `confirmation:${iteration.id}:plan:${planDocument.latestRevisionId}`,
title: "Smoke plan confirmation",
continuationPolicy: "wake_assignee",
payload: {
version: 1,
prompt: "Accept the mocked terminal-bench-loop product-fix proposal?",
acceptLabel: "Accept smoke plan",
rejectLabel: "Reject smoke plan",
rejectRequiresReason: true,
rejectReasonLabel: "What should change?",
detailsMarkdown: "This deterministic smoke verifies the waiting path only; do not treat it as a real benchmark result.",
supersedeOnUserComment: true,
target: {
type: "issue_document",
issueId: iteration.id,
documentId: planDocument.id,
key: "plan",
revisionId: planDocument.latestRevisionId,
revisionNumber: planDocument.latestRevisionNumber,
label: "Smoke fix proposal",
},
},
},
});
await api("PATCH", `/api/issues/${iteration.id}`, {
body: {
status: "in_review",
comment: [
"Smoke waiting path opened.",
"",
`Pending confirmation: ${confirmation.id}`,
"Next-action owner: board/user accepts or rejects the mocked proposal.",
].join("\n"),
},
});
await api("PATCH", `/api/issues/${loop.id}`, {
body: {
status: "blocked",
blockedByIssueIds: [iteration.id],
comment: [
"Smoke loop parent is blocked by its iteration child while the typed confirmation is pending.",
"",
`Blocking iteration: ${iteration.identifier ?? iteration.id}`,
].join("\n"),
},
});
const [verifiedLoop, verifiedIteration, verifiedRunDoc, verifiedDiagnosisDoc, interactions] = await Promise.all([
api("GET", `/api/issues/${loop.id}`),
api("GET", `/api/issues/${iteration.id}`),
api("GET", `/api/issues/${iteration.id}/documents/run`),
api("GET", `/api/issues/${iteration.id}/documents/diagnosis`),
api("GET", `/api/issues/${iteration.id}/interactions`),
]);
assert(verifiedLoop.status === "blocked", `Expected loop issue to be blocked, got ${verifiedLoop.status}`);
assert(
Array.isArray(verifiedLoop.blockedBy) && verifiedLoop.blockedBy.some((blocker) => blocker.id === iteration.id),
"Expected loop issue to be blocked by the iteration child",
);
assert(
verifiedIteration.status === "in_review",
`Expected iteration issue to be in_review, got ${verifiedIteration.status}`,
);
assert(verifiedRunDoc.body.includes(`${artifactRoot}/results.jsonl`), "Expected run doc to include mocked results path");
assert(verifiedRunDoc.body.includes("PAPERCLIP_HARBOR_RUNNER_CONFIG"), "Expected run doc to record dispatch config");
assert(
verifiedDiagnosisDoc.body.includes("Exact stop point") && verifiedDiagnosisDoc.body.includes("Next-action owner"),
"Expected diagnosis doc to include exact stop point and next-action owner",
);
assert(
interactions.some((interaction) =>
interaction.id === confirmation.id
&& interaction.kind === "request_confirmation"
&& interaction.status === "pending"
&& interaction.continuationPolicy === "wake_assignee"
),
"Expected a pending request_confirmation interaction with wake_assignee continuation",
);
if (!args.keep) {
await api("PATCH", `/api/issues/${loop.id}`, {
body: {
status: "cancelled",
blockedByIssueIds: [],
comment: "Smoke cleanup: verified topology and cancelled the short-lived loop parent.",
},
});
await api("PATCH", `/api/issues/${iteration.id}`, {
body: {
status: "cancelled",
comment: "Smoke cleanup: verified confirmation/waiting posture and cancelled the short-lived iteration child.",
},
});
}
console.log(JSON.stringify({
ok: true,
cleanup: !args.keep,
loopIssue: { id: loop.id, identifier: loop.identifier ?? null },
iterationIssue: { id: iteration.id, identifier: iteration.identifier ?? null },
runDocument: runDocument.id,
diagnosisDocument: diagnosisDocument.id,
confirmation: confirmation.id,
artifactRoot,
}, null, 2));
}
main().catch((error) => {
console.error(`terminal-bench-loop skill smoke failed: ${error.message}`);
process.exit(1);
});