diff --git a/scripts/smoke/terminal-bench-loop-skill-smoke.mjs b/scripts/smoke/terminal-bench-loop-skill-smoke.mjs index 9674b675..f8253a59 100755 --- a/scripts/smoke/terminal-bench-loop-skill-smoke.mjs +++ b/scripts/smoke/terminal-bench-loop-skill-smoke.mjs @@ -91,6 +91,7 @@ async function assertLocalSkillPackage() { "diagnosis", "blockedByIssueIds", "PAPERCLIPAI_CMD", + "PAPERCLIP_HARBOR_RUNNER_CONFIG", ]) { assert(markdown.includes(expected), `Skill smoke expected ${skillPath} to mention ${expected}`); } @@ -194,6 +195,8 @@ async function main() { `- Manifest: ${artifactRoot}/manifest.json`, `- Results JSONL: ${artifactRoot}/results.jsonl`, `- Harbor raw job folder: ${artifactRoot}/harbor/raw-job`, + "- Dispatch config: PAPERCLIP_HARBOR_RUNNER_CONFIG=", + "- Heartbeat-enabled agents: 0 (harness/setup no-dispatch; not a product signal)", "", "No benchmark process, Harbor job, model call, or provider call was started.", ].join("\n"), @@ -309,6 +312,7 @@ async function main() { `Expected iteration issue to be in_review, got ${verifiedIteration.status}`, ); assert(verifiedRunDoc.body.includes(`${artifactRoot}/results.jsonl`), "Expected run doc to include mocked results path"); + assert(verifiedRunDoc.body.includes("PAPERCLIP_HARBOR_RUNNER_CONFIG"), "Expected run doc to record dispatch config"); assert( verifiedDiagnosisDoc.body.includes("Exact stop point") && verifiedDiagnosisDoc.body.includes("Next-action owner"), "Expected diagnosis doc to include exact stop point and next-action owner", diff --git a/skills/paperclip/SKILL.md b/skills/paperclip/SKILL.md index a3bce7c1..241e237b 100644 --- a/skills/paperclip/SKILL.md +++ b/skills/paperclip/SKILL.md @@ -233,6 +233,10 @@ For commands, response fields, and MCP tools, read: - **Hiring**: use the `paperclip-create-agent` skill for new agent creation workflows (links to reusable `AGENTS.md` templates like `Coder` and `QA`). 
- **Commit Co-author**: if you make a git commit you MUST add EXACTLY `Co-Authored-By: Paperclip ` to the end of each commit message. Do not put in your agent name, put `Co-Authored-By: Paperclip `. +This is rule #1: + +IMPORTANT: **NEVER ASK A HUMAN TO DO WHAT AN AGENT COULD DO**. If you need to escalate, escalate. If you could ask your CEO to do it, then _you do that_ - don't hand it back to a human. Again: Never ask a human to do what an agent _could_ do. Rule number 1. + ## Comment Style (Required) When posting issue comments or writing issue descriptions, use concise markdown with: @@ -349,3 +353,5 @@ Results are ranked by relevance: title matches first, then identifier, descripti ## Full Reference For detailed API tables, JSON response schemas, worked examples (IC and Manager heartbeats), governance/approvals, cross-team delegation rules, error codes, issue lifecycle diagram, and the common mistakes table, read: `skills/paperclip/references/api-reference.md` + +Again, rule #1 is: never ask a human to do what an agent could do. Try harder. Try again. Ask another agent to help. Keep working until the goal is fully accomplished. diff --git a/skills/terminal-bench-loop/SKILL.md b/skills/terminal-bench-loop/SKILL.md index 8311d720..ce2c4f22 100644 --- a/skills/terminal-bench-loop/SKILL.md +++ b/skills/terminal-bench-loop/SKILL.md @@ -59,6 +59,7 @@ Collect these on the top-level loop issue before iteration 1. Any input that can - **Iteration budget.** Maximum number of iterations before the loop must stop without further fixes (typical: 3–5). Also record a per-iteration wall-clock cap. - **Paperclip App worktree issue.** The implementation-side issue under the Paperclip App project whose execution workspace owns the isolated worktree. First iteration creates it; later iterations reuse it via `inheritExecutionWorkspaceFromIssueId` or equivalent. 
- **Benchmark command.** The exact `paperclip-bench` invocation, including the `PAPERCLIPAI_CMD` (or equivalent) binding pinned to the Paperclip App worktree under test. Record verbatim on the loop issue. +- **Dispatch runner config.** The exact Harbor/Paperclip runner dispatch config required for the smoke to actually start a Paperclip heartbeat. For the current Harbor wrapper, record the `PAPERCLIP_HARBOR_RUNNER_CONFIG` JSON (or equivalent config file) verbatim, or at least in enough detail to preserve: `assignee`, `heartbeat_strategy`, `agent_adapter` / `agent_adapters`, `reuse_host_home` when local credentials are intentionally needed, and the stop budget. A bare Harbor command that creates `BEN-1` as unassigned `todo` with zero heartbeat-enabled agents is a harness/setup failure, not a valid product diagnosis. - **Latest artifact root.** Filesystem or storage path under which `paperclip-bench` writes run artifacts (manifest, `results.jsonl`, Harbor raw job folders, redacted telemetry). Each iteration appends; nothing is overwritten. - **Approval policy.** Who must accept a proposed product fix before implementation (default: board via `request_confirmation`; CTO if delegated; never the loop driver alone). @@ -95,11 +96,14 @@ Before opening or advancing a loop, read `doc/execution-semantics.md`. Use that ### 3. Run the bounded smoke - The benchmark command must use the Paperclip App worktree under test. Set `PAPERCLIPAI_CMD` (or the equivalent command binding) to the CLI entrypoint inside that worktree. Never let the smoke run against the operator's current Paperclip checkout. +- The same command block must include the runner dispatch config that makes the benchmark issue actionable. For the current Harbor wrapper, export `PAPERCLIP_HARBOR_RUNNER_CONFIG` with the intended assignee, heartbeat strategy, agent adapter, credential/home mode, and stop budget. 
Do not treat a bare `uvx harbor run ...` as the canonical smoke if it omits the dispatch config; record that as a harness/setup miss and rerun with the recorded config. - Bound the run by wall-clock and by Paperclip's run-budget controls. If the smoke would exceed the per-iteration cap, kill it and record the truncation reason. - Capture, in the iteration child or a dedicated `run` document: - Paperclip run id and heartbeat run ids - benchmark run id, manifest, `results.jsonl` row, Harbor raw job folder + - dispatch config used (`PAPERCLIP_HARBOR_RUNNER_CONFIG` or equivalent), including assignee and adapter type - the exact stop reason reported by the harness (pass, harness fail, verifier fail, timeout, agent gave up, infrastructure error) + - heartbeat-enabled and heartbeat-observed agent counts when Paperclip telemetry exports them - failure taxonomy bucket (task/model, Paperclip product, harness/setup, verifier/infrastructure, security, unclear) - artifact paths under the latest artifact root - Label the iteration as **smoke / non-comparable**. Comparable runs are out of scope for this skill. @@ -172,7 +176,7 @@ The loop must not test whatever Paperclip checkout happens to be current for the - The first iteration creates the Paperclip App implementation child; that project's git-worktree policy spawns a fresh worktree. - The loop issue records the worktree-owning issue id and the workspace path (or workspace id). - Every later implementation, QA, and rerun child sets `inheritExecutionWorkspaceFromIssueId` to that worktree-owning issue, so all subsequent loop work shares one workspace. -- The benchmark command always sets `PAPERCLIPAI_CMD` (or the equivalent command binding) to the CLI entrypoint inside that worktree. The benchmark command stored on the loop issue is the source of truth — if a heartbeat needs to run the smoke from a different shell, it copies the recorded command verbatim. 
+- The benchmark command always sets `PAPERCLIPAI_CMD` (or the equivalent command binding) to the CLI entrypoint inside that worktree, and it carries the recorded dispatch runner config (`PAPERCLIP_HARBOR_RUNNER_CONFIG` or equivalent) needed to assign the benchmark issue and start the heartbeat. The benchmark command stored on the loop issue is the source of truth — if a heartbeat needs to run the smoke from a different shell, it copies the recorded command block verbatim, not only the Harbor invocation line. - If the workspace is pruned or the worktree path no longer resolves, the loop is invalid until rebuilt. Mark the loop `blocked` and name the unblock owner (typically CodexCoder or the Paperclip App owner). ## Liveness rule @@ -189,6 +193,7 @@ If a loop issue does not fit one of these on exit, the heartbeat is not done. Fi ## Pitfalls - **Running the smoke against the operator's Paperclip checkout.** The whole point of the worktree rule is that the bench tests the worktree the fix lands in. Always set `PAPERCLIPAI_CMD` and verify the path before launching the run. +- **Dropping the dispatch config.** A Harbor run that omits `PAPERCLIP_HARBOR_RUNNER_CONFIG` (or equivalent) may boot Paperclip and create `BEN-1`, but leave it unassigned with zero heartbeat-enabled agents. That is not a Terminal-Bench product signal. Preserve and rerun the full command block, including assignee and adapter config. - **Coding before approval.** No implementation child exists until a board confirmation accepts the iteration's `plan` document. Do not push code in the diagnostic phase. - **Skipping the recent-work survey.** When proposing a Paperclip product rule, check what already shipped in the affected liveness/execution area in the last few days. A rule that contradicts last week's accepted contract is rework. 
- **Letting `in_review` mean done.** A loop or iteration child sitting in `in_review` with no participant, no interaction, no approval, and no human owner is a stop, not progress. Treat it as a liveness violation and route it. @@ -200,10 +205,11 @@ If a loop issue does not fit one of these on exit, the heartbeat is not done. Fi ## Verification checklist (before exiting a heartbeat that touched the loop) -- [ ] All inputs are recorded on the top-level loop issue, including the exact benchmark command and `PAPERCLIPAI_CMD` binding. +- [ ] All inputs are recorded on the top-level loop issue, including the exact benchmark command, `PAPERCLIPAI_CMD` binding, and dispatch runner config. - [ ] Iteration counter is up to date and within budget. - [ ] The Paperclip App worktree pointer still resolves, and the iteration's run/implementation/rerun children share that workspace. - [ ] The smoke run is captured with run ids, manifest, `results.jsonl`, Harbor raw job folder, and stop reason. +- [ ] Paperclip telemetry shows the benchmark issue was assigned and a heartbeat was enabled/observed, or the iteration is explicitly classified as harness/setup no-dispatch. - [ ] Diagnosis applies the `/diagnose-why-work-stopped` pattern, classifies every non-progressing issue, and checks the three invariants. - [ ] No implementation child exists for an unapproved fix proposal; if one was proposed, a `request_confirmation` is open against the latest plan revision. - [ ] Every loop and iteration issue rests in a terminal, explicitly-live, explicitly-waiting, or named-blocker state.