From 1bd44c8a0dc2bfd5a8876143a11f8461fb57bf77 Mon Sep 17 00:00:00 2001
From: Devin Foley
Date: Wed, 13 May 2026 22:00:10 -0700
Subject: [PATCH] Harden Cloudflare sandbox execution (#5967)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Remote-managed adapters need sandbox/environment execution to behave like real agent runs, not just local host probes.
> - The Cloudflare sandbox path was the weakest leg in the SSH + Cloudflare QA matrix because bridge execution could truncate output, time out long-running installs, and under-provision the worker instance.
> - That made several adapters fail for reasons unrelated to their actual business logic, which blocks confidence in Paperclip's non-local environment model.
> - This pull request hardens the Cloudflare bridge/runtime path and adjusts sandbox probe budgets so adapter verification matches the measured behavior of the fixed environment.
> - It also corrects the Pi sandbox install command so the QA matrix exercises a real, supported install path.
> - The benefit is a materially more reliable SSH + Cloudflare adapter matrix with fewer false negatives and clearer failure boundaries.

## What Changed

- Switched the Cloudflare bridge worker instance type to `standard-2` for the QA-matrix execution path.
- Raised Cloudflare bridge/plugin-worker timeout budgets and added SSE keepalives so long-running install/exec calls can complete instead of dying at the transport layer.
- Fixed Cloudflare bridge-channel command handling to avoid dropped final stdout chunks on short-lived execs.
- Made Claude, OpenCode, and Cursor sandbox probe timeouts configurable/sandbox-aware, then tightened the defaults to the measured post-fix range.
- Updated the Pi sandbox install command to use the package currently installed by the official `pi.dev` installer, pinned to a specific npm version.
- Added/updated tests around Cloudflare bridge behavior and adapter sandbox probe paths.

## Verification

- `pnpm --filter @paperclipai/adapter-claude-local typecheck`
- `pnpm --filter @paperclipai/adapter-opencode-local typecheck`
- `pnpm --filter @paperclipai/adapter-cursor-local typecheck`
- `pnpm vitest run packages/adapters/cursor-local packages/adapters/claude-local packages/adapters/opencode-local packages/adapters/pi-local packages/plugins/sandbox-providers/cloudflare server/src/services/__tests__/plugin-worker-manager.test.ts`
- Manual QA on the dedicated dev instance using the SSH + Cloudflare environment matrix (`ENV-29` through `ENV-40`). Clean end-to-end passes: SSH `claude_local`, `codex_local`, `cursor`, `gemini_local`; Cloudflare `claude_local`, `codex_local`, `cursor`, `gemini_local`.

## Risks

- Cloudflare sandbox cost increases because the bridge worker now runs on `standard-2` instead of `lite`.
- Higher timeout ceilings can delay surfacing truly hung Cloudflare bridge calls, even though they remove transport-level false negatives.
- The manual heartbeat matrix still exposed follow-on execution/sync/disposition bugs in `opencode_local` and `pi_local`; those are not fixed by this PR.

## Model Used

- OpenAI `gpt-5.4` via Paperclip `codex_local`, reasoning effort `high`, tool use enabled, repo search enabled.

## Checklist - [x] I have included a thinking path that traces from project context to this change - [x] I have specified the model used (with version and capability details) - [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work - [x] I have run tests locally and they pass - [x] I have added or updated tests where applicable - [x] If this change affects the UI, I have included before/after screenshots (not applicable) - [x] I have updated relevant documentation to reflect my changes (not applicable) - [x] I have considered and documented any risks above - [x] I will address all Greptile and reviewer comments before requesting merge --------- Co-authored-by: Paperclip --- .../adapters/claude-local/src/server/test.ts | 10 +++- .../adapters/cursor-local/src/server/test.ts | 18 +++++- .../opencode-local/src/server/test.ts | 12 +++- packages/adapters/pi-local/src/index.ts | 2 +- .../cloudflare/bridge-template/src/routes.ts | 12 ++++ .../cloudflare/bridge-template/wrangler.jsonc | 2 +- .../cloudflare/src/config.ts | 2 +- .../cloudflare/src/plugin.test.ts | 55 ++++++++++++++++++- .../cloudflare/src/plugin.ts | 8 ++- server/src/services/plugin-worker-manager.ts | 4 +- 10 files changed, 113 insertions(+), 12 deletions(-) diff --git a/packages/adapters/claude-local/src/server/test.ts b/packages/adapters/claude-local/src/server/test.ts index 4b23fcb0..66e451cc 100644 --- a/packages/adapters/claude-local/src/server/test.ts +++ b/packages/adapters/claude-local/src/server/test.ts @@ -212,6 +212,14 @@ export async function testEnvironment( if (maxTurns > 0) args.push("--max-turns", String(maxTurns)); if (extraArgs.length > 0) args.push(...extraArgs); + // Sandbox bridges still add lease warmup and transport overhead, but + // the standard-2 Cloudflare tier now probes fast enough that a 90s + // budget leaves headroom without masking real hangs. 
+ const helloProbeTimeoutSec = Math.max( + 1, + asNumber(config.helloProbeTimeoutSec, targetIsSandbox ? 90 : 45), + ); + const probe = await runAdapterExecutionTargetProcess( runId, target, @@ -220,7 +228,7 @@ export async function testEnvironment( { cwd, env, - timeoutSec: 45, + timeoutSec: helloProbeTimeoutSec, graceSec: 5, stdin: "Respond with hello.", onLog: async () => {}, diff --git a/packages/adapters/cursor-local/src/server/test.ts b/packages/adapters/cursor-local/src/server/test.ts index cc2c2916..8059d897 100644 --- a/packages/adapters/cursor-local/src/server/test.ts +++ b/packages/adapters/cursor-local/src/server/test.ts @@ -4,6 +4,7 @@ import type { AdapterEnvironmentTestResult, } from "@paperclipai/adapter-utils"; import { + asNumber, asString, asStringArray, parseObject, @@ -98,6 +99,7 @@ export async function testEnvironment( let command = asString(config.command, "agent"); const target = ctx.executionTarget ?? null; const targetIsRemote = target?.kind === "remote"; + const targetIsSandbox = target?.kind === "remote" && target.transport === "sandbox"; const cwd = resolveAdapterExecutionTargetCwd(target, asString(config.cwd, ""), process.cwd()); const targetLabel = targetIsRemote ? ctx.environmentName ?? describeAdapterExecutionTarget(target) @@ -230,6 +232,12 @@ export async function testEnvironment( hint: "Use `agent` or `cursor-agent` to run the automatic installation and auth probe.", }); } else { + // Cursor's `agent` binary still pays cold-start overhead in container + // sandboxes, but standard-2 probes no longer need a 120s version budget. + const versionProbeTimeoutSec = Math.max( + 1, + asNumber(config.versionProbeTimeoutSec, targetIsSandbox ? 
60 : 45), + ); const versionProbe = await runAdapterExecutionTargetProcess( runId, target, @@ -238,7 +246,7 @@ export async function testEnvironment( { cwd, env, - timeoutSec: 45, + timeoutSec: versionProbeTimeoutSec, graceSec: 5, onLog: async () => {}, }, @@ -295,6 +303,12 @@ export async function testEnvironment( if (extraArgs.length > 0) args.push(...extraArgs); args.push("Respond with hello."); + // Sandbox bridges still add cursor CLI cold-start overhead, but the + // standard-2 tier now completes probes fast enough that 90s is ample. + const helloProbeTimeoutSec = Math.max( + 1, + asNumber(config.helloProbeTimeoutSec, targetIsSandbox ? 90 : 45), + ); const probe = await runAdapterExecutionTargetProcess( runId, target, @@ -303,7 +317,7 @@ export async function testEnvironment( { cwd, env, - timeoutSec: 45, + timeoutSec: helloProbeTimeoutSec, graceSec: 5, onLog: async () => {}, }, diff --git a/packages/adapters/opencode-local/src/server/test.ts b/packages/adapters/opencode-local/src/server/test.ts index 7335fda1..634d920c 100644 --- a/packages/adapters/opencode-local/src/server/test.ts +++ b/packages/adapters/opencode-local/src/server/test.ts @@ -9,6 +9,7 @@ import type { import type { AdapterExecutionTarget } from "@paperclipai/adapter-utils/execution-target"; import { asBoolean, + asNumber, asString, asStringArray, parseObject, @@ -72,6 +73,7 @@ export async function testEnvironment( const command = asString(config.command, "opencode"); const target = ctx.executionTarget ?? null; const targetIsRemote = target?.kind === "remote"; + const targetIsSandbox = target?.kind === "remote" && target.transport === "sandbox"; const cwd = resolveAdapterExecutionTargetCwd(target, asString(config.cwd, ""), process.cwd()); const targetLabel = targetIsRemote ? ctx.environmentName ?? 
describeAdapterExecutionTarget(target) @@ -334,6 +336,14 @@ export async function testEnvironment( if (variant) args.push("--variant", variant); if (extraArgs.length > 0) args.push(...extraArgs); + // Sandbox bridges still add cold-start and transport overhead, but the + // standard-2 Cloudflare tier now probes quickly enough that 90s keeps + // useful headroom without letting slow hangs linger. + const helloProbeTimeoutSec = Math.max( + 1, + asNumber(config.helloProbeTimeoutSec, targetIsSandbox ? 90 : 60), + ); + try { const probe = await runAdapterExecutionTargetProcess( runId, @@ -343,7 +353,7 @@ export async function testEnvironment( { cwd: runtimeCwd, env: runtimeEnv, - timeoutSec: 60, + timeoutSec: helloProbeTimeoutSec, graceSec: 5, stdin: "Respond with hello.", onLog: async () => {}, diff --git a/packages/adapters/pi-local/src/index.ts b/packages/adapters/pi-local/src/index.ts index 4d13eb76..fef8bc2b 100644 --- a/packages/adapters/pi-local/src/index.ts +++ b/packages/adapters/pi-local/src/index.ts @@ -3,7 +3,7 @@ import type { AdapterModelProfileDefinition } from "@paperclipai/adapter-utils"; export const type = "pi_local"; export const label = "Pi (local)"; -export const SANDBOX_INSTALL_COMMAND = "npm install -g @mariozechner/pi-coding-agent"; +export const SANDBOX_INSTALL_COMMAND = "npm install -g @earendil-works/pi-coding-agent@0.74.0"; export const models: Array<{ id: string; label: string }> = []; diff --git a/packages/plugins/sandbox-providers/cloudflare/bridge-template/src/routes.ts b/packages/plugins/sandbox-providers/cloudflare/bridge-template/src/routes.ts index 02369921..79dd758c 100644 --- a/packages/plugins/sandbox-providers/cloudflare/bridge-template/src/routes.ts +++ b/packages/plugins/sandbox-providers/cloudflare/bridge-template/src/routes.ts @@ -423,6 +423,17 @@ export async function handleBridgeRequest(request: Request, env: BridgeEnv): Pro const encoder = new TextEncoder(); const stream = new ReadableStream({ async start(controller) { + 
// Heartbeat keeps the SSE response alive during silent stretches + // (e.g. npm install downloading silently). SSE comment lines (`:`) + // are ignored by the client parser but keep the underlying HTTP + // connection from idling out at the Cloudflare edge. + const heartbeat = setInterval(() => { + try { + controller.enqueue(encoder.encode(": keepalive\n\n")); + } catch { + // Controller may already be closed; ignore. + } + }, 15_000); try { const result = await executeInSandbox({ sandbox, @@ -444,6 +455,7 @@ export async function handleBridgeRequest(request: Request, env: BridgeEnv): Pro error: error instanceof Error ? error.message : String(error), }))); } finally { + clearInterval(heartbeat); controller.close(); } }, diff --git a/packages/plugins/sandbox-providers/cloudflare/bridge-template/wrangler.jsonc b/packages/plugins/sandbox-providers/cloudflare/bridge-template/wrangler.jsonc index 24266c99..e306d229 100644 --- a/packages/plugins/sandbox-providers/cloudflare/bridge-template/wrangler.jsonc +++ b/packages/plugins/sandbox-providers/cloudflare/bridge-template/wrangler.jsonc @@ -7,7 +7,7 @@ { "class_name": "Sandbox", "image": "./Dockerfile", - "instance_type": "lite", + "instance_type": "standard-2", "max_instances": 10 } ], diff --git a/packages/plugins/sandbox-providers/cloudflare/src/config.ts b/packages/plugins/sandbox-providers/cloudflare/src/config.ts index 9aff3ac3..1ed62a26 100644 --- a/packages/plugins/sandbox-providers/cloudflare/src/config.ts +++ b/packages/plugins/sandbox-providers/cloudflare/src/config.ts @@ -3,7 +3,7 @@ import type { CloudflareDriverConfig } from "./types.js"; const DEFAULT_REQUESTED_CWD = "/workspace/paperclip"; const DEFAULT_SLEEP_AFTER = "10m"; const DEFAULT_TIMEOUT_MS = 300_000; -const DEFAULT_BRIDGE_REQUEST_TIMEOUT_MS = 30_000; +const DEFAULT_BRIDGE_REQUEST_TIMEOUT_MS = 300_000; const LOCALHOST_HOSTNAMES = new Set(["localhost", "127.0.0.1", "::1"]); function readTrimmedString(value: unknown): string | null { diff --git 
a/packages/plugins/sandbox-providers/cloudflare/src/plugin.test.ts b/packages/plugins/sandbox-providers/cloudflare/src/plugin.test.ts index 4452e97b..84a6077b 100644 --- a/packages/plugins/sandbox-providers/cloudflare/src/plugin.test.ts +++ b/packages/plugins/sandbox-providers/cloudflare/src/plugin.test.ts @@ -1,7 +1,7 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; -import plugin from "./plugin.js"; const fetchMock = vi.fn(); +let plugin: typeof import("./plugin.js").default; function jsonResponse(body: unknown, status = 200): Response { return new Response(JSON.stringify(body), { @@ -23,9 +23,11 @@ function requestBodyAt(index = 0): Record { } describe("Cloudflare sandbox provider plugin", () => { - beforeEach(() => { + beforeEach(async () => { fetchMock.mockReset(); vi.stubGlobal("fetch", fetchMock); + vi.resetModules(); + plugin = (await import("./plugin.js")).default; }); it("declares the Cloudflare environment lifecycle handlers", async () => { @@ -210,6 +212,12 @@ describe("Cloudflare sandbox provider plugin", () => { }); it("routes bridge-channel execute calls through a dedicated session", async () => { + // pluginLogger must be set for the streaming branch to be reachable, so + // we can assert that bridge-channel calls take the non-streaming path + // even when adapter sessions would otherwise stream. + await plugin.definition.setup?.({ + logger: { info: () => undefined, warn: () => undefined, error: () => undefined, debug: () => undefined }, + } as never); fetchMock.mockResolvedValueOnce( jsonResponse({ exitCode: 0, @@ -248,6 +256,49 @@ describe("Cloudflare sandbox provider plugin", () => { }, }); expect(requestBodyAt().env).not.toHaveProperty("PAPERCLIP_SANDBOX_EXEC_CHANNEL"); + // Bridge-channel commands must use the non-streaming exec path. 
The + // @cloudflare/sandbox SDK's streaming mode can drop the final stdout + // chunk when a short shell exits the same tick it writes — bridge ops + // carry machine-consumed stdout (readiness JSON, base64 file payloads, + // queue response bodies) where that data loss surfaces as opaque + // "invalid readiness JSON" / "Invalid bridge request payload" errors. + expect(requestBodyAt().streamOutput).toBe(false); + }); + + it("uses streaming exec for non-bridge adapter commands so live logs flow", async () => { + // Streaming is gated on `pluginLogger` being set, which normally happens + // in `setup()`. Wire a minimal logger so the streaming branch is reachable. + await plugin.definition.setup?.({ + logger: { info: () => undefined, warn: () => undefined, error: () => undefined, debug: () => undefined }, + } as never); + fetchMock.mockResolvedValueOnce( + new Response( + "event: stdout\ndata: {\"data\":\"hello\\n\"}\n\nevent: complete\ndata: {\"exitCode\":0,\"signal\":null,\"timedOut\":false,\"stdout\":\"hello\\n\",\"stderr\":\"\"}\n\n", + { + status: 200, + headers: { "Content-Type": "text/event-stream" }, + }, + ), + ); + + await plugin.definition.onEnvironmentExecute?.({ + driverKey: "cloudflare", + companyId: "company-1", + environmentId: "env-1", + lease: { providerLeaseId: "pc-run-1-abcd1234", metadata: {} }, + command: "echo", + args: ["hello"], + cwd: "/workspace/paperclip", + env: { KEEP_ME: "visible" }, + config: { + bridgeBaseUrl: "https://bridge.example.workers.dev", + bridgeAuthToken: "resolved-token", + sessionStrategy: "named", + sessionId: "paperclip", + }, + }); + + expect(requestBodyAt().streamOutput).toBe(true); }); it("maps lost-lease execute errors into a deterministic command failure", async () => { diff --git a/packages/plugins/sandbox-providers/cloudflare/src/plugin.ts b/packages/plugins/sandbox-providers/cloudflare/src/plugin.ts index ad579a45..63a96dbe 100644 --- a/packages/plugins/sandbox-providers/cloudflare/src/plugin.ts +++ 
b/packages/plugins/sandbox-providers/cloudflare/src/plugin.ts @@ -317,7 +317,13 @@ const plugin = definePlugin({ const { config, client } = bridgeClientFor(params.config); const session = resolveExecuteSession(config, params.env); try { - const streamingOptions = pluginLogger + // Bridge-channel commands carry machine-consumed stdout (JSON, base64, + // file contents). The @cloudflare/sandbox SDK's streaming mode can drop + // the final stdout chunk when the inner shell exits the same tick as it + // writes (e.g. `cat ready.json && exit 0`), so we never stream for + // bridge control traffic — only adapter sessions get live log forwarding. + const isBridgeChannel = params.env?.[SANDBOX_EXEC_CHANNEL_ENV] === SANDBOX_EXEC_CHANNEL_BRIDGE; + const streamingOptions = pluginLogger && !isBridgeChannel ? { onOutput: async (stream: "stdout" | "stderr", chunk: string) => { logCloudflareExecChunk(pluginLogger, stream, chunk); diff --git a/server/src/services/plugin-worker-manager.ts b/server/src/services/plugin-worker-manager.ts index cdb0eb0b..daedc521 100644 --- a/server/src/services/plugin-worker-manager.ts +++ b/server/src/services/plugin-worker-manager.ts @@ -57,8 +57,8 @@ import { logger } from "../middleware/logger.js"; /** Default timeout for RPC calls in milliseconds. */ const DEFAULT_RPC_TIMEOUT_MS = 30_000; -/** Hard upper bound for any RPC timeout (5 minutes). Prevents unbounded waits. */ -const MAX_RPC_TIMEOUT_MS = 5 * 60 * 1_000; +/** Hard upper bound for any RPC timeout (15 minutes). Prevents unbounded waits. */ +const MAX_RPC_TIMEOUT_MS = 15 * 60 * 1_000; /** Timeout for the initialize RPC call. */ const INITIALIZE_TIMEOUT_MS = 15_000;