Run explicit-environment adapter tests on the requested target instead of falling back to the host (#5277)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies
> - When a user clicks "Test" on a configured environment (SSH or sandbox), the agent-test route exercises the adapter against that target
> - The route previously fell back to running the probe on the Paperclip host whenever an explicit environment target couldn't be resolved, with the test report still saying "passed"
> - That hid two real failure modes: misconfigured environments looked green, and sandbox environments were never actually exercised
> - This pull request acquires an ad-hoc lease and realizes a workspace for sandbox/plugin test environments, resolves a sandbox execution target wired to the environment runtime, and returns synthesized diagnostics instead of running a host probe when an explicit env target can't be resolved
> - The benefit is the Test action surfaces the real environment state and never silently exercises the wrong machine

## What Changed

- `server/routes/agents.ts`: acquire an ad-hoc lease and realize a workspace for sandbox/plugin test environments; resolve a sandbox execution target wired to the environment runtime
- Return synthesized diagnostics (no host fallback) when an explicit env target can't be resolved
- `server/services/environment-runtime.ts`: small adjustments to support the explicit-env-target case — `heartbeatRunId` may now be `null` for ad-hoc test leases, which never resume an existing provider lease, are always stored as `ephemeral`, and pass a fresh UUID to the provider as the run id
- Clarify test-route messages so they no longer claim a host fallback in explicit env flows
- New `agent-test-environment-routes.test.ts` covers the no-host-fallback guard and the missing-environment path, plus the sandbox happy path and the workspace-realization failure path

## Verification

- `pnpm vitest run --no-coverage server/src/__tests__/agent-test-environment-routes.test.ts`
- `pnpm typecheck` clean
- Manual: a deliberately misconfigured sandbox environment now reports diagnostics instead of a misleading host-pass

## Risks

Medium — Test route behavior change. Explicit environments that previously appeared to pass via host fallback will now report their real state. This is the desired behavior, but operators should expect to see new failures for environments that were never actually working.

## Model Used

Claude Opus 4.7 (1M context)

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable — new tests cover guard + missing-env paths
- [x] If this change affects the UI, I have included before/after screenshots — N/A (no UI)
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge
2026-05-05 08:00:32 -07:00
parent 9042b8d042
commit 5c2f9aba9d
3 changed files with 578 additions and 56 deletions
@@ -0,0 +1,305 @@
+import express from "express";
+import request from "supertest";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { ServerAdapterModule } from "../adapters/index.js";
+
+const mockAgentService = vi.hoisted(() => ({
+  getById: vi.fn(),
+  getChainOfCommand: vi.fn(async () => []),
+}));
+
+const mockAccessService = vi.hoisted(() => ({
+  canUser: vi.fn(),
+  hasPermission: vi.fn(),
+  getMembership: vi.fn(async () => null),
+  listPrincipalGrants: vi.fn(async () => []),
+}));
+
+const mockSecretService = vi.hoisted(() => ({
+  normalizeAdapterConfigForPersistence: vi.fn(async (_companyId: string, config: Record<string, unknown>) => config),
+  resolveAdapterConfigForRuntime: vi.fn(async (_companyId: string, config: Record<string, unknown>) => ({ config })),
+}));
+
+const mockEnvironmentService = vi.hoisted(() => ({
+  getById: vi.fn(),
+  releaseLease: vi.fn(),
+}));
+
+const mockReleaseRunLease = vi.hoisted(() => vi.fn(async () => undefined));
+const mockEnvironmentRuntime = vi.hoisted(() => ({
+  acquireRunLease: vi.fn(),
+  realizeWorkspace: vi.fn(),
+  getDriver: vi.fn(() => ({
+    releaseRunLease: mockReleaseRunLease,
+  })),
+}));
+
+const mockResolveEnvironmentExecutionTarget = vi.hoisted(() => vi.fn());
+const mockInstanceSettingsService = vi.hoisted(() => ({
+  getGeneral: vi.fn(async () => ({ censorUsernameInLogs: false })),
+}));
+
+vi.mock("../services/index.js", () => ({
+  agentService: () => mockAgentService,
+  agentInstructionsService: () => ({}),
+  accessService: () => mockAccessService,
+  approvalService: () => ({}),
+  companySkillService: () => ({
+    listRuntimeSkillEntries: vi.fn(async () => []),
+    resolveRequestedSkillKeys: vi.fn(async () => []),
+  }),
+  budgetService: () => ({}),
+  heartbeatService: () => ({
+    wakeup: vi.fn(),
+    cancelActiveForAgent: vi.fn(),
+  }),
+  ISSUE_LIST_DEFAULT_LIMIT: 50,
+  issueApprovalService: () => ({}),
+  issueService: () => ({}),
+  logActivity: vi.fn(),
+  syncInstructionsBundleConfigFromFilePath: vi.fn((_agent, config) => config),
+  workspaceOperationService: () => ({}),
+}));
+
+vi.mock("../services/environments.js", () => ({
+  environmentService: () => mockEnvironmentService,
+}));
+
+vi.mock("../services/secrets.js", () => ({
+  secretService: () => mockSecretService,
+}));
+
+vi.mock("../services/environment-runtime.js", () => ({
+  environmentRuntimeService: () => mockEnvironmentRuntime,
+}));
+
+vi.mock("../services/environment-execution-target.js", () => ({
+  resolveEnvironmentExecutionTarget: mockResolveEnvironmentExecutionTarget,
+}));
+
+vi.mock("../services/instance-settings.js", () => ({
+  instanceSettingsService: () => mockInstanceSettingsService,
+}));
+
+const testEnvironmentSpy = vi.fn();
+
+const externalAdapter: ServerAdapterModule = {
+  type: "external_test",
+  execute: async () => ({ exitCode: 0, signal: null, timedOut: false }),
+  testEnvironment: testEnvironmentSpy,
+};
+
+async function createApp() {
+  const [{ agentRoutes }, { errorHandler }] = await Promise.all([
+    vi.importActual<typeof import("../routes/agents.js")>("../routes/agents.js"),
+    vi.importActual<typeof import("../middleware/index.js")>("../middleware/index.js"),
+  ]);
+  const app = express();
+  app.use(express.json());
+  app.use((req, _res, next) => {
+    (req as any).actor = {
+      type: "board",
+      userId: "local-board",
+      companyIds: ["company-1"],
+      source: "local_implicit",
+      isInstanceAdmin: false,
+    };
+    next();
+  });
+  app.use("/api", agentRoutes({} as any));
+  app.use(errorHandler);
+  return app;
+}
+
+async function unregisterTestAdapter(type: string) {
+  const { unregisterServerAdapter } = await import("../adapters/index.js");
+  unregisterServerAdapter(type);
+}
+
+describe("agent test-environment route", () => {
+  beforeEach(async () => {
+    vi.resetModules();
+    vi.clearAllMocks();
+    mockEnvironmentService.getById.mockResolvedValue({
+      id: "11111111-1111-4111-8111-111111111111",
+      companyId: "company-1",
+      name: "Sandbox QA",
+      driver: "sandbox",
+      config: { provider: "fake-plugin" },
+    });
+    mockEnvironmentRuntime.acquireRunLease.mockResolvedValue({
+      lease: {
+        id: "lease-1",
+        metadata: { remoteCwd: "/home/user/paperclip-workspace" },
+      },
+      leaseContext: {
+        executionWorkspaceId: null,
+        executionWorkspaceMode: null,
+      },
+    });
+    mockEnvironmentRuntime.realizeWorkspace.mockResolvedValue({
+      cwd: "/home/user/paperclip-workspace",
+    });
+    mockResolveEnvironmentExecutionTarget.mockResolvedValue(null);
+    testEnvironmentSpy.mockResolvedValue({
+      adapterType: "external_test",
+      status: "pass",
+      checks: [
+        {
+          code: "host_probe_ran",
+          level: "info",
+          message: "host probe should not run",
+        },
+      ],
+      testedAt: new Date(0).toISOString(),
+    });
+    await unregisterTestAdapter("external_test");
+    const { registerServerAdapter } = await import("../adapters/index.js");
+    registerServerAdapter(externalAdapter);
+  });
+
+  afterEach(async () => {
+    await unregisterTestAdapter("external_test");
+  });
+
+  it("does not fall back to a host probe when a requested environment cannot produce an execution target", async () => {
+    const app = await createApp();
+
+    const res = await request(app)
+      .post("/api/companies/company-1/adapters/external_test/test-environment")
+      .send({
+        adapterConfig: {},
+        environmentId: "11111111-1111-4111-8111-111111111111",
+      });
+
+    expect(res.status, JSON.stringify(res.body)).toBe(200);
+    expect(testEnvironmentSpy).not.toHaveBeenCalled();
+    expect(res.body).toMatchObject({
+      adapterType: "external_test",
+      status: "warn",
+      checks: [
+        {
+          code: "environment_target_unsupported",
+          level: "warn",
+          message: 'Adapter "external_test" is not allowed in "Sandbox QA" environments.',
+        },
+      ],
+    });
+    expect(mockReleaseRunLease).toHaveBeenCalledWith({
+      environment: expect.objectContaining({
+        id: "11111111-1111-4111-8111-111111111111",
+        name: "Sandbox QA",
+        driver: "sandbox",
+      }),
+      lease: expect.objectContaining({
+        id: "lease-1",
+      }),
+      status: "failed",
+    });
+  });
+
+  it("returns a diagnostic result instead of probing the host when the requested environment is missing", async () => {
+    mockEnvironmentService.getById.mockResolvedValueOnce(null);
+    const app = await createApp();
+
+    const res = await request(app)
+      .post("/api/companies/company-1/adapters/external_test/test-environment")
+      .send({
+        adapterConfig: {},
+        environmentId: "22222222-2222-4222-8222-222222222222",
+      });
+
+    expect(res.status, JSON.stringify(res.body)).toBe(200);
+    expect(testEnvironmentSpy).not.toHaveBeenCalled();
+    expect(mockEnvironmentRuntime.acquireRunLease).not.toHaveBeenCalled();
+    expect(res.body).toMatchObject({
+      adapterType: "external_test",
+      status: "warn",
+      checks: [
+        {
+          code: "environment_not_found",
+          level: "warn",
+          message: "Selected environment was not found. The test did not run.",
+        },
+      ],
+    });
+  });
+
+  it("runs the adapter probe against the resolved sandbox target on the happy path and releases the lease on success", async () => {
+    mockResolveEnvironmentExecutionTarget.mockResolvedValueOnce({
+      kind: "remote",
+      transport: "sandbox",
+      remoteCwd: "/home/user/paperclip-workspace",
+      providerKey: "fake-plugin",
+      runner: { execute: vi.fn() },
+    });
+    testEnvironmentSpy.mockResolvedValueOnce({
+      adapterType: "external_test",
+      status: "pass",
+      checks: [
+        {
+          code: "external_test_hello_probe_passed",
+          level: "info",
+          message: "OK",
+        },
+      ],
+      testedAt: new Date(0).toISOString(),
+    });
+    const app = await createApp();
+
+    const res = await request(app)
+      .post("/api/companies/company-1/adapters/external_test/test-environment")
+      .send({
+        adapterConfig: {},
+        environmentId: "11111111-1111-4111-8111-111111111111",
+      });
+
+    expect(res.status, JSON.stringify(res.body)).toBe(200);
+    expect(testEnvironmentSpy).toHaveBeenCalledTimes(1);
+    expect(testEnvironmentSpy.mock.calls[0]?.[0]).toMatchObject({
+      executionTarget: expect.objectContaining({
+        kind: "remote",
+        transport: "sandbox",
+      }),
+      environmentName: "Sandbox QA",
+    });
+    expect(res.body).toMatchObject({ adapterType: "external_test", status: "pass" });
+    expect(mockReleaseRunLease).toHaveBeenCalledWith({
+      environment: expect.objectContaining({ id: "11111111-1111-4111-8111-111111111111" }),
+      lease: expect.objectContaining({ id: "lease-1" }),
+      status: "released",
+    });
+  });
+
+  it("releases the lease as failed and returns a diagnostic when realizeWorkspace throws", async () => {
+    mockEnvironmentRuntime.realizeWorkspace.mockRejectedValueOnce(
+      new Error("workspace realization failed"),
+    );
+    const app = await createApp();
+
+    const res = await request(app)
+      .post("/api/companies/company-1/adapters/external_test/test-environment")
+      .send({
+        adapterConfig: {},
+        environmentId: "11111111-1111-4111-8111-111111111111",
+      });
+
+    expect(res.status, JSON.stringify(res.body)).toBe(200);
+    expect(testEnvironmentSpy).not.toHaveBeenCalled();
+    expect(res.body).toMatchObject({
+      adapterType: "external_test",
+      status: "fail",
+      checks: [
+        expect.objectContaining({
+          code: "environment_workspace_realize_failed",
+          level: "error",
+        }),
+      ],
+    });
+    expect(mockReleaseRunLease).toHaveBeenCalledWith({
+      environment: expect.objectContaining({ id: "11111111-1111-4111-8111-111111111111" }),
+      lease: expect.objectContaining({ id: "lease-1" }),
+      status: "failed",
+    });
+  });
+});
@@ -56,8 +56,12 @@ import {
 import type { PluginWorkerManager } from "../services/plugin-worker-manager.js";
 import { environmentService } from "../services/environments.js";
 import { resolveEnvironmentExecutionTarget } from "../services/environment-execution-target.js";
+import { environmentRuntimeService } from "../services/environment-runtime.js";
 import type { AdapterExecutionTarget } from "@paperclipai/adapter-utils/execution-target";
-import type { AdapterEnvironmentCheck } from "@paperclipai/adapter-utils";
+import type {
+  AdapterEnvironmentCheck,
+  AdapterEnvironmentTestResult,
+} from "@paperclipai/adapter-utils";
 import { secretService } from "../services/secrets.js";
 import {
  detectAdapterModel,
@@ -160,6 +164,9 @@ export function agentRoutes(
  const approvalsSvc = approvalService(db);
  const budgets = budgetService(db);
  const environmentsSvc = environmentService(db);
+  const environmentRuntime = environmentRuntimeService(db, {
+    pluginWorkerManager: options.pluginWorkerManager,
+  });
  const heartbeat = heartbeatService(db, {
    pluginWorkerManager: options.pluginWorkerManager,
  });
@@ -191,9 +198,13 @@ export function agentRoutes(
   * - SSH environment → builds an SSH execution target from the environment
   *   config so the adapter probes the remote box. No lease is required:
   *   the SSH spec is fully derived from the saved environment config.
-   * - Sandbox / plugin environments → currently fall back to local probing
-   *   with a warning check, since lifting a temporary sandbox lease for an
-   *   ad-hoc test invocation is out of scope for this iteration.
+   * - Sandbox / plugin environments → acquires an ad-hoc lease, realizes the
+   *   workspace, and resolves a sandbox execution target wired to the runtime
+   *   so the adapter probe runs inside the sandbox the same way a heartbeat
+   *   would. The returned `release` callback releases that lease (with status
+   *   "released" or "failed") once the route is done.
+   *
+   * The caller MUST always invoke `release()` (typically in a `finally` block).
   */
  async function resolveAdapterTestExecutionContext(input: {
    companyId: string;
@@ -203,9 +214,17 @@ export function agentRoutes(
    executionTarget: AdapterExecutionTarget | null;
    environmentName: string | null;
    fallbackChecks: AdapterEnvironmentCheck[];
+    release: (status?: "released" | "failed") => Promise<void>;
  }> {
+    const noopRelease = async () => {};
+
    if (!input.environmentId) {
-      return { executionTarget: null, environmentName: null, fallbackChecks: [] };
+      return {
+        executionTarget: null,
+        environmentName: null,
+        fallbackChecks: [],
+        release: noopRelease,
+      };
    }

    const environment = await environmentsSvc.getById(input.environmentId);
@@ -217,14 +236,20 @@ export function agentRoutes(
          {
            code: "environment_not_found",
            level: "warn",
-            message: "Selected environment was not found. Falling back to a local probe.",
+            message: "Selected environment was not found. The test did not run.",
          },
        ],
+        release: noopRelease,
      };
    }

    if (environment.driver === "local") {
-      return { executionTarget: null, environmentName: environment.name, fallbackChecks: [] };
+      return {
+        executionTarget: null,
+        environmentName: environment.name,
+        fallbackChecks: [],
+        release: noopRelease,
+      };
    }

    if (environment.driver === "ssh") {
@@ -241,7 +266,12 @@ export function agentRoutes(
          leaseMetadata: null,
        });
        if (target) {
-          return { executionTarget: target, environmentName: environment.name, fallbackChecks: [] };
+          return {
+            executionTarget: target,
+            environmentName: environment.name,
+            fallbackChecks: [],
+            release: noopRelease,
+          };
        }
        return {
          executionTarget: null,
@@ -251,9 +281,10 @@ export function agentRoutes(
              code: "environment_target_unavailable",
              level: "warn",
              message:
-                `Could not resolve an execution target for environment "${environment.name}". Falling back to a local probe.`,
+                `Could not resolve an execution target for environment "${environment.name}". The test did not run.`,
            },
          ],
+          release: noopRelease,
        };
      } catch (err) {
        return {
@@ -264,27 +295,163 @@ export function agentRoutes(
              code: "environment_target_failed",
              level: "warn",
              message:
-                `Could not connect to environment "${environment.name}" to run the test. Falling back to a local probe.`,
+                `Could not connect to environment "${environment.name}" to run the test.`,
              detail: err instanceof Error ? err.message : String(err),
            },
          ],
+          release: noopRelease,
        };
      }
    }

-    // sandbox / plugin / other drivers: not yet supported for ad-hoc adapter tests.
-    return {
-      executionTarget: null,
-      environmentName: environment.name,
-      fallbackChecks: [
-        {
-          code: "environment_driver_not_supported_for_test",
-          level: "warn",
-          message:
-            `Adapter testing inside ${environment.driver} environments is not yet supported. Falling back to a local probe; results may not reflect runs in "${environment.name}".`,
-          hint: "Run a real heartbeat in the environment to verify end-to-end behavior.",
+    // sandbox / plugin / other remote drivers: spin up an ad-hoc lease, realize
+    // the workspace inside the box, and run the same probe SSH uses against
+    // a sandbox execution target wired to the environment runtime.
+    //
+    // We pass `heartbeatRunId: null` because there's no heartbeat run for an
+    // operator-initiated `Test` invocation — the leases table FKs heartbeat
+    // run id to heartbeat_runs.id, and we don't want to manufacture a fake
+    // run row. Cleanup goes through the driver's `releaseRunLease` directly
+    // (by lease record), since the batch helper queries by heartbeatRunId.
+    let leaseRecord: Awaited<ReturnType<typeof environmentRuntime.acquireRunLease>>;
+    try {
+      leaseRecord = await environmentRuntime.acquireRunLease({
+        companyId: input.companyId,
+        environment,
+        issueId: null,
+        heartbeatRunId: null,
+        persistedExecutionWorkspace: null,
+      });
+    } catch (err) {
+      return {
+        executionTarget: null,
+        environmentName: environment.name,
+        fallbackChecks: [
+          {
+            code: "environment_lease_acquire_failed",
+            level: "error",
+            message: `Could not acquire a lease for environment "${environment.name}".`,
+            detail: err instanceof Error ? err.message : String(err),
+            hint: "Check the environment's provider credentials and quota.",
+          },
+        ],
+        release: noopRelease,
+      };
+    }
+
+    const driver = environmentRuntime.getDriver(environment.driver);
+    const releaseLease = async (status: "released" | "failed" = "released") => {
+      try {
+        if (driver) {
+          await driver.releaseRunLease({
+            environment,
+            lease: leaseRecord.lease,
+            status,
+          });
+        } else {
+          await environmentsSvc.releaseLease(leaseRecord.lease.id, status);
+        }
+      } catch (err) {
+        // Cleanup failures must not mask the test result.
+        // eslint-disable-next-line no-console
+        console.warn(
+          `[adapter-test] Failed to release lease ${leaseRecord.lease.id}: ${err instanceof Error ? err.message : String(err)}`,
+        );
+      }
+    };
+
+    let realizedCwd: string | null = null;
+    try {
+      const realized = await environmentRuntime.realizeWorkspace({
+        environment,
+        lease: leaseRecord.lease,
+        // No host workspace to copy for a Test invocation; sandbox/plugin
+        // realize implementations use the lease metadata's remoteCwd to
+        // create the working directory inside the box.
+        workspace: {},
+      });
+      realizedCwd =
+        typeof realized.cwd === "string" && realized.cwd.trim().length > 0
+          ? realized.cwd.trim()
+          : null;
+    } catch (err) {
+      await releaseLease("failed");
+      return {
+        executionTarget: null,
+        environmentName: environment.name,
+        fallbackChecks: [
+          {
+            code: "environment_workspace_realize_failed",
+            level: "error",
+            message: `Could not realize a workspace inside "${environment.name}".`,
+            detail: err instanceof Error ? err.message : String(err),
+          },
+        ],
+        release: noopRelease,
+      };
+    }
+
+    let target: AdapterExecutionTarget | null;
+    try {
+      // Prefer the cwd the realize step returned; fall back to lease metadata.
+      const leaseMetadataForTarget: Record<string, unknown> | null =
+        realizedCwd
+          ? { ...(leaseRecord.lease.metadata ?? {}), remoteCwd: realizedCwd }
+          : (leaseRecord.lease.metadata as Record<string, unknown> | null) ?? null;
+
+      target = await resolveEnvironmentExecutionTarget({
+        db,
+        companyId: input.companyId,
+        adapterType: input.adapterType,
+        environment: {
+          id: environment.id,
+          driver: environment.driver,
+          config: environment.config ?? null,
        },
-      ],
+        leaseId: leaseRecord.lease.id,
+        leaseMetadata: leaseMetadataForTarget,
+        lease: leaseRecord.lease,
+        environmentRuntime,
+      });
+    } catch (err) {
+      await releaseLease("failed");
+      return {
+        executionTarget: null,
+        environmentName: environment.name,
+        fallbackChecks: [
+          {
+            code: "environment_target_failed",
+            level: "error",
+            message: `Could not resolve a sandbox execution target for "${environment.name}".`,
+            detail: err instanceof Error ? err.message : String(err),
+          },
+        ],
+        release: noopRelease,
+      };
+    }
+
+    if (!target) {
+      await releaseLease("failed");
+      return {
+        executionTarget: null,
+        environmentName: environment.name,
+        fallbackChecks: [
+          {
+            code: "environment_target_unsupported",
+            level: "warn",
+            message:
+              `Adapter "${input.adapterType}" is not allowed in "${environment.name}" environments.`,
+          },
+        ],
+        release: noopRelease,
+      };
+    }
+
+    return {
+      executionTarget: target,
+      environmentName: environment.name,
+      fallbackChecks: [],
+      release: releaseLease,
    };
  }

@@ -1250,33 +1417,51 @@ export function agentRoutes(
        normalizedAdapterConfig,
      );

-      const { executionTarget, environmentName, fallbackChecks } =
+      const { executionTarget, environmentName, fallbackChecks, release } =
        await resolveAdapterTestExecutionContext({
          companyId,
          adapterType: type,
          environmentId: requestedEnvironmentId,
        });

-      const result = await adapter.testEnvironment({
-        companyId,
-        adapterType: type,
-        config: runtimeAdapterConfig,
-        executionTarget,
-        environmentName,
-      });
+      let releaseStatus: "released" | "failed" = "released";
+      try {
+        // If the caller explicitly selected an environment, never fall back to
+        // probing the host when we couldn't resolve that environment's
+        // execution target. Surface the diagnostic checks instead.
+        if (requestedEnvironmentId && !executionTarget && fallbackChecks.length > 0) {
+          const status: AdapterEnvironmentTestResult["status"] = fallbackChecks.some((c) => c.level === "error")
+            ? "fail"
+            : fallbackChecks.some((c) => c.level === "warn")
+              ? "warn"
+              : "pass";
+          if (status === "fail") releaseStatus = "failed";
+          const synthesized: AdapterEnvironmentTestResult = {
+            adapterType: type,
+            status,
+            checks: fallbackChecks,
+            testedAt: new Date().toISOString(),
+          };
+          res.json(synthesized);
+          return;
+        }

-      if (fallbackChecks.length > 0) {
-        const checks = [...fallbackChecks, ...result.checks];
-        const status: typeof result.status = checks.some((c) => c.level === "error")
-          ? "fail"
-          : checks.some((c) => c.level === "warn")
-            ? "warn"
-            : result.status;
-        res.json({ ...result, checks, status });
-        return;
+        const result = await adapter.testEnvironment({
+          companyId,
+          adapterType: type,
+          config: runtimeAdapterConfig,
+          executionTarget,
+          environmentName,
+        });
+
+        if (result.status === "fail") releaseStatus = "failed";
+        res.json(result);
+      } catch (err) {
+        releaseStatus = "failed";
+        throw err;
+      } finally {
+        await release(releaseStatus);
      }
-
-      res.json(result);
    },
  );

@@ -1,3 +1,4 @@
+import { randomUUID } from "node:crypto";
 import { and, eq, inArray } from "drizzle-orm";
 import type { Db } from "@paperclipai/db";
 import { environmentLeases } from "@paperclipai/db";
@@ -102,7 +103,13 @@ export interface EnvironmentDriverAcquireInput {
  companyId: string;
  environment: Environment;
  issueId: string | null;
-  heartbeatRunId: string;
+  /**
+   * UUID of the owning heartbeat run, or null for ad-hoc invocations
+   * (e.g. operator-initiated `Test` probes) that are not tied to a run.
+   * Such leases must be released per lease record via `getDriver(...).releaseRunLease`,
+   * because `releaseRunLeases(heartbeatRunId)` cannot find them.
+   */
+  heartbeatRunId: string | null;
  executionWorkspaceId: string | null;
  executionWorkspaceMode: ExecutionWorkspace["mode"] | null;
 }
@@ -407,14 +414,21 @@ function createSandboxEnvironmentDriver(

        const workerConfig = stripSandboxProviderEnvelope(parsed.config);
        const storedConfig = storedParsed.config;
-        const existingLeases = parsed.config.reuseLease
-          ? await environmentsSvc.listLeases(input.environment.id)
+        // Ad-hoc tests (heartbeatRunId === null) must never resume an existing
+        // provider lease. If they did, releasing the test lease at the end of
+        // the probe would tear down the live heartbeat run that owns it.
+        // We also filter out leases whose policy is not reuse_by_environment
+        // so any non-reusable lease (including ad-hoc test leases that
+        // landed in the table from older code paths) cannot be matched.
+        const reusableExistingLeases = parsed.config.reuseLease && input.heartbeatRunId !== null
+          ? (await environmentsSvc.listLeases(input.environment.id))
+              .filter((lease) => lease.leasePolicy === "reuse_by_environment")
          : [];
-        const reusableProviderLeaseId = parsed.config.reuseLease
-          ? findReusableSandboxLeaseId({ config: storedConfig, leases: existingLeases })
+        const reusableProviderLeaseId = parsed.config.reuseLease && input.heartbeatRunId !== null
+          ? findReusableSandboxLeaseId({ config: storedConfig, leases: reusableExistingLeases })
          : null;
        const reusableLease = reusableProviderLeaseId
-          ? existingLeases.find((lease) => lease.providerLeaseId === reusableProviderLeaseId)
+          ? reusableExistingLeases.find((lease) => lease.providerLeaseId === reusableProviderLeaseId)
          : null;

        const providerLease = reusableLease?.providerLeaseId
@@ -443,12 +457,18 @@ function createSandboxEnvironmentDriver(
            companyId: input.companyId,
            environmentId: input.environment.id,
            config: workerConfig,
-            runId: input.heartbeatRunId,
+            // Plugin SDK requires a string; ad-hoc test leases use a fresh
+            // UUID so providers that validate or persist the runId still see
+            // a well-formed identifier.
+            runId: input.heartbeatRunId ?? randomUUID(),
            workspaceMode: input.executionWorkspaceMode ?? undefined,
          },
        );

-        const resolvedLeasePolicy = parsed.config.reuseLease
+        // Ad-hoc test leases are never publishable for reuse: storing them
+        // as `reuse_by_environment` would let a concurrent heartbeat resume
+        // the test's provider lease and lose its sandbox when the test ends.
+        const resolvedLeasePolicy = parsed.config.reuseLease && input.heartbeatRunId !== null
          ? "reuse_by_environment"
          : "ephemeral";

@@ -477,22 +497,33 @@ function createSandboxEnvironmentDriver(
        });
      }

-      // Built-in sandbox provider path.
-      const reusableProviderLeaseId = parsed.config.reuseLease
+      // Built-in sandbox provider path. Same guard as the plugin-backed path:
+      // ad-hoc tests (heartbeatRunId === null) must never resume an existing
+      // provider lease, or releasing the test lease will terminate the live
+      // heartbeat run that shares it. Filter to leases whose policy is
+      // reuse_by_environment so non-reusable rows can never be matched.
+      const reusableProviderLeaseId = parsed.config.reuseLease && input.heartbeatRunId !== null
        ? (await environmentsSvc
            .listLeases(input.environment.id)
-            .then((leases) => findReusableSandboxLeaseId({ config: parsed.config, leases })))
+            .then((leases) =>
+              findReusableSandboxLeaseId({
+                config: parsed.config,
+                leases: leases.filter((lease) => lease.leasePolicy === "reuse_by_environment"),
+              }),
+            ))
        : null;

      const providerLease = await acquireSandboxProviderLease({
        config: parsed.config,
        environmentId: input.environment.id,
-        heartbeatRunId: input.heartbeatRunId,
+        heartbeatRunId: input.heartbeatRunId ?? randomUUID(),
        issueId: input.issueId,
        reusableProviderLeaseId,
      });

-      const resolvedLeasePolicy = parsed.config.reuseLease
+      // Same ephemeral-policy-for-tests guard as the plugin-backed path:
+      // ad-hoc test leases must not be publishable for reuse.
+      const resolvedLeasePolicy = parsed.config.reuseLease && input.heartbeatRunId !== null
        ? "reuse_by_environment"
        : "ephemeral";

@@ -831,7 +862,7 @@ function createPluginEnvironmentDriver(
        companyId: input.companyId,
        environmentId: input.environment.id,
        config: parsed.config.driverConfig,
-        runId: input.heartbeatRunId,
+        runId: input.heartbeatRunId ?? randomUUID(),
        workspaceMode: input.executionWorkspaceMode ?? undefined,
      });

@@ -1040,7 +1071,8 @@ export function environmentRuntimeService(
      companyId: string;
      environment: Environment;
      issueId: string | null;
-      heartbeatRunId: string;
+      /** Null for ad-hoc invocations (e.g. operator-initiated `Test` probes). */
+      heartbeatRunId: string | null;
      persistedExecutionWorkspace: Pick<ExecutionWorkspace, "id" | "mode"> | null;
    }): Promise<EnvironmentRuntimeLeaseRecord> {
      if (input.environment.status !== "active") {