mirror of
https://github.com/alkimake/paperclip.git
synced 2026-06-16 10:50:38 +09:00
Serialize sandbox callback bridge against concurrent heartbeats (#5326)
> **Stacked PR.** This PR's branch carries cumulative content from #5324 (bridge allowlist expand) and #5325 (env sanitization) — the mutex/sha256 logic in this PR sits on top of both. Reviewers should focus on the files this PR's commit touches: `packages/adapter-utils/src/sandbox-callback-bridge.{ts,test.ts}`, `packages/adapter-utils/src/ssh.ts`, and `packages/adapter-utils/src/ssh-fixture.test.ts`. Will rebase onto `master` and force-push once both prerequisite PRs are merged.

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies
> - Each agent that runs in a sandbox or via SSH talks back to the Paperclip server through a per-lease callback bridge whose entrypoint script is uploaded to the remote
> - When two heartbeats target the same agent on the same machine concurrently, both upload the bridge entrypoint and both write to the same response files — producing torn-write races: `SyntaxError: Identifier 'randomUUID' has already been declared` from a concatenated upload, `mv: cannot stat …` from colliding `.json.tmp` writes, and 0-byte commits from a truncated stdin
> - This pull request serializes those operations with a POSIX `mkdir`-mutex (PID liveness check + atomic rename) at the bridge entrypoint upload, applies the same lock to the bridge response writer, forwards stdin into remote ssh commands so the entrypoint payload arrives intact, and verifies a sha256 of the upload before promoting it
> - The benefit is concurrent heartbeats no longer corrupt each other's bridge state

## What Changed

- `packages/adapter-utils/src/sandbox-callback-bridge.ts`: serialize entrypoint upload and response writes via POSIX `mkdir`-mutex with PID liveness; sha256 the upload before promoting via `mv`; content-skip when the existing entrypoint already matches
- `packages/adapter-utils/src/ssh.ts`: forward stdin into remote ssh commands through the SSH managed runtime so `cat > "$remote_upload"` actually receives the base64-encoded entrypoint
- `packages/adapter-utils/src/ssh-fixture.test.ts`: cover the stdin-forwarded SSH path
- `packages/adapter-utils/src/sandbox-callback-bridge.test.ts`: cover the mutex, content-skip, sha256-verify, and atomic-rename paths

## Verification

- `pnpm vitest run --no-coverage --project @paperclipai/adapter-utils`
- `pnpm typecheck` clean
- Manual: two parallel heartbeats targeting the same SSH agent no longer race on the bridge entrypoint or response files

## Risks

Medium. Serializing previously-parallel operations adds latency on the contended path (one heartbeat waits on another), bounded by the entrypoint upload time. The mutex includes PID liveness so a crashed heartbeat doesn't deadlock subsequent ones. Sha256-verify gives a clear "torn upload" failure mode instead of silent 0-byte commits.

## Model Used

Claude Opus 4.7 (1M context)

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable — tests cover mutex + sha256-verify + stdin-forwarded ssh
- [x] If this change affects the UI, I have included before/after screenshots — N/A (no UI)
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge
This commit is contained in:
parent
f6bad8f6bf
commit
50db8c01d2
4 changed files with 633 additions and 16 deletions
|
|
@ -8,10 +8,12 @@ import { afterEach, describe, expect, it } from "vitest";
|
|||
import { prepareCommandManagedRuntime } from "./command-managed-runtime.js";
|
||||
import {
|
||||
authorizeSandboxCallbackBridgeRequestWithRoutes,
|
||||
createCommandManagedSandboxCallbackBridgeQueueClient,
|
||||
createFileSystemSandboxCallbackBridgeQueueClient,
|
||||
createSandboxCallbackBridgeAsset,
|
||||
createSandboxCallbackBridgeToken,
|
||||
sandboxCallbackBridgeDirectories,
|
||||
syncSandboxCallbackBridgeEntrypoint,
|
||||
startSandboxCallbackBridgeServer,
|
||||
startSandboxCallbackBridgeWorker,
|
||||
} from "./sandbox-callback-bridge.js";
|
||||
|
|
@ -420,6 +422,98 @@ describe("sandbox callback bridge", () => {
|
|||
);
|
||||
});
|
||||
|
||||
it("serializes remote response writes so stop does not recreate a late orphaned response", async () => {
  // Resolve after `ms` milliseconds; used for the handler delay, polling, and the final settle wait.
  const pause = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

  // Scratch layout: one temp root holding a "local" and a "remote" workspace.
  const sandboxRoot = await mkdtemp(path.join(os.tmpdir(), "paperclip-bridge-response-lock-"));
  cleanupDirs.push(sandboxRoot);

  const localDir = path.join(sandboxRoot, "local-workspace");
  const remoteDir = path.join(sandboxRoot, "remote-workspace");
  await mkdir(localDir, { recursive: true });
  await mkdir(remoteDir, { recursive: true });
  await writeFile(path.join(localDir, "README.md"), "bridge response lock test\n", "utf8");

  // Prepare the managed runtime with the bridge asset attached.
  const execRunner = createExecRunner();
  const asset = await createSandboxCallbackBridgeAsset();
  cleanupFns.push(asset.cleanup);
  const runtime = await prepareCommandManagedRuntime({
    runner: execRunner,
    spec: {
      remoteCwd: remoteDir,
      timeoutMs: 30_000,
    },
    adapterKey: "codex",
    workspaceLocalDir: localDir,
    assets: [{ key: "bridge", localDir: asset.localDir }],
  });

  const bridgeQueueDir = path.posix.join(runtime.runtimeRootDir, "paperclip-bridge");
  const queueLayout = sandboxCallbackBridgeDirectories(bridgeQueueDir);
  const token = createSandboxCallbackBridgeToken();
  const handledIds: string[] = [];

  // The handler records the request id and then waits 100ms, which is longer
  // than the 10ms drain timeout passed to stop() further down.
  const queueClient = createCommandManagedSandboxCallbackBridgeQueueClient({
    runner: execRunner,
    remoteCwd: remoteDir,
    timeoutMs: 30_000,
  });
  const bridgeWorker = await startSandboxCallbackBridgeWorker({
    client: queueClient,
    queueDir: bridgeQueueDir,
    authorizeRequest: async () => null,
    handleRequest: async (request) => {
      handledIds.push(request.id);
      await pause(100);
      const payload = JSON.stringify({ ok: true, id: request.id });
      return {
        status: 200,
        headers: { "content-type": "application/json" },
        body: payload,
      };
    },
  });
  cleanupFns.push(async () => {
    await bridgeWorker.stop();
  });

  const bridgeServer = await startSandboxCallbackBridgeServer({
    runner: execRunner,
    remoteCwd: remoteDir,
    assetRemoteDir: runtime.assetDirs.bridge,
    queueDir: bridgeQueueDir,
    bridgeToken: token,
    timeoutMs: 30_000,
  });
  cleanupFns.push(async () => {
    await bridgeServer.stop();
  });

  // Fire the request without awaiting it so the worker can be stopped mid-flight.
  const pendingResponse = fetch(`${bridgeServer.baseUrl}/api/agents/me`, {
    headers: {
      authorization: `Bearer ${token}`,
    },
  });

  // Poll (at most 50 times, 5ms apart) until the handler has seen the request.
  let polls = 0;
  while (polls < 50 && handledIds.length === 0) {
    await pause(5);
    polls += 1;
  }

  expect(handledIds).toHaveLength(1);
  await bridgeWorker.stop({ drainTimeoutMs: 10 });

  const reply = await pendingResponse;
  expect(reply.status).toBe(503);
  await expect(reply.json()).resolves.toEqual({
    error: "Bridge worker stopped before request could be handled.",
  });

  // Wait past the handler's 100ms delay before inspecting the responses directory.
  await pause(150);

  await expect(readdir(queueLayout.responsesDir)).resolves.toEqual([]);

  // No temp files or write-lock entries may be left behind either.
  const strayArtifacts = (await readdir(queueLayout.responsesDir)).filter(
    (name) => name.endsWith(".tmp") || name.includes(".paperclip-write.lock"),
  );
  expect(strayArtifacts).toEqual([]);
});
|
||||
|
||||
it("rejects non-JSON request bodies and full queues at the bridge server", async () => {
|
||||
const rootDir = await mkdtemp(path.join(os.tmpdir(), "paperclip-bridge-server-guards-"));
|
||||
cleanupDirs.push(rootDir);
|
||||
|
|
@ -615,6 +709,112 @@ describe("sandbox callback bridge", () => {
|
|||
});
|
||||
});
|
||||
|
||||
it("reuses an already-uploaded bridge entrypoint when the remote file hash matches", async () => {
  const sandboxRoot = await mkdtemp(path.join(os.tmpdir(), "paperclip-bridge-sync-"));
  cleanupDirs.push(sandboxRoot);

  const remoteDir = path.join(sandboxRoot, "remote-workspace");
  const assetDir = path.posix.join(remoteDir, ".paperclip-runtime", "codex", "paperclip-bridge", "server");
  await mkdir(remoteDir, { recursive: true });

  // Append a padding comment to the generated entrypoint so the synced
  // payload differs from the stock asset.
  const asset = await createSandboxCallbackBridgeAsset();
  cleanupFns.push(asset.cleanup);
  const stockSource = await readFile(asset.entrypoint, "utf8");
  const paddedSource = `${stockSource}\n// bridge payload padding\n`;
  await writeFile(asset.entrypoint, paddedSource, "utf8");

  const execRunner = createExecRunner();

  // Each invocation builds a fresh, identical argument object.
  const syncOnce = () =>
    syncSandboxCallbackBridgeEntrypoint({
      runner: execRunner,
      remoteCwd: remoteDir,
      assetRemoteDir: assetDir,
      bridgeAsset: asset,
      timeoutMs: 30_000,
    });

  const firstSync = await syncOnce();
  const secondSync = await syncOnce();

  // The first call uploads; the second reports that no upload happened.
  expect(firstSync.uploaded).toBe(true);
  expect(secondSync.uploaded).toBe(false);

  const syncedEntrypoint = path.posix.join(assetDir, "paperclip-bridge-server.mjs");
  await expect(readFile(syncedEntrypoint, "utf8")).resolves.toBe(paddedSource);

  // Upload staging files, partial files, and the upload lock must all be gone.
  const isUploadLeftover = (name: string): boolean => {
    if (name.endsWith(".paperclip-upload.b64")) return true;
    if (name.endsWith(".partial")) return true;
    return name === ".paperclip-bridge-upload.lock";
  };
  const leftovers = (await readdir(assetDir)).filter(isUploadLeftover);
  expect(leftovers).toEqual([]);
});
|
||||
|
||||
it("rejects a corrupted bridge entrypoint upload without committing a torn remote file", async () => {
  // Shape of the input object accepted by the stand-in runner below.
  type ExecInput = {
    command: string;
    args?: string[];
    cwd?: string;
    env?: Record<string, string>;
    stdin?: string;
    timeoutMs?: number;
  };

  const sandboxRoot = await mkdtemp(path.join(os.tmpdir(), "paperclip-bridge-sync-corrupt-"));
  cleanupDirs.push(sandboxRoot);

  const remoteDir = path.join(sandboxRoot, "remote-workspace");
  const assetDir = path.posix.join(remoteDir, ".paperclip-runtime", "codex", "paperclip-bridge", "server");
  await mkdir(remoteDir, { recursive: true });

  const asset = await createSandboxCallbackBridgeAsset();
  cleanupFns.push(asset.cleanup);

  // Fault-injecting runner: any call that supplies stdin has it replaced with
  // an empty string before being delegated to a fresh exec runner; calls
  // without stdin pass through unchanged.
  const corruptingRunner = {
    execute: async (input: ExecInput) => {
      const forwardedStdin = input.stdin == null ? input.stdin : "";
      return await createExecRunner().execute({ ...input, stdin: forwardedStdin });
    },
  };

  const syncAttempt = syncSandboxCallbackBridgeEntrypoint({
    runner: corruptingRunner,
    remoteCwd: remoteDir,
    assetRemoteDir: assetDir,
    bridgeAsset: asset,
    timeoutMs: 30_000,
  });
  await expect(syncAttempt).rejects.toThrow(/sha mismatch/i);

  // The entrypoint must not exist at its final path after the failed sync.
  const entrypointPath = path.posix.join(assetDir, "paperclip-bridge-server.mjs");
  await expect(readFile(entrypointPath, "utf8")).rejects.toThrow();

  // Upload staging files, partial files, and the upload lock must all be gone.
  const isUploadLeftover = (name: string): boolean => {
    if (name.endsWith(".paperclip-upload.b64")) return true;
    if (name.endsWith(".partial")) return true;
    return name === ".paperclip-bridge-upload.lock";
  };
  const leftovers = (await readdir(assetDir)).filter(isUploadLeftover);
  expect(leftovers).toEqual([]);
});
|
||||
|
||||
it("permits the documented heartbeat surface and denies unrelated routes", () => {
|
||||
const allowed: Array<{ method: string; path: string }> = [
|
||||
{ method: "GET", path: "/api/agents/me" },
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue