mirror of
https://github.com/alkimake/paperclip.git
synced 2026-06-15 18:30:39 +09:00
Fix runtime state race, workspace sync, plugin startup, and orphaned leases (#4804)
## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies
> - Agents run inside environments that are leased, and the server manages runtime state, workspace configuration, and plugin lifecycle
> - Several edge cases caused failures during concurrent operations: a race condition in runtime state insertion could produce duplicate-key errors, reused workspaces didn't sync their configuration when the parent issue was updated, sandbox provider plugins could be queried before registration completed, and orphaned environment leases from failed runs were never released
> - This PR fixes these four runtime/environment issues
> - The benefit is more reliable concurrent agent execution and proper resource cleanup

## What Changed

- `services/heartbeat.ts`: Fixed a race condition where concurrent runtime state inserts could fail with a duplicate-key error by using an upsert pattern
- `services/issues.ts`: Sync reused workspace configuration when an issue is updated, so the workspace reflects the latest issue state
- `services/environment-runtime.ts`: Fixed a startup race where sandbox provider plugins could be queried before registration completed, by awaiting plugin readiness before resolving environment drivers
- `services/heartbeat.ts`: Release environment leases for orphaned runs that lost their process without cleanup

## Verification

- `pnpm test` — all existing and new tests pass, including new tests for runtime state upsert and process recovery lease cleanup
- `pnpm typecheck` — clean
- Manual: trigger concurrent agent runs to verify no duplicate-key failures; verify orphaned leases are released after process loss

## Risks

- Low risk. The runtime state upsert changes insert-to-upsert behavior, which could mask a legitimate duplicate if two different runs produce the same key — but this is prevented by the run ID being part of the key. The plugin startup await is bounded by the existing registration timeout.
## Model Used

Codex GPT 5.4 high via Paperclip.

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge
This commit is contained in:
parent
f9cf1d2f6a
commit
4cf612a92d
7 changed files with 542 additions and 27 deletions
|
|
@ -700,6 +700,98 @@ describeEmbeddedPostgres("environmentRuntimeService", () => {
|
|||
}));
|
||||
});
|
||||
|
||||
it("waits briefly for a ready sandbox provider plugin worker to come online", async () => {
|
||||
const pluginId = randomUUID();
|
||||
const { companyId, environment: baseEnvironment, runId } = await seedEnvironment();
|
||||
const providerConfig = {
|
||||
provider: "fake-plugin",
|
||||
image: "fake:test",
|
||||
timeoutMs: 1234,
|
||||
reuseLease: false,
|
||||
};
|
||||
const environment = {
|
||||
...baseEnvironment,
|
||||
name: "Eventually Running Plugin Sandbox",
|
||||
driver: "sandbox",
|
||||
config: providerConfig,
|
||||
};
|
||||
await environmentService(db).update(environment.id, {
|
||||
driver: "sandbox",
|
||||
name: environment.name,
|
||||
config: providerConfig,
|
||||
});
|
||||
await db.insert(plugins).values({
|
||||
id: pluginId,
|
||||
pluginKey: "acme.eventually-running-sandbox-provider",
|
||||
packageName: "@acme/eventually-running-sandbox-provider",
|
||||
version: "1.0.0",
|
||||
apiVersion: 1,
|
||||
categories: ["automation"],
|
||||
manifestJson: {
|
||||
id: "acme.eventually-running-sandbox-provider",
|
||||
apiVersion: 1,
|
||||
version: "1.0.0",
|
||||
displayName: "Eventually Running Sandbox Provider",
|
||||
description: "Test plugin worker startup grace period",
|
||||
author: "Acme",
|
||||
categories: ["automation"],
|
||||
capabilities: ["environment.drivers.register"],
|
||||
entrypoints: { worker: "dist/worker.js" },
|
||||
environmentDrivers: [
|
||||
{
|
||||
driverKey: "fake-plugin",
|
||||
kind: "sandbox_provider",
|
||||
displayName: "Fake Plugin",
|
||||
configSchema: { type: "object" },
|
||||
},
|
||||
],
|
||||
},
|
||||
status: "ready",
|
||||
installOrder: 1,
|
||||
updatedAt: new Date(),
|
||||
} as any);
|
||||
|
||||
let runningChecks = 0;
|
||||
const workerManager = {
|
||||
isRunning: vi.fn((id: string) => {
|
||||
if (id !== pluginId) return false;
|
||||
runningChecks += 1;
|
||||
return runningChecks >= 3;
|
||||
}),
|
||||
call: vi.fn(async (_pluginId: string, method: string) => {
|
||||
if (method === "environmentAcquireLease") {
|
||||
return {
|
||||
providerLeaseId: "sandbox-1",
|
||||
metadata: {
|
||||
provider: "fake-plugin",
|
||||
image: "fake:test",
|
||||
timeoutMs: 1234,
|
||||
reuseLease: false,
|
||||
},
|
||||
};
|
||||
}
|
||||
throw new Error(`Unexpected plugin method: ${method}`);
|
||||
}),
|
||||
} as unknown as PluginWorkerManager;
|
||||
const runtimeWithPlugin = environmentRuntimeService(db, {
|
||||
pluginWorkerManager: workerManager,
|
||||
pluginWorkerReadyTimeoutMs: 25,
|
||||
pluginWorkerReadyPollMs: 1,
|
||||
});
|
||||
|
||||
const acquired = await runtimeWithPlugin.acquireRunLease({
|
||||
companyId,
|
||||
environment,
|
||||
issueId: null,
|
||||
heartbeatRunId: runId,
|
||||
persistedExecutionWorkspace: null,
|
||||
});
|
||||
|
||||
expect(acquired.lease.providerLeaseId).toBe("sandbox-1");
|
||||
expect(workerManager.isRunning).toHaveBeenCalledTimes(3);
|
||||
expect(workerManager.call).toHaveBeenCalledWith(pluginId, "environmentAcquireLease", expect.anything());
|
||||
});
|
||||
|
||||
it("falls back to acquire when plugin-backed sandbox lease resume throws", async () => {
|
||||
const pluginId = randomUUID();
|
||||
const { companyId, environment: baseEnvironment, runId } = await seedEnvironment();
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@ import {
|
|||
createDb,
|
||||
documentRevisions,
|
||||
documents,
|
||||
environmentLeases,
|
||||
environments,
|
||||
heartbeatRunEvents,
|
||||
heartbeatRuns,
|
||||
issueComments,
|
||||
|
|
@ -309,6 +311,8 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
|||
await db.delete(agentRuntimeState);
|
||||
await db.delete(companySkills);
|
||||
await db.delete(costEvents);
|
||||
await db.delete(environmentLeases);
|
||||
await db.delete(environments);
|
||||
await db.delete(issueComments);
|
||||
await db.delete(issueDocuments);
|
||||
await db.delete(documentRevisions);
|
||||
|
|
@ -466,6 +470,48 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
|||
return { companyId, agentId, runId, wakeupRequestId, issueId };
|
||||
}
|
||||
|
||||
async function seedEnvironmentLeaseFixture(input: {
|
||||
companyId: string;
|
||||
runId: string;
|
||||
issueId: string;
|
||||
provider?: string;
|
||||
}) {
|
||||
const environmentId = randomUUID();
|
||||
const leaseId = randomUUID();
|
||||
const now = new Date("2026-03-19T00:00:00.000Z");
|
||||
|
||||
await db.insert(environments).values({
|
||||
id: environmentId,
|
||||
companyId: input.companyId,
|
||||
name: "Local test environment",
|
||||
driver: "local",
|
||||
status: "active",
|
||||
config: {},
|
||||
metadata: null,
|
||||
});
|
||||
|
||||
await db.insert(environmentLeases).values({
|
||||
id: leaseId,
|
||||
companyId: input.companyId,
|
||||
environmentId,
|
||||
issueId: input.issueId,
|
||||
heartbeatRunId: input.runId,
|
||||
status: "active",
|
||||
leasePolicy: "ephemeral",
|
||||
provider: input.provider ?? "local",
|
||||
providerLeaseId: null,
|
||||
acquiredAt: now,
|
||||
lastUsedAt: now,
|
||||
metadata: {
|
||||
driver: "local",
|
||||
},
|
||||
createdAt: now,
|
||||
updatedAt: now,
|
||||
});
|
||||
|
||||
return { environmentId, leaseId };
|
||||
}
|
||||
|
||||
async function seedStrandedIssueFixture(input: {
|
||||
status: "todo" | "in_progress";
|
||||
runStatus: "failed" | "timed_out" | "cancelled" | "succeeded";
|
||||
|
|
@ -877,6 +923,30 @@ describeEmbeddedPostgres("heartbeat orphaned process recovery", () => {
|
|||
expect(issue?.checkoutRunId).toBe(runId);
|
||||
});
|
||||
|
||||
it("releases active environment leases when an orphaned run is reaped", async () => {
|
||||
const { runId, issueId, companyId } = await seedRunFixture({
|
||||
processPid: 999_999_999,
|
||||
});
|
||||
const { leaseId } = await seedEnvironmentLeaseFixture({
|
||||
companyId,
|
||||
runId,
|
||||
issueId,
|
||||
});
|
||||
const heartbeat = heartbeatService(db);
|
||||
|
||||
const result = await heartbeat.reapOrphanedRuns();
|
||||
expect(result.reaped).toBe(1);
|
||||
expect(result.runIds).toEqual([runId]);
|
||||
|
||||
const lease = await db
|
||||
.select()
|
||||
.from(environmentLeases)
|
||||
.where(eq(environmentLeases.id, leaseId))
|
||||
.then((rows) => rows[0] ?? null);
|
||||
expect(lease?.status).toBe("failed");
|
||||
expect(lease?.releasedAt).toBeTruthy();
|
||||
});
|
||||
|
||||
it.skipIf(process.platform === "win32")("reaps orphaned descendant process groups when the parent pid is already gone", async () => {
|
||||
const orphan = await spawnOrphanedProcessGroup();
|
||||
cleanupPids.add(orphan.descendantPid);
|
||||
|
|
|
|||
88
server/src/__tests__/heartbeat-runtime-state.test.ts
Normal file
88
server/src/__tests__/heartbeat-runtime-state.test.ts
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
import { randomUUID } from "node:crypto";
|
||||
import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest";
|
||||
import { eq } from "drizzle-orm";
|
||||
import {
|
||||
agents,
|
||||
agentRuntimeState,
|
||||
agentWakeupRequests,
|
||||
companies,
|
||||
createDb,
|
||||
heartbeatRunEvents,
|
||||
heartbeatRuns,
|
||||
} from "@paperclipai/db";
|
||||
import {
|
||||
getEmbeddedPostgresTestSupport,
|
||||
startEmbeddedPostgresTestDatabase,
|
||||
} from "./helpers/embedded-postgres.js";
|
||||
import { heartbeatService } from "../services/heartbeat.ts";
|
||||
|
||||
const embeddedPostgresSupport = await getEmbeddedPostgresTestSupport();
|
||||
const describeEmbeddedPostgres = embeddedPostgresSupport.supported ? describe : describe.skip;
|
||||
|
||||
if (!embeddedPostgresSupport.supported) {
|
||||
console.warn(
|
||||
`Skipping embedded Postgres heartbeat runtime-state tests on this host: ${embeddedPostgresSupport.reason ?? "unsupported environment"}`,
|
||||
);
|
||||
}
|
||||
|
||||
describeEmbeddedPostgres("heartbeat runtime state deduplication", () => {
|
||||
let db!: ReturnType<typeof createDb>;
|
||||
let tempDb: Awaited<ReturnType<typeof startEmbeddedPostgresTestDatabase>> | null = null;
|
||||
|
||||
beforeAll(async () => {
|
||||
tempDb = await startEmbeddedPostgresTestDatabase("heartbeat-runtime-state-");
|
||||
db = createDb(tempDb.connectionString);
|
||||
}, 20_000);
|
||||
|
||||
afterEach(async () => {
|
||||
await db.delete(heartbeatRunEvents);
|
||||
await db.delete(heartbeatRuns);
|
||||
await db.delete(agentWakeupRequests);
|
||||
await db.delete(agentRuntimeState);
|
||||
await db.delete(agents);
|
||||
await db.delete(companies);
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await tempDb?.cleanup();
|
||||
});
|
||||
|
||||
it("deduplicates concurrent runtime-state creation", async () => {
|
||||
const companyId = randomUUID();
|
||||
const agentId = randomUUID();
|
||||
const issuePrefix = `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`;
|
||||
|
||||
await db.insert(companies).values({
|
||||
id: companyId,
|
||||
name: "Paperclip",
|
||||
issuePrefix,
|
||||
requireBoardApprovalForNewAgents: false,
|
||||
});
|
||||
|
||||
await db.insert(agents).values({
|
||||
id: agentId,
|
||||
companyId,
|
||||
name: "CodexCoder",
|
||||
role: "engineer",
|
||||
status: "idle",
|
||||
adapterType: "codex_local",
|
||||
adapterConfig: {},
|
||||
runtimeConfig: {},
|
||||
permissions: {},
|
||||
});
|
||||
|
||||
const heartbeat = heartbeatService(db);
|
||||
const results = await Promise.all(Array.from({ length: 12 }, () => heartbeat.getRuntimeState(agentId)));
|
||||
|
||||
expect(results.every((row) => row?.agentId === agentId)).toBe(true);
|
||||
|
||||
const rows = await db.select().from(agentRuntimeState).where(eq(agentRuntimeState.agentId, agentId));
|
||||
expect(rows).toHaveLength(1);
|
||||
expect(rows[0]).toMatchObject({
|
||||
agentId,
|
||||
companyId,
|
||||
adapterType: "codex_local",
|
||||
stateJson: {},
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -2116,6 +2116,109 @@ describeEmbeddedPostgres("issueService.create workspace inheritance", () => {
|
|||
mode: "operator_branch",
|
||||
});
|
||||
});
|
||||
|
||||
it("syncs reused execution workspace config when issue workspace settings are updated", async () => {
|
||||
const companyId = randomUUID();
|
||||
const projectId = randomUUID();
|
||||
const projectWorkspaceId = randomUUID();
|
||||
const executionWorkspaceId = randomUUID();
|
||||
const issueId = randomUUID();
|
||||
|
||||
await db.insert(companies).values({
|
||||
id: companyId,
|
||||
name: "Paperclip",
|
||||
issuePrefix: `T${companyId.replace(/-/g, "").slice(0, 6).toUpperCase()}`,
|
||||
requireBoardApprovalForNewAgents: false,
|
||||
});
|
||||
await instanceSettingsService(db).updateExperimental({ enableIsolatedWorkspaces: true });
|
||||
|
||||
await db.insert(projects).values({
|
||||
id: projectId,
|
||||
companyId,
|
||||
name: "Workspace project",
|
||||
status: "in_progress",
|
||||
});
|
||||
|
||||
await db.insert(projectWorkspaces).values({
|
||||
id: projectWorkspaceId,
|
||||
companyId,
|
||||
projectId,
|
||||
name: "Primary workspace",
|
||||
});
|
||||
|
||||
await db.insert(executionWorkspaces).values({
|
||||
id: executionWorkspaceId,
|
||||
companyId,
|
||||
projectId,
|
||||
projectWorkspaceId,
|
||||
mode: "isolated_workspace",
|
||||
strategyType: "git_worktree",
|
||||
name: "Issue worktree",
|
||||
status: "active",
|
||||
providerType: "git_worktree",
|
||||
metadata: {
|
||||
config: {
|
||||
environmentId: "env-old",
|
||||
provisionCommand: "bash ./scripts/provision-old.sh",
|
||||
teardownCommand: "bash ./scripts/teardown-old.sh",
|
||||
workspaceRuntime: { profile: "old" },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await db.insert(issues).values({
|
||||
id: issueId,
|
||||
companyId,
|
||||
projectId,
|
||||
projectWorkspaceId,
|
||||
title: "Recovery issue",
|
||||
status: "in_progress",
|
||||
priority: "medium",
|
||||
executionWorkspaceId,
|
||||
executionWorkspacePreference: "reuse_existing",
|
||||
executionWorkspaceSettings: {
|
||||
mode: "isolated_workspace",
|
||||
environmentId: "env-old",
|
||||
workspaceStrategy: {
|
||||
type: "git_worktree",
|
||||
provisionCommand: "bash ./scripts/provision-old.sh",
|
||||
teardownCommand: "bash ./scripts/teardown-old.sh",
|
||||
},
|
||||
workspaceRuntime: { profile: "old" },
|
||||
},
|
||||
});
|
||||
|
||||
await svc.update(issueId, {
|
||||
executionWorkspaceSettings: {
|
||||
mode: "isolated_workspace",
|
||||
environmentId: "env-new",
|
||||
workspaceStrategy: {
|
||||
type: "cloud_sandbox",
|
||||
provisionCommand: "bash ./scripts/provision-new.sh",
|
||||
teardownCommand: "bash ./scripts/teardown-new.sh",
|
||||
},
|
||||
workspaceRuntime: { profile: "new" },
|
||||
},
|
||||
});
|
||||
|
||||
const workspace = await db
|
||||
.select({ metadata: executionWorkspaces.metadata })
|
||||
.from(executionWorkspaces)
|
||||
.where(eq(executionWorkspaces.id, executionWorkspaceId))
|
||||
.then((rows) => rows[0] ?? null);
|
||||
|
||||
expect(workspace?.metadata).toEqual({
|
||||
config: {
|
||||
environmentId: "env-new",
|
||||
provisionCommand: "bash ./scripts/provision-new.sh",
|
||||
teardownCommand: "bash ./scripts/teardown-new.sh",
|
||||
cleanupCommand: null,
|
||||
workspaceRuntime: { profile: "new" },
|
||||
desiredState: null,
|
||||
serviceStates: null,
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describeEmbeddedPostgres("issueService.findMentionedProjectIds", () => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue