[codex] Fix stale issue execution run locks (#4258)

## Thinking Path

> - Paperclip is a control plane for AI-agent companies, so issue
checkout and execution ownership are core safety contracts.
> - The affected subsystem is the issue service and route layer that
gates agent writes by `checkoutRunId` and `executionRunId`.
> - PAP-1982 exposed a stale-lock failure mode where a terminal
heartbeat run could leave `executionRunId` pinned after checkout
ownership had moved or been cleared.
> - That stale execution lock could reject legitimate
PATCH/comment/release requests from the rightful assignee after a
harness restart.
> - This pull request centralizes terminal-run cleanup, applies it
before ownership-gated writes, and adds a board-only recovery endpoint
for operator intervention.
> - The benefit is that crashed or terminal runs no longer strand issues
behind stale execution locks, while live execution locks still block
conflicting writes.

## What Changed

- Added `issueService.clearExecutionRunIfTerminal()` to atomically lock
the issue/run rows and clear terminal or missing execution-run locks.
- Reused stale execution-lock cleanup from checkout,
`assertCheckoutOwner()`, and `release()`.
- Allowed the same assigned agent/current run to adopt an unowned
`in_progress` checkout after stale execution-lock cleanup.
- Updated release to clear `executionRunId`, `executionAgentNameKey`,
and `executionLockedAt`.
- Added board-only `POST /api/issues/:id/admin/force-release` with
company access checks, optional `clearAssignee=true`, and
`issue.admin_force_release` audit logging.
- Added embedded Postgres service tests and route integration tests for
stale-lock recovery, release behavior, and admin force-release
authorization/audit behavior.
- Documented the new force-release API in `doc/SPEC-implementation.md`.

## Verification

- `pnpm vitest run server/src/__tests__/issues-service.test.ts
server/src/__tests__/issue-stale-execution-lock-routes.test.ts` passed.
- `pnpm vitest run
server/src/__tests__/issue-stale-execution-lock-routes.test.ts
server/src/__tests__/approval-routes-idempotency.test.ts
server/src/__tests__/issue-comment-reopen-routes.test.ts
server/src/__tests__/issue-telemetry-routes.test.ts` passed.
- `pnpm -r typecheck` passed.
- `pnpm build` passed.
- `git diff --check` passed.
- `pnpm lint` could not run because this repo has no `lint` command.
- Full `pnpm test:run` completed with 4 failures in existing route
suites: `approval-routes-idempotency.test.ts` (2),
`issue-comment-reopen-routes.test.ts` (1), and
`issue-telemetry-routes.test.ts` (1). Those same files pass when run
isolated and when run together with the new stale-lock route test, so
this appears to be a whole-suite ordering/mock-isolation issue outside
this patch path.

## Risks

- Medium: this changes ownership-gated write behavior. The new adoption
path is limited to the current run, the current assignee, `in_progress`
issues, and rows with no checkout owner after terminal-lock cleanup.
- Low: the admin force-release endpoint is board-only and
company-scoped, but it can deliberately clear a live lock, so misuse is
possible. Each call writes an audit event with the prior lock IDs.
- No schema or migration changes.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) and discuss
the change in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected. See
`CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5 coding agent (`gpt-5`), agentic coding with
terminal/tool use and local test execution.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge
This commit is contained in:
Dotta 2026-04-22 10:43:38 -05:00 committed by GitHub
parent a957394420
commit b69b563aa8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 631 additions and 38 deletions

View file

@@ -9,6 +9,7 @@ import {
createDb,
executionWorkspaces,
goals,
heartbeatRuns,
instanceSettings,
issueComments,
issueInboxArchives,
@@ -1879,3 +1880,135 @@ describeEmbeddedPostgres("issueService.findMentionedProjectIds", () => {
]);
});
});
/**
 * Embedded-Postgres suite for `issueService.clearExecutionRunIfTerminal`.
 *
 * Every test seeds a company, an agent and an `in_progress` issue (optionally
 * pointing at a heartbeat run), calls the service method, and then reads the
 * issue row back to check the three execution-lock columns.
 */
describeEmbeddedPostgres("issueService.clearExecutionRunIfTerminal", () => {
  let db!: ReturnType<typeof createDb>;
  let svc!: ReturnType<typeof issueService>;
  let tempDb: Awaited<ReturnType<typeof startEmbeddedPostgresTestDatabase>> | null = null;

  // One throwaway database is shared by the whole suite; 20s start-up budget.
  beforeAll(async () => {
    tempDb = await startEmbeddedPostgresTestDatabase("paperclip-issues-execution-lock-");
    db = createDb(tempDb.connectionString);
    svc = issueService(db);
  }, 20_000);

  // Wipe every table a test may have touched so tests stay independent.
  // NOTE(review): the deletion order is kept exactly as written; presumably
  // dependent tables are emptied before the tables they reference — confirm
  // against the schema before reordering.
  afterEach(async () => {
    await db.delete(issueComments);
    await db.delete(issueRelations);
    await db.delete(issueInboxArchives);
    await db.delete(activityLog);
    await db.delete(issues);
    await db.delete(heartbeatRuns);
    await db.delete(executionWorkspaces);
    await db.delete(projectWorkspaces);
    await db.delete(projects);
    await db.delete(goals);
    await db.delete(agents);
    await db.delete(instanceSettings);
    await db.delete(companies);
  });

  afterAll(async () => {
    await tempDb?.cleanup();
  });

  /**
   * Inserts a company, an agent and an `in_progress` issue assigned to that agent.
   *
   * @param status Status for the heartbeat run that holds the issue's execution
   *   lock. A falsy value (e.g. `null`) seeds no run and leaves the issue's
   *   execution-lock columns null.
   * @returns The seeded issue id and the run id (`null` when no run was seeded).
   */
  async function seedIssueWithRun(status: string | null) {
    const ids = {
      company: randomUUID(),
      agent: randomUUID(),
      issue: randomUUID(),
    };
    const runId = status ? randomUUID() : null;

    await db.insert(companies).values({
      id: ids.company,
      name: "Paperclip",
      issuePrefix: `T${ids.company.replace(/-/g, "").slice(0, 6).toUpperCase()}`,
      requireBoardApprovalForNewAgents: false,
    });

    await db.insert(agents).values({
      id: ids.agent,
      companyId: ids.company,
      name: "CodexCoder",
      role: "engineer",
      status: "active",
      adapterType: "codex_local",
      adapterConfig: {},
      runtimeConfig: {},
      permissions: {},
    });

    if (runId !== null) {
      await db.insert(heartbeatRuns).values({
        id: runId,
        companyId: ids.company,
        agentId: ids.agent,
        status,
        invocationSource: "manual",
      });
    }

    // The lock columns are either all populated (a run exists) or all null.
    const executionLock =
      runId === null
        ? { executionRunId: null, executionAgentNameKey: null, executionLockedAt: null }
        : {
            executionRunId: runId,
            executionAgentNameKey: "codexcoder",
            executionLockedAt: new Date(),
          };

    await db.insert(issues).values({
      id: ids.issue,
      companyId: ids.company,
      title: "Execution lock",
      status: "in_progress",
      priority: "medium",
      assigneeAgentId: ids.agent,
      ...executionLock,
    });

    return { issueId: ids.issue, runId };
  }

  /** Reads the three execution-lock columns of one issue (first matching row). */
  async function readExecutionLock(issueId: string) {
    const [lockRow] = await db
      .select({
        executionRunId: issues.executionRunId,
        executionAgentNameKey: issues.executionAgentNameKey,
        executionLockedAt: issues.executionLockedAt,
      })
      .from(issues)
      .where(eq(issues.id, issueId));
    return lockRow;
  }

  it("clears execution locks owned by terminal runs", async () => {
    const { issueId } = await seedIssueWithRun("failed");

    await expect(svc.clearExecutionRunIfTerminal(issueId)).resolves.toBe(true);

    const lockAfter = await readExecutionLock(issueId);
    expect(lockAfter).toEqual({
      executionRunId: null,
      executionAgentNameKey: null,
      executionLockedAt: null,
    });
  });

  it("does not clear execution locks owned by live runs", async () => {
    const { issueId, runId } = await seedIssueWithRun("running");

    await expect(svc.clearExecutionRunIfTerminal(issueId)).resolves.toBe(false);

    const lockAfter = await readExecutionLock(issueId);
    expect(lockAfter?.executionRunId).toBe(runId);
    expect(lockAfter?.executionAgentNameKey).toBe("codexcoder");
    expect(lockAfter?.executionLockedAt).toBeInstanceOf(Date);
  });

  it("does not update issues without an execution lock", async () => {
    const { issueId } = await seedIssueWithRun(null);

    await expect(svc.clearExecutionRunIfTerminal(issueId)).resolves.toBe(false);

    // Only two columns are selected here, so the equality check below is exact.
    const [unlockedRow] = await db
      .select({ executionRunId: issues.executionRunId, executionLockedAt: issues.executionLockedAt })
      .from(issues)
      .where(eq(issues.id, issueId));
    expect(unlockedRow).toEqual({ executionRunId: null, executionLockedAt: null });
  });
});