[codex] Fix stale issue execution run locks (#4258)

## Thinking Path

> - Paperclip is a control plane for AI-agent companies, so issue
checkout and execution ownership are core safety contracts.
> - The affected subsystem is the issue service and route layer that
gates agent writes by `checkoutRunId` and `executionRunId`.
> - PAP-1982 exposed a stale-lock failure mode where a terminal
heartbeat run could leave `executionRunId` pinned after checkout
ownership had moved or been cleared.
> - That stale execution lock could reject legitimate
PATCH/comment/release requests from the rightful assignee after a
harness restart.
> - This pull request centralizes terminal-run cleanup, applies it
before ownership-gated writes, and adds a board-only recovery endpoint
for operator intervention.
> - The benefit is that crashed or terminal runs no longer strand issues
behind stale execution locks, while live execution locks still block
conflicting writes.

## What Changed

- Added `issueService.clearExecutionRunIfTerminal()` to atomically lock
the issue/run rows and clear terminal or missing execution-run locks.
- Reused stale execution-lock cleanup from checkout,
`assertCheckoutOwner()`, and `release()`.
- Allowed the same assigned agent/current run to adopt an unowned
`in_progress` checkout after stale execution-lock cleanup.
- Updated release to clear `executionRunId`, `executionAgentNameKey`,
and `executionLockedAt`.
- Added board-only `POST /api/issues/:id/admin/force-release` with
company access checks, optional `clearAssignee=true`, and
`issue.admin_force_release` audit logging.
- Added embedded Postgres service tests and route integration tests for
stale-lock recovery, release behavior, and admin force-release
authorization/audit behavior.
- Documented the new force-release API in `doc/SPEC-implementation.md`.

## Verification

- `pnpm vitest run server/src/__tests__/issues-service.test.ts
server/src/__tests__/issue-stale-execution-lock-routes.test.ts` passed.
- `pnpm vitest run
server/src/__tests__/issue-stale-execution-lock-routes.test.ts
server/src/__tests__/approval-routes-idempotency.test.ts
server/src/__tests__/issue-comment-reopen-routes.test.ts
server/src/__tests__/issue-telemetry-routes.test.ts` passed.
- `pnpm -r typecheck` passed.
- `pnpm build` passed.
- `git diff --check` passed.
- `pnpm lint` could not run because this repo has no `lint` command.
- Full `pnpm test:run` completed with 4 failures in existing route
suites: `approval-routes-idempotency.test.ts` (2),
`issue-comment-reopen-routes.test.ts` (1), and
`issue-telemetry-routes.test.ts` (1). Those same files pass when run
isolated and when run together with the new stale-lock route test, so
this appears to be a whole-suite ordering/mock-isolation issue outside
this patch path.

## Risks

- Medium: this changes ownership-gated write behavior. The new adoption
path is limited to the current run, the current assignee, `in_progress`
issues, and rows with no checkout owner after terminal-lock cleanup.
- Low: the admin force-release endpoint is board-only and
company-scoped, but misuse can intentionally clear a live lock. It
writes an audit event with prior lock IDs.
- No schema or migration changes.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and
discuss it in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected — check the roadmap
first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5 coding agent (`gpt-5`), agentic coding with
terminal/tool use and local test execution.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge
This commit is contained in:
Dotta 2026-04-22 10:43:38 -05:00 committed by GitHub
parent a957394420
commit b69b563aa8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 631 additions and 38 deletions

View file

@ -1242,7 +1242,87 @@ export function issueService(db: Db) {
return adopted;
}
async function adoptUnownedCheckoutRun(input: {
issueId: string;
actorAgentId: string;
actorRunId: string;
}) {
const now = new Date();
const adopted = await db
.update(issues)
.set({
checkoutRunId: input.actorRunId,
executionRunId: input.actorRunId,
executionLockedAt: now,
updatedAt: now,
})
.where(
and(
eq(issues.id, input.issueId),
eq(issues.status, "in_progress"),
eq(issues.assigneeAgentId, input.actorAgentId),
isNull(issues.checkoutRunId),
or(isNull(issues.executionRunId), eq(issues.executionRunId, input.actorRunId)),
),
)
.returning({
id: issues.id,
status: issues.status,
assigneeAgentId: issues.assigneeAgentId,
checkoutRunId: issues.checkoutRunId,
executionRunId: issues.executionRunId,
})
.then((rows) => rows[0] ?? null);
return adopted;
}
async function clearExecutionRunIfTerminal(issueId: string): Promise<boolean> {
return db.transaction(async (tx) => {
await tx.execute(
sql`select ${issues.id} from ${issues} where ${issues.id} = ${issueId} for update`,
);
const issue = await tx
.select({ executionRunId: issues.executionRunId })
.from(issues)
.where(eq(issues.id, issueId))
.then((rows) => rows[0] ?? null);
if (!issue?.executionRunId) return false;
await tx.execute(
sql`select ${heartbeatRuns.id} from ${heartbeatRuns} where ${heartbeatRuns.id} = ${issue.executionRunId} for update`,
);
const run = await tx
.select({ status: heartbeatRuns.status })
.from(heartbeatRuns)
.where(eq(heartbeatRuns.id, issue.executionRunId))
.then((rows) => rows[0] ?? null);
if (run && !TERMINAL_HEARTBEAT_RUN_STATUSES.has(run.status)) return false;
const updated = await tx
.update(issues)
.set({
executionRunId: null,
executionAgentNameKey: null,
executionLockedAt: null,
updatedAt: new Date(),
})
.where(
and(
eq(issues.id, issueId),
eq(issues.executionRunId, issue.executionRunId),
),
)
.returning({ id: issues.id })
.then((rows) => rows[0] ?? null);
return Boolean(updated);
});
}
return {
clearExecutionRunIfTerminal,
list: async (companyId: string, filters?: IssueFilters) => {
const conditions = [eq(issues.companyId, companyId)];
const limit = typeof filters?.limit === "number" && Number.isFinite(filters.limit)
@ -2112,38 +2192,7 @@ export function issueService(db: Db) {
const now = new Date();
// Fix C: staleness detection — if executionRunId references a run that is no
// longer queued or running, clear it before applying the execution lock condition
// so a dead lock can't produce a spurious 409.
// Wrapped in a transaction with SELECT FOR UPDATE to make the read + clear atomic,
// matching the existing pattern in enqueueWakeup().
await db.transaction(async (tx) => {
await tx.execute(
sql`select id from issues where id = ${id} for update`,
);
const preCheckRow = await tx
.select({ executionRunId: issues.executionRunId })
.from(issues)
.where(eq(issues.id, id))
.then((rows) => rows[0] ?? null);
if (!preCheckRow?.executionRunId) return;
const lockRun = await tx
.select({ id: heartbeatRuns.id, status: heartbeatRuns.status })
.from(heartbeatRuns)
.where(eq(heartbeatRuns.id, preCheckRow.executionRunId))
.then((rows) => rows[0] ?? null);
if (!lockRun || (lockRun.status !== "queued" && lockRun.status !== "running")) {
await tx
.update(issues)
.set({ executionRunId: null, executionAgentNameKey: null, executionLockedAt: null, updatedAt: now })
.where(
and(
eq(issues.id, id),
eq(issues.executionRunId, preCheckRow.executionRunId),
),
);
}
});
await clearExecutionRunIfTerminal(id);
const dependencyReadiness = await listIssueDependencyReadinessMap(db, issueCompany.companyId, [id]);
const unresolvedBlockerIssueIds = dependencyReadiness.get(id)?.unresolvedBlockerIssueIds ?? [];
@ -2272,12 +2321,14 @@ export function issueService(db: Db) {
},
assertCheckoutOwner: async (id: string, actorAgentId: string, actorRunId: string | null) => {
await clearExecutionRunIfTerminal(id);
const current = await db
.select({
id: issues.id,
status: issues.status,
assigneeAgentId: issues.assigneeAgentId,
checkoutRunId: issues.checkoutRunId,
executionRunId: issues.executionRunId,
})
.from(issues)
.where(eq(issues.id, id))
@ -2293,6 +2344,27 @@ export function issueService(db: Db) {
return { ...current, adoptedFromRunId: null as string | null };
}
if (
actorRunId &&
current.status === "in_progress" &&
current.assigneeAgentId === actorAgentId &&
current.checkoutRunId == null &&
(current.executionRunId == null || current.executionRunId === actorRunId)
) {
const adopted = await adoptUnownedCheckoutRun({
issueId: id,
actorAgentId,
actorRunId,
});
if (adopted) {
return {
...adopted,
adoptedFromRunId: null as string | null,
};
}
}
if (
actorRunId &&
current.status === "in_progress" &&
@ -2320,12 +2392,14 @@ export function issueService(db: Db) {
status: current.status,
assigneeAgentId: current.assigneeAgentId,
checkoutRunId: current.checkoutRunId,
executionRunId: current.executionRunId,
actorAgentId,
actorRunId,
});
},
release: async (id: string, actorAgentId?: string, actorRunId?: string | null) => {
await clearExecutionRunIfTerminal(id);
const existing = await db
.select()
.from(issues)
@ -2343,12 +2417,15 @@ export function issueService(db: Db) {
existing.checkoutRunId &&
!sameRunLock(existing.checkoutRunId, actorRunId ?? null)
) {
throw conflict("Only checkout run can release issue", {
issueId: existing.id,
assigneeAgentId: existing.assigneeAgentId,
checkoutRunId: existing.checkoutRunId,
actorRunId: actorRunId ?? null,
});
const stale = await isTerminalOrMissingHeartbeatRun(existing.checkoutRunId);
if (!stale) {
throw conflict("Only checkout run can release issue", {
issueId: existing.id,
assigneeAgentId: existing.assigneeAgentId,
checkoutRunId: existing.checkoutRunId,
actorRunId: actorRunId ?? null,
});
}
}
const updated = await db
@ -2357,6 +2434,9 @@ export function issueService(db: Db) {
status: "todo",
assigneeAgentId: null,
checkoutRunId: null,
executionRunId: null,
executionAgentNameKey: null,
executionLockedAt: null,
updatedAt: new Date(),
})
.where(eq(issues.id, id))
@ -2367,6 +2447,51 @@ export function issueService(db: Db) {
return enriched;
},
adminForceRelease: async (id: string, options: { clearAssignee?: boolean } = {}) =>
db.transaction(async (tx) => {
await tx.execute(
sql`select ${issues.id} from ${issues} where ${issues.id} = ${id} for update`,
);
const existing = await tx
.select({
id: issues.id,
checkoutRunId: issues.checkoutRunId,
executionRunId: issues.executionRunId,
})
.from(issues)
.where(eq(issues.id, id))
.then((rows) => rows[0] ?? null);
if (!existing) return null;
const patch: Partial<typeof issues.$inferInsert> = {
checkoutRunId: null,
executionRunId: null,
executionAgentNameKey: null,
executionLockedAt: null,
updatedAt: new Date(),
};
if (options.clearAssignee) {
patch.assigneeAgentId = null;
}
const updated = await tx
.update(issues)
.set(patch)
.where(eq(issues.id, id))
.returning()
.then((rows) => rows[0] ?? null);
if (!updated) return null;
const [enriched] = await withIssueLabels(tx, [updated]);
return {
issue: enriched,
previous: {
checkoutRunId: existing.checkoutRunId,
executionRunId: existing.executionRunId,
},
};
}),
listLabels: (companyId: string) =>
db.select().from(labels).where(eq(labels.companyId, companyId)).orderBy(asc(labels.name), asc(labels.id)),