[codex] Recover productive terminal continuations (#4956)

## Thinking Path

> - Paperclip orchestrates AI agents through issue-scoped heartbeat runs
> - Recovery logic decides whether in-progress work still has a live
path after a terminal run
> - A productive terminal continuation can still leave an issue stranded
when no active run or wake remains
> - Treating that state as healthy leaves work stuck despite evidence
that more action is needed
> - This pull request re-enqueues recovery for productive terminal
continuations that left no live path
> - The benefit is fewer silently stranded in-progress issues after
agents make partial progress

## What Changed

- Reclassified successful-but-productive terminal continuations as
recoverable when no live path remains.
- Enqueued a follow-up recovery wake that carries the original run ID and
continuation metadata.
- Added regression tests covering productive terminal continuation
recovery and advanced liveness handoff.

## Verification

- Ran the targeted test suites:
  ```sh
  pnpm exec vitest run \
    server/src/__tests__/heartbeat-process-recovery.test.ts \
    server/src/__tests__/run-continuations.test.ts
  ```

## Risks

- Medium risk: recovery may schedule one more follow-up run in cases where
Paperclip previously only counted the productive continuation as observed
and took no further action. The existing uniqueness, budget, and escalation
checks still constrain retry loops.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) and discuss the
change in `#dev` before opening the PR. Feature PRs that overlap with
planned core work may need to be redirected. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5 coding agent, tool use and local command
execution. Exact context window was not exposed in the runtime.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Dotta 2026-05-01 11:57:23 -05:00 committed by GitHub
parent 3cd26a78fc
commit 570a4206da
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 204 additions and 12 deletions

View file

@ -74,6 +74,7 @@ type LatestIssueRun = Pick<
typeof heartbeatRuns.$inferSelect,
"id" | "agentId" | "status" | "error" | "errorCode" | "contextSnapshot" | "livenessState"
> | null;
/**
 * A non-null `LatestIssueRun` whose `status` is narrowed to the literal `"succeeded"`.
 * Used as the target of the `isSuccessfulInProgressContinuationRun` type predicate so that
 * code following that check can read run fields without further null handling.
 */
type SuccessfulLatestIssueRun = NonNullable<LatestIssueRun> & { status: "succeeded" };
type WatchdogDecisionActor =
| { type: "board"; userId?: string | null; runId?: string | null }
@ -188,7 +189,7 @@ function isUnsuccessfulTerminalIssueRun(latestRun: LatestIssueRun) {
);
}
function isSuccessfulInProgressContinuationRun(latestRun: LatestIssueRun) {
function isSuccessfulInProgressContinuationRun(latestRun: LatestIssueRun): latestRun is SuccessfulLatestIssueRun {
return latestRun?.status === "succeeded";
}
@ -200,6 +201,13 @@ function isProductiveContinuationRun(latestRun: LatestIssueRun) {
latestRun.livenessState === "needs_followup");
}
/**
 * Tells whether a succeeded run was itself a productive-terminal-continuation recovery
 * attempt that again ended as a productive continuation.
 *
 * The run's `contextSnapshot` must carry `retryReason === "issue_continuation_needed"` and
 * `source === "issue.productive_terminal_continuation_recovery"`, and the run must also
 * satisfy `isProductiveContinuationRun`.
 *
 * NOTE(review): these two values match the arguments passed when the recovery wake is
 * enqueued; presumably they are copied into the resulting run's context snapshot — confirm.
 *
 * @param latestRun - The most recent run for the issue, already narrowed to a succeeded run.
 * @returns `false` when either snapshot marker is missing or different; otherwise the
 *   result of `isProductiveContinuationRun(latestRun)`.
 */
function isRepeatedProductiveContinuationRecovery(latestRun: SuccessfulLatestIssueRun) {
  const snapshot = parseObject(latestRun.contextSnapshot);

  // Both markers are checked before the liveness classification, in the same order as a
  // short-circuiting `&&` chain would evaluate them.
  if (readNonEmptyString(snapshot.retryReason) !== "issue_continuation_needed") {
    return false;
  }
  if (readNonEmptyString(snapshot.source) !== "issue.productive_terminal_continuation_recovery") {
    return false;
  }

  return isProductiveContinuationRun(latestRun);
}
function parseLivenessIncidentKey(incidentKey: string | null | undefined) {
if (!incidentKey) return null;
return parseIssueGraphLivenessIncidentKey(incidentKey);
@ -1706,12 +1714,51 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
continue;
}
if (isSuccessfulInProgressContinuationRun(latestRun)) {
if (isProductiveContinuationRun(latestRun)) {
result.productiveContinuationObserved += 1;
} else {
const successfulRun = latestRun;
if (!isProductiveContinuationRun(successfulRun)) {
result.successfulContinuationObserved += 1;
result.skipped += 1;
continue;
}
if (isRepeatedProductiveContinuationRecovery(successfulRun)) {
const updated = await escalateStrandedAssignedIssue({
issue,
previousStatus: "in_progress",
latestRun: successfulRun,
comment:
"Paperclip automatically retried continuation for this assigned `in_progress` issue and the retry " +
"made progress, but it still has no live execution path. Moving it to `blocked` so it is visible for intervention.",
});
if (updated) {
result.escalated += 1;
result.issueIds.push(issue.id);
} else {
result.skipped += 1;
}
continue;
}
if (await isInvocationBudgetBlocked(issue, agentId)) {
result.skipped += 1;
continue;
}
const queued = await enqueueStrandedIssueRecovery({
issueId: issue.id,
agentId,
reason: "issue_continuation_needed",
retryReason: "issue_continuation_needed",
source: "issue.productive_terminal_continuation_recovery",
retryOfRunId: successfulRun.id,
});
if (queued) {
result.continuationRequeued += 1;
result.issueIds.push(issue.id);
} else {
result.skipped += 1;
}
result.skipped += 1;
continue;
}
if (didAutomaticRecoveryFail(latestRun, "issue_continuation_needed")) {