mirror of
https://github.com/alkimake/paperclip.git
synced 2026-06-15 18:30:39 +09:00
[codex] Harden issue recovery reliability (#4875)
## Thinking Path

> - Paperclip is the control plane for autonomous agent companies, so non-terminal issue state must always have a clear live, waiting, or recovery owner.
> - This change stays inside the server reliability and liveness subsystem for assigned issue recovery, blocker attention, and live-run polling.
> - Closed PR #4860 mixed this reliability work with separate mutation-boundary policy changes, which made review and merge risk too broad.
> - [PAP-2981](/PAP/issues/PAP-2981) asked for a replacement PR containing only the remaining reliability slice and explicitly excluding user-assignment and execution-policy restrictions.
> - Follow-up review also split `advanced` run-liveness continuation behavior out of this PR so it can be reviewed separately.
> - The implementation hardens repeated recovery escalation, expands blocker-attention coverage for explicit waiting and recovery paths, and caps company live-run polling defaults.
> - The benefit is a smaller reliability PR that improves liveness behavior without changing agent/user mutation authorization boundaries or `advanced` continuation semantics.

## What Changed

- Avoid repeated liveness escalation updates when the source issue is already blocked by the same open escalation.
- Treat open liveness escalation recovery issues, their source issues, and their leaf blockers as covered waiting paths in blocker attention.
- Cap default company live-run polling at 50 rows for both `minCount` and `limit`, including explicit zero values, to avoid unbounded responses.
- Preserve the existing behavior where succeeded `advanced` runs are considered productive/healthy for stranded-work recovery and are not actionable bounded run-liveness continuations.
- Added focused server coverage for recovery dedupe, blocker attention, liveness escalation, run continuations, and live-run polling.

## Verification

- `pnpm install --frozen-lockfile`
- `pnpm exec vitest run server/src/__tests__/heartbeat-process-recovery.test.ts server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts server/src/__tests__/issue-blocker-attention.test.ts server/src/__tests__/run-continuations.test.ts server/src/__tests__/agent-live-run-routes.test.ts`
- Result: 5 files passed, 63 tests passed.
- `pnpm --filter @paperclipai/server typecheck`
- Result: passed.
- No UI changes; screenshots are not applicable.

## Risks

- Recovery and blocker-attention classification changes can affect which blocked chains are shown as covered versus needing attention.
- Live-run polling now treats omitted, invalid, or non-positive `limit` / `minCount` values as the capped default of 50.
- `advanced` run-liveness continuation behavior is intentionally excluded from this PR and split for separate review.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5, code execution and GitHub CLI tool use, medium reasoning effort.

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
parent
a3de1d764d
commit
ad5432fece
7 changed files with 210 additions and 32 deletions
|
|
@ -52,6 +52,7 @@ import {
|
|||
issueTreeControlService,
|
||||
type ActiveIssueTreePauseHoldGate,
|
||||
} from "./issue-tree-control.js";
|
||||
import { parseIssueGraphLivenessIncidentKey } from "./recovery/origins.js";
|
||||
|
||||
const ALL_ISSUE_STATUSES = ["backlog", "todo", "in_progress", "in_review", "blocked", "done", "cancelled"];
|
||||
const MAX_ISSUE_COMMENT_PAGE_LIMIT = 500;
|
||||
|
|
@ -1174,12 +1175,12 @@ async function listIssueBlockerAttentionMap(
|
|||
}
|
||||
}
|
||||
|
||||
const reviewNodeIds = [...nodesById.values()]
|
||||
.filter((node) => node.status === "in_review")
|
||||
const explicitWaitCandidateIds = [...nodesById.values()]
|
||||
.filter((node) => node.status !== "done")
|
||||
.map((node) => node.id);
|
||||
const explicitWaitingIssueIds = new Set<string>();
|
||||
if (reviewNodeIds.length > 0) {
|
||||
for (const chunk of chunkList(reviewNodeIds, ISSUE_LIST_RELATED_QUERY_CHUNK_SIZE)) {
|
||||
if (explicitWaitCandidateIds.length > 0) {
|
||||
for (const chunk of chunkList(explicitWaitCandidateIds, ISSUE_LIST_RELATED_QUERY_CHUNK_SIZE)) {
|
||||
const interactionRows: Array<{ issueId: string }> = await dbOrTx
|
||||
.select({ issueId: issueThreadInteractions.issueId })
|
||||
.from(issueThreadInteractions)
|
||||
|
|
@ -1204,22 +1205,28 @@ async function listIssueBlockerAttentionMap(
|
|||
),
|
||||
);
|
||||
for (const row of approvalRows) explicitWaitingIssueIds.add(row.issueId);
|
||||
}
|
||||
|
||||
const recoveryRows: Array<{ originId: string | null }> = await dbOrTx
|
||||
.select({ originId: issues.originId })
|
||||
.from(issues)
|
||||
.where(
|
||||
and(
|
||||
eq(issues.companyId, companyId),
|
||||
eq(issues.originKind, BLOCKER_ATTENTION_OPEN_RECOVERY_ORIGIN_KIND),
|
||||
isNull(issues.hiddenAt),
|
||||
inArray(issues.originId, chunk),
|
||||
notInArray(issues.status, BLOCKER_ATTENTION_OPEN_RECOVERY_TERMINAL_STATUSES),
|
||||
),
|
||||
);
|
||||
for (const row of recoveryRows) {
|
||||
if (row.originId) explicitWaitingIssueIds.add(row.originId);
|
||||
}
|
||||
// Recovery rows are intentionally company-wide: a liveness escalation for
|
||||
// the same leaf blocker represents an active waiting path even when that
|
||||
// blocker is reached through another blocked graph.
|
||||
const recoveryRows: Array<{ id: string; originId: string | null }> = await dbOrTx
|
||||
.select({ id: issues.id, originId: issues.originId })
|
||||
.from(issues)
|
||||
.where(
|
||||
and(
|
||||
eq(issues.companyId, companyId),
|
||||
eq(issues.originKind, BLOCKER_ATTENTION_OPEN_RECOVERY_ORIGIN_KIND),
|
||||
isNull(issues.hiddenAt),
|
||||
notInArray(issues.status, BLOCKER_ATTENTION_OPEN_RECOVERY_TERMINAL_STATUSES),
|
||||
),
|
||||
);
|
||||
for (const row of recoveryRows) {
|
||||
const parsed = parseIssueGraphLivenessIncidentKey(row.originId);
|
||||
if (!parsed || parsed.companyId !== companyId) continue;
|
||||
explicitWaitingIssueIds.add(row.id);
|
||||
explicitWaitingIssueIds.add(parsed.issueId);
|
||||
explicitWaitingIssueIds.add(parsed.leafIssueId);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1257,8 +1264,11 @@ async function listIssueBlockerAttentionMap(
|
|||
if (node.status === "done") {
|
||||
return { covered: true, stalled: false, sampleBlockerIdentifier: nodeSample, sampleStalledBlockerIdentifier: null };
|
||||
}
|
||||
if (explicitWaitingIssueIds.has(node.id)) {
|
||||
return { covered: true, stalled: false, sampleBlockerIdentifier: nodeSample, sampleStalledBlockerIdentifier: null };
|
||||
}
|
||||
if (node.status === "in_review") {
|
||||
const hasWaitingPath = activeIssueIds.has(node.id) || Boolean(node.assigneeUserId) || explicitWaitingIssueIds.has(node.id);
|
||||
const hasWaitingPath = activeIssueIds.has(node.id) || Boolean(node.assigneeUserId);
|
||||
if (hasWaitingPath) {
|
||||
return { covered: true, stalled: false, sampleBlockerIdentifier: nodeSample, sampleStalledBlockerIdentifier: null };
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue