mirror of
https://github.com/alkimake/paperclip.git
synced 2026-06-16 10:50:38 +09:00
Fix continuation recovery retry streaks by failure cause (#7031)
## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - The recovery subsystem is responsible for keeping assigned work moving when a live heartbeat run disappears or fails.
> - `continuation_recovery` is the path that re-enqueues stranded `in_progress` issues after an interrupted continuation attempt.
> - That path recently gained cause-aware retry classes and transient retry caps, but the streak counter was still aggregating mixed failure causes into one retry history.
> - That meant a sequence like `timeout -> timeout -> adapter_failed -> adapter_failed` could escalate as a false `3x adapter_failed` streak even though the latest cause had only happened twice.
> - This pull request makes continuation retry streaks count only consecutive failures whose `errorCode` matches the latest run and adds a regression test for the mixed-cause case.
> - The benefit is that transient retry backoff and escalation now match the actual current failure cause instead of inheriting stale budget from unrelated failures.

## What Changed

- Updated `summarizeRecentContinuationRetries(...)` to stop counting as soon as the continuation failure cause no longer matches the latest run's `errorCode`.
- Wired the continuation recovery escalation/backoff path to pass the latest classified `errorCode` into the retry streak summarizer.
- Added a regression test proving mixed-cause continuation failures do not consume the transient retry cap for a new failure cause.

## Verification

- `pnpm exec vitest run server/src/__tests__/heartbeat-process-recovery.test.ts`

## Risks

- Low risk. The behavioral change is intentionally narrow, but any future continuation retry modes that rely on `errorCode = null` will now be counted as a separate streak bucket and should be kept in mind when adding new retry classifications.
## Model Used

- OpenAI Codex via Paperclip `codex_local` (GPT-5-based Codex coding agent; exact backend revision is not surfaced in the runtime), with tool use, shell execution, and patch application in the local repository.

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after screenshots
- [ ] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
parent
aea35fe695
commit
911a1e8b0d
2 changed files with 427 additions and 19 deletions
|
|
@ -159,6 +159,54 @@ function didAutomaticRecoveryFail(
|
|||
);
|
||||
}
|
||||
|
||||
/**
 * Run error codes that are treated as transient infrastructure failures when
 * a continuation run fails. Runs failing with one of these codes get the
 * transient attempt cap and a non-zero base backoff.
 */
const TRANSIENT_INFRA_CONTINUATION_ERROR_CODES: ReadonlySet<string> = new Set<string>([
  "adapter_failed",
  "codex_transient_upstream",
  "claude_transient_upstream",
  "timeout",
]);

/**
 * Run error codes for which an automatic continuation retry is never
 * attempted; these are classified with a zero attempt budget.
 */
const NON_RETRYABLE_CONTINUATION_ERROR_CODES: ReadonlySet<string> = new Set<string>([
  "agent_not_invokable",
  "agent_not_found",
  "budget_blocked",
  "budget_exhausted",
  "issue_paused",
  "issue_dependencies_blocked",
]);

/** Attempt cap applied to failures classified as transient infrastructure. */
const CONTINUATION_RECOVERY_TRANSIENT_MAX_ATTEMPTS = 3;

/** Attempt cap applied to failures that match neither error-code set. */
const CONTINUATION_RECOVERY_DEFAULT_MAX_ATTEMPTS = 1;

/** Base backoff, in milliseconds, for transient infrastructure failures. */
const CONTINUATION_RECOVERY_TRANSIENT_BASE_BACKOFF_MS = 60_000;
|
||||
|
||||
/** Retry policy derived from the error code of a failed continuation run. */
type ContinuationRetryClassification = {
  /** Which retry class the failure fell into. */
  kind: "transient_infra" | "non_retryable" | "default";
  /** Attempt cap for this class; `0` for the non-retryable class. */
  maxAttempts: number;
  /** Base backoff in milliseconds; `0` when the class has no backoff. */
  baseBackoffMs: number;
  /** Normalized error code of the classified run, or `null` when it has none. */
  errorCode: string | null;
};
|
||||
|
||||
/**
 * Classifies the latest continuation run's failure into a retry policy.
 *
 * The run's `errorCode` is normalized with `readNonEmptyString` and checked
 * against the non-retryable set first, then the transient-infrastructure set.
 * Any other code (or no code at all) falls through to the default policy.
 *
 * @param latestRun - Latest run for the issue; `errorCode` is read with
 *   optional chaining.
 * @returns The retry class together with its attempt cap, base backoff, and
 *   the normalized error code that was classified.
 */
function classifyContinuationFailure(latestRun: LatestIssueRun): ContinuationRetryClassification {
  const errorCode = readNonEmptyString(latestRun?.errorCode);

  // Non-retryable codes win over everything else: no attempts, no backoff.
  if (errorCode && NON_RETRYABLE_CONTINUATION_ERROR_CODES.has(errorCode)) {
    return { kind: "non_retryable", maxAttempts: 0, baseBackoffMs: 0, errorCode };
  }

  if (errorCode && TRANSIENT_INFRA_CONTINUATION_ERROR_CODES.has(errorCode)) {
    return {
      kind: "transient_infra",
      maxAttempts: CONTINUATION_RECOVERY_TRANSIENT_MAX_ATTEMPTS,
      baseBackoffMs: CONTINUATION_RECOVERY_TRANSIENT_BASE_BACKOFF_MS,
      errorCode,
    };
  }

  // Unknown or missing error code: default attempt cap, no backoff.
  return {
    kind: "default",
    maxAttempts: CONTINUATION_RECOVERY_DEFAULT_MAX_ATTEMPTS,
    baseBackoffMs: 0,
    errorCode,
  };
}
|
||||
|
||||
function successfulRunHandoffRecoveryEvidence(latestRun: LatestIssueRun): SuccessfulRunHandoffRecoveryEvidence | null {
|
||||
if (!latestRun) return null;
|
||||
|
||||
|
|
@ -438,6 +486,54 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
.then((rows) => rows[0] ?? null);
|
||||
}
|
||||
|
||||
/**
 * Counts the current streak of failed continuation retries for an issue that
 * share one failure cause.
 *
 * Loads the ten most recently created heartbeat runs whose context snapshot
 * references `issueId` (newest first) and walks them until the streak is
 * broken. A run extends the streak only when all of the following hold:
 *   - its context `retryReason` is `"issue_continuation_needed"`,
 *   - its status is one of `UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES`,
 *   - its normalized `errorCode` equals `errorCodeToMatch` (`null` matches
 *     only runs without an error code).
 *
 * @param companyId - Company that owns the runs.
 * @param issueId - Issue whose runs are inspected.
 * @param errorCodeToMatch - Normalized error code of the latest run; runs
 *   with a different cause end the streak instead of being counted.
 * @returns `consecutive`, the streak length, and `latestFinishedAt`, the
 *   first non-null `finishedAt` found among the counted runs (newest first),
 *   or `null` when none is available.
 */
async function summarizeRecentContinuationRetries(
  companyId: string,
  issueId: string,
  errorCodeToMatch: string | null,
): Promise<{ consecutive: number; latestFinishedAt: Date | null }> {
  const rows = await db
    .select({
      id: heartbeatRuns.id,
      status: heartbeatRuns.status,
      errorCode: heartbeatRuns.errorCode,
      contextSnapshot: heartbeatRuns.contextSnapshot,
      finishedAt: heartbeatRuns.finishedAt,
    })
    .from(heartbeatRuns)
    .where(
      and(
        eq(heartbeatRuns.companyId, companyId),
        sql`${heartbeatRuns.contextSnapshot} ->> 'issueId' = ${issueId}`,
      ),
    )
    // Newest first; `id` breaks ties between runs created at the same instant.
    .orderBy(desc(heartbeatRuns.createdAt), desc(heartbeatRuns.id))
    .limit(10);

  let consecutive = 0;
  let latestFinishedAt: Date | null = null;
  for (const row of rows) {
    const ctx = parseObject(row.contextSnapshot);
    const retryReason = readNonEmptyString(ctx.retryReason);
    // Only automatic continuation retries belong to the streak.
    if (retryReason !== "issue_continuation_needed") break;
    if (
      !UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
        row.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
      )
    ) {
      break;
    }

    // A different failure cause starts a separate streak, so stop here rather
    // than charging older, unrelated failures against the current cause.
    const rowErrorCode = readNonEmptyString(row.errorCode);
    if (errorCodeToMatch !== rowErrorCode) {
      break;
    }

    consecutive += 1;
    if (latestFinishedAt === null) latestFinishedAt = row.finishedAt ?? null;
  }
  return { consecutive, latestFinishedAt };
}
|
||||
|
||||
async function hasActiveExecutionPath(companyId: string, issueId: string) {
|
||||
const [run, deferredWake] = await Promise.all([
|
||||
db
|
||||
|
|
@ -2545,24 +2641,69 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
}
|
||||
continue;
|
||||
}
|
||||
if (didAutomaticRecoveryFail(latestRun, "issue_continuation_needed")) {
|
||||
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
|
||||
const updated = await escalateStrandedAssignedIssue({
|
||||
issue,
|
||||
previousStatus: "in_progress",
|
||||
latestRun,
|
||||
comment:
|
||||
"Paperclip automatically retried continuation for this assigned `in_progress` issue after its live " +
|
||||
`execution disappeared, but it still has no live execution path.${failureSummary ?? ""} ` +
|
||||
"Moving it to `blocked` so it is visible for intervention.",
|
||||
});
|
||||
if (updated) {
|
||||
result.escalated += 1;
|
||||
result.issueIds.push(issue.id);
|
||||
} else {
|
||||
result.skipped += 1;
|
||||
if (isUnsuccessfulTerminalIssueRun(latestRun)) {
|
||||
const classification = classifyContinuationFailure(latestRun);
|
||||
|
||||
if (classification.kind === "non_retryable") {
|
||||
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
|
||||
const updated = await escalateStrandedAssignedIssue({
|
||||
issue,
|
||||
previousStatus: "in_progress",
|
||||
latestRun,
|
||||
comment:
|
||||
"Paperclip detected a non-retryable failure on this issue's continuation run " +
|
||||
`(\`${classification.errorCode}\`). Skipping automatic retries and moving it to \`blocked\` ` +
|
||||
`so it is visible for intervention.${failureSummary ?? ""}`,
|
||||
});
|
||||
if (updated) {
|
||||
result.escalated += 1;
|
||||
result.issueIds.push(issue.id);
|
||||
} else {
|
||||
result.skipped += 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (didAutomaticRecoveryFail(latestRun, "issue_continuation_needed")) {
|
||||
const { consecutive, latestFinishedAt } = await summarizeRecentContinuationRetries(
|
||||
issue.companyId,
|
||||
issue.id,
|
||||
classification.errorCode,
|
||||
);
|
||||
if (consecutive >= classification.maxAttempts) {
|
||||
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
|
||||
const attemptCopy = consecutive <= 1 ? "" : ` (${consecutive}× attempts)`;
|
||||
const causeCopy = classification.errorCode
|
||||
? ` Latest cause: \`${classification.errorCode}\`.`
|
||||
: "";
|
||||
const updated = await escalateStrandedAssignedIssue({
|
||||
issue,
|
||||
previousStatus: "in_progress",
|
||||
latestRun,
|
||||
comment:
|
||||
"Paperclip automatically retried continuation for this assigned `in_progress` issue after its live " +
|
||||
`execution disappeared, but it still has no live execution path${attemptCopy}.${causeCopy}${failureSummary ?? ""} ` +
|
||||
"Moving it to `blocked` so it is visible for intervention.",
|
||||
});
|
||||
if (updated) {
|
||||
result.escalated += 1;
|
||||
result.issueIds.push(issue.id);
|
||||
} else {
|
||||
result.skipped += 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (classification.baseBackoffMs > 0 && latestFinishedAt) {
|
||||
const elapsed = Date.now() - latestFinishedAt.getTime();
|
||||
const requiredDelay = classification.baseBackoffMs *
|
||||
Math.pow(2, Math.max(0, consecutive - 1));
|
||||
if (elapsed < requiredDelay) {
|
||||
result.skipped += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (await isInvocationBudgetBlocked(issue, agentId)) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue