mirror of
https://github.com/alkimake/paperclip.git
synced 2026-06-16 02:40:39 +09:00
[codex] Harden recovery issue handling (#4600)
## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies
> - The control plane must recover stranded agent work without creating new operational loops
> - Stranded recovery issues can themselves fail, and exposing raw retry errors in comments can leak sensitive adapter details
> - New local companies also should not force a hire-approval gate unless operators enable that policy
> - This pull request hardens recovery issue handling, redacts retry failure details in issue copy, preserves `maxConcurrentRuns: 1`, and flips new-hire approval to an opt-in default
> - The benefit is safer automatic recovery and smoother default company setup without hidden migration conflicts

## What Changed

- Added migration `0071_default_hire_approval_off` and updated company schema/import/export/docs so hire approvals default off and serialize only when enabled.
- Added migration `0072_large_sandman` with a partial unique index preventing duplicate active stranded recovery issues for the same source issue.
- Blocked failed `stranded_issue_recovery` issues in place instead of creating nested recovery issues.
- Redacted latest retry failure details from recovery issue comments while still linking reviewers to run evidence.
- Allowed `maxConcurrentRuns: 1` to be honored by heartbeat concurrency normalization.
- Added focused regression coverage for recovery recursion, redaction, migration ordering, and concurrency behavior.
## Verification

- `pnpm --filter @paperclipai/db run check:migrations`
- `pnpm exec vitest run --project @paperclipai/server server/src/__tests__/recovery-classifiers.test.ts`
- `pnpm exec vitest run --project @paperclipai/server server/src/__tests__/company-portability.test.ts --pool=forks --poolOptions.forks.isolate=true`
- `pnpm exec vitest run --project @paperclipai/server server/src/__tests__/agent-permissions-routes.test.ts --pool=forks --poolOptions.forks.isolate=true`
- `pnpm --filter @paperclipai/server typecheck`
- `pnpm exec vitest run --project @paperclipai/server server/src/__tests__/heartbeat-process-recovery.test.ts --pool=forks --poolOptions.forks.isolate=true` exits 0, but this host skipped the embedded Postgres tests with the existing init guard.
- `pnpm exec vitest run --project @paperclipai/server server/src/__tests__/heartbeat-dependency-scheduling.test.ts --pool=forks --poolOptions.forks.isolate=true` exits 0, but this host skipped the embedded Postgres tests with the existing init guard.

## Risks

- Migration risk is low but this PR intentionally owns both new migrations to avoid separate PR migration-journal conflicts.
- Recovery comments now require operators to inspect linked run evidence for details instead of reading raw errors inline.
- The hire approval default changes behavior for newly created/imported companies only; existing persisted company settings are not changed except by the SQL default for future rows.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5 coding agent, tool-enabled terminal/GitHub workflow, reasoning mode active. Context window not exposed in this environment.
## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
parent
6ccf80bcf2
commit
7a9b3a6037
18 changed files with 16535 additions and 65 deletions
|
|
@ -2264,7 +2264,7 @@ function buildEnvInputMap(inputs: CompanyPortabilityEnvInput[]) {
|
|||
}
|
||||
|
||||
function readCompanyApprovalDefault(_frontmatter: Record<string, unknown>) {
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
function readIncludeEntries(frontmatter: Record<string, unknown>): CompanyPackageIncludeEntry[] {
|
||||
|
|
@ -3465,7 +3465,7 @@ export function companyPortabilityService(db: Db, storage?: StorageService) {
|
|||
company: stripEmptyValues({
|
||||
brandColor: company.brandColor ?? null,
|
||||
logoPath: companyLogoPath,
|
||||
requireBoardApprovalForNewAgents: company.requireBoardApprovalForNewAgents ? undefined : false,
|
||||
requireBoardApprovalForNewAgents: company.requireBoardApprovalForNewAgents ? true : undefined,
|
||||
feedbackDataSharingEnabled: company.feedbackDataSharingEnabled ? true : undefined,
|
||||
feedbackDataSharingConsentAt: company.feedbackDataSharingConsentAt?.toISOString() ?? null,
|
||||
feedbackDataSharingConsentByUserId: company.feedbackDataSharingConsentByUserId ?? null,
|
||||
|
|
@ -3986,8 +3986,8 @@ export function companyPortabilityService(db: Db, storage?: StorageService) {
|
|||
description: include.company ? (sourceManifest.company?.description ?? null) : null,
|
||||
brandColor: include.company ? (sourceManifest.company?.brandColor ?? null) : null,
|
||||
requireBoardApprovalForNewAgents: include.company
|
||||
? (sourceManifest.company?.requireBoardApprovalForNewAgents ?? true)
|
||||
: true,
|
||||
? (sourceManifest.company?.requireBoardApprovalForNewAgents ?? false)
|
||||
: false,
|
||||
feedbackDataSharingEnabled: include.company
|
||||
? (sourceManifest.company?.feedbackDataSharingEnabled ?? false)
|
||||
: false,
|
||||
|
|
|
|||
|
|
@ -101,6 +101,7 @@ import {
|
|||
} from "./execution-workspace-policy.js";
|
||||
import { instanceSettingsService } from "./instance-settings.js";
|
||||
import {
|
||||
RECOVERY_ORIGIN_KINDS,
|
||||
RUN_LIVENESS_CONTINUATION_REASON,
|
||||
buildRunLivenessContinuationIdempotencyKey,
|
||||
decideRunLivenessContinuation,
|
||||
|
|
@ -133,6 +134,7 @@ const MAX_RUN_EVENT_PAYLOAD_ARRAY_ITEMS = 50;
|
|||
const MAX_RUN_EVENT_PAYLOAD_OBJECT_KEYS = 100;
|
||||
const MAX_RUN_EVENT_PAYLOAD_DEPTH = 6;
|
||||
const HEARTBEAT_MAX_CONCURRENT_RUNS_DEFAULT = AGENT_DEFAULT_MAX_CONCURRENT_RUNS;
|
||||
const HEARTBEAT_MAX_CONCURRENT_RUNS_MIN = 1;
|
||||
const HEARTBEAT_MAX_CONCURRENT_RUNS_MAX = 10;
|
||||
const LIVENESS_BOOKKEEPING_ACTIVITY_ACTIONS = [
|
||||
"environment.lease_acquired",
|
||||
|
|
@ -848,7 +850,7 @@ export function compactRunLogChunk(chunk: string, maxChars = MAX_PERSISTED_LOG_C
|
|||
function normalizeMaxConcurrentRuns(value: unknown) {
|
||||
const parsed = Math.floor(asNumber(value, HEARTBEAT_MAX_CONCURRENT_RUNS_DEFAULT));
|
||||
if (!Number.isFinite(parsed)) return HEARTBEAT_MAX_CONCURRENT_RUNS_DEFAULT;
|
||||
return Math.max(HEARTBEAT_MAX_CONCURRENT_RUNS_DEFAULT, Math.min(HEARTBEAT_MAX_CONCURRENT_RUNS_MAX, parsed));
|
||||
return Math.max(HEARTBEAT_MAX_CONCURRENT_RUNS_MIN, Math.min(HEARTBEAT_MAX_CONCURRENT_RUNS_MAX, parsed));
|
||||
}
|
||||
|
||||
interface WakeupOptions {
|
||||
|
|
@ -6193,6 +6195,7 @@ export function heartbeatService(db: Db, options: HeartbeatServiceOptions = {})
|
|||
}
|
||||
|
||||
const shouldBlockImmediately =
|
||||
issue.originKind === RECOVERY_ORIGIN_KINDS.strandedIssueRecovery ||
|
||||
!recoveryAgentInvokable ||
|
||||
!recoveryAgent ||
|
||||
didAutomaticRecoveryFail(run, issue.status === "todo" ? "assignment_recovery" : "issue_continuation_needed");
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ export {
|
|||
RECOVERY_REASON_KINDS,
|
||||
buildIssueGraphLivenessIncidentKey,
|
||||
buildIssueGraphLivenessLeafKey,
|
||||
isStrandedIssueRecoveryOriginKind,
|
||||
parseIssueGraphLivenessIncidentKey,
|
||||
} from "./origins.js";
|
||||
export type {
|
||||
|
|
|
|||
|
|
@ -17,6 +17,10 @@ export type RecoveryOriginKind = typeof RECOVERY_ORIGIN_KINDS[keyof typeof RECOV
|
|||
export type RecoveryReasonKind = typeof RECOVERY_REASON_KINDS[keyof typeof RECOVERY_REASON_KINDS];
|
||||
export type RecoveryKeyPrefix = typeof RECOVERY_KEY_PREFIXES[keyof typeof RECOVERY_KEY_PREFIXES];
|
||||
|
||||
export function isStrandedIssueRecoveryOriginKind(originKind: string | null | undefined) {
|
||||
return originKind === RECOVERY_ORIGIN_KINDS.strandedIssueRecovery;
|
||||
}
|
||||
|
||||
export function buildIssueGraphLivenessIncidentKey(input: {
|
||||
companyId: string;
|
||||
issueId: string;
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import { getRunLogStore } from "../run-log-store.js";
|
|||
import {
|
||||
RECOVERY_ORIGIN_KINDS,
|
||||
buildIssueGraphLivenessLeafKey,
|
||||
isStrandedIssueRecoveryOriginKind,
|
||||
parseIssueGraphLivenessIncidentKey,
|
||||
} from "./origins.js";
|
||||
import {
|
||||
|
|
@ -101,22 +102,9 @@ function readNonEmptyString(value: unknown): string | null {
|
|||
function summarizeRunFailureForIssueComment(run: LatestIssueRun) {
|
||||
if (!run) return null;
|
||||
|
||||
const errorCode = readNonEmptyString(run.errorCode)?.trim() ?? null;
|
||||
const rawError = readNonEmptyString(run.error)?.trim() ?? null;
|
||||
const apiMessageMatch = rawError?.match(/"message"\s*:\s*"([^"]+)"/);
|
||||
const firstLine = rawError
|
||||
?.split(/\r?\n/)
|
||||
.map((line) => line.trim())
|
||||
.find(Boolean) ?? null;
|
||||
const summarySource = apiMessageMatch?.[1] ?? firstLine;
|
||||
const summary =
|
||||
summarySource && summarySource.length > 240
|
||||
? `${summarySource.slice(0, 237)}...`
|
||||
: summarySource;
|
||||
|
||||
if (errorCode && summary) return ` Latest retry failure: \`${errorCode}\` - ${summary}.`;
|
||||
if (errorCode) return ` Latest retry failure: \`${errorCode}\`.`;
|
||||
if (summary) return ` Latest retry failure: ${summary}.`;
|
||||
if (readNonEmptyString(run.error) || readNonEmptyString(run.errorCode)) {
|
||||
return " Latest retry failure details were withheld from the issue thread; inspect the linked run for evidence.";
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
@ -187,6 +175,19 @@ function isAgentInvokable(agent: typeof agents.$inferSelect | null | undefined)
|
|||
return Boolean(agent && !["paused", "terminated", "pending_approval"].includes(agent.status));
|
||||
}
|
||||
|
||||
function isStrandedIssueRecoveryIssue(issue: Pick<typeof issues.$inferSelect, "originKind">) {
|
||||
return isStrandedIssueRecoveryOriginKind(issue.originKind);
|
||||
}
|
||||
|
||||
function isUnsuccessfulTerminalIssueRun(latestRun: LatestIssueRun) {
|
||||
return Boolean(
|
||||
latestRun &&
|
||||
UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES.includes(
|
||||
latestRun.status as (typeof UNSUCCESSFUL_HEARTBEAT_RUN_TERMINAL_STATUSES)[number],
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
function parseLivenessIncidentKey(incidentKey: string | null | undefined) {
|
||||
if (!incidentKey) return null;
|
||||
return parseIssueGraphLivenessIncidentKey(incidentKey);
|
||||
|
|
@ -813,6 +814,16 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
);
|
||||
}
|
||||
|
||||
function isUniqueStrandedIssueRecoveryConflict(error: unknown) {
|
||||
if (!error || typeof error !== "object") return false;
|
||||
const maybe = error as { code?: string; constraint?: string; message?: string };
|
||||
return maybe.code === "23505" &&
|
||||
(
|
||||
maybe.constraint === "issues_active_stranded_issue_recovery_uq" ||
|
||||
typeof maybe.message === "string" && maybe.message.includes("issues_active_stranded_issue_recovery_uq")
|
||||
);
|
||||
}
|
||||
|
||||
async function ensureSourceIssueBlockedByStaleEvaluation(input: {
|
||||
sourceIssue: typeof issues.$inferSelect | null;
|
||||
evaluationIssue: { id: string; identifier: string | null };
|
||||
|
|
@ -1257,6 +1268,8 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
latestRun: LatestIssueRun;
|
||||
previousStatus: "todo" | "in_progress";
|
||||
}) {
|
||||
if (isStrandedIssueRecoveryIssue(input.issue)) return null;
|
||||
|
||||
const existing = await findOpenStrandedIssueRecoveryIssue(input.issue.companyId, input.issue.id);
|
||||
if (existing) return existing;
|
||||
|
||||
|
|
@ -1264,32 +1277,40 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
if (!ownerAgentId) return null;
|
||||
|
||||
const prefix = await getCompanyIssuePrefix(input.issue.companyId);
|
||||
const recovery = await issuesSvc.create(input.issue.companyId, {
|
||||
title: `Recover stalled issue ${input.issue.identifier ?? input.issue.title}`,
|
||||
description: buildStrandedIssueRecoveryDescription({
|
||||
issue: input.issue,
|
||||
latestRun: input.latestRun,
|
||||
previousStatus: input.previousStatus,
|
||||
prefix,
|
||||
}),
|
||||
status: "todo",
|
||||
priority: input.issue.priority,
|
||||
parentId: input.issue.id,
|
||||
projectId: input.issue.projectId,
|
||||
goalId: input.issue.goalId,
|
||||
assigneeAgentId: ownerAgentId,
|
||||
originKind: STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||||
originId: input.issue.id,
|
||||
originRunId: input.latestRun?.id ?? null,
|
||||
originFingerprint: [
|
||||
STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||||
input.issue.companyId,
|
||||
input.issue.id,
|
||||
input.latestRun?.id ?? "no-run",
|
||||
].join(":"),
|
||||
billingCode: input.issue.billingCode,
|
||||
inheritExecutionWorkspaceFromIssueId: input.issue.id,
|
||||
});
|
||||
let recovery: Awaited<ReturnType<typeof issuesSvc.create>>;
|
||||
try {
|
||||
recovery = await issuesSvc.create(input.issue.companyId, {
|
||||
title: `Recover stalled issue ${input.issue.identifier ?? input.issue.title}`,
|
||||
description: buildStrandedIssueRecoveryDescription({
|
||||
issue: input.issue,
|
||||
latestRun: input.latestRun,
|
||||
previousStatus: input.previousStatus,
|
||||
prefix,
|
||||
}),
|
||||
status: "todo",
|
||||
priority: input.issue.priority,
|
||||
parentId: input.issue.id,
|
||||
projectId: input.issue.projectId,
|
||||
goalId: input.issue.goalId,
|
||||
assigneeAgentId: ownerAgentId,
|
||||
originKind: STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||||
originId: input.issue.id,
|
||||
originRunId: input.latestRun?.id ?? null,
|
||||
originFingerprint: [
|
||||
STRANDED_ISSUE_RECOVERY_ORIGIN_KIND,
|
||||
input.issue.companyId,
|
||||
input.issue.id,
|
||||
input.latestRun?.id ?? "no-run",
|
||||
].join(":"),
|
||||
billingCode: input.issue.billingCode,
|
||||
inheritExecutionWorkspaceFromIssueId: input.issue.id,
|
||||
});
|
||||
} catch (error) {
|
||||
if (!isUniqueStrandedIssueRecoveryConflict(error)) throw error;
|
||||
const raced = await findOpenStrandedIssueRecoveryIssue(input.issue.companyId, input.issue.id);
|
||||
if (!raced) throw error;
|
||||
return raced;
|
||||
}
|
||||
|
||||
await deps.enqueueWakeup(ownerAgentId, {
|
||||
source: "assignment",
|
||||
|
|
@ -1315,6 +1336,78 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
return recovery;
|
||||
}
|
||||
|
||||
function buildRecoveryIssueInPlaceEscalationComment(input: {
|
||||
issue: typeof issues.$inferSelect;
|
||||
previousStatus: "todo" | "in_progress";
|
||||
latestRun: LatestIssueRun;
|
||||
prefix: string;
|
||||
}) {
|
||||
const runLink = input.latestRun
|
||||
? runUiLink({ id: input.latestRun.id, agentId: input.latestRun.agentId }, input.prefix)
|
||||
: "none";
|
||||
const retryReason = readNonEmptyString(parseObject(input.latestRun?.contextSnapshot)?.retryReason) ?? "none";
|
||||
const failureSummary = summarizeRunFailureForIssueComment(input.latestRun);
|
||||
|
||||
return [
|
||||
"Paperclip stopped automatic stranded-work recovery for this recovery issue.",
|
||||
"",
|
||||
`- Recovery issue: ${issueUiLink({ identifier: input.issue.identifier, id: input.issue.id }, input.prefix)}`,
|
||||
`- Previous status: \`${input.previousStatus}\``,
|
||||
`- Latest run: ${runLink}`,
|
||||
`- Latest run status: \`${input.latestRun?.status ?? "unknown"}\``,
|
||||
`- Retry reason: \`${retryReason}\``,
|
||||
failureSummary ? `- Failure: ${failureSummary.trim()}` : "- Failure: none recorded",
|
||||
"- Guard: recovery issues do not create nested `stranded_issue_recovery` issues.",
|
||||
"",
|
||||
"Next action: the current recovery owner should inspect the failed run evidence, restore a live execution path or record the manual resolution, then move this recovery issue out of `blocked`.",
|
||||
].join("\n");
|
||||
}
|
||||
|
||||
async function escalateStrandedRecoveryIssueInPlace(input: {
|
||||
issue: typeof issues.$inferSelect;
|
||||
previousStatus: "todo" | "in_progress";
|
||||
latestRun: LatestIssueRun;
|
||||
}) {
|
||||
const updated = await issuesSvc.update(input.issue.id, { status: "blocked" });
|
||||
if (!updated) return null;
|
||||
|
||||
const prefix = await getCompanyIssuePrefix(input.issue.companyId);
|
||||
await issuesSvc.addComment(
|
||||
input.issue.id,
|
||||
buildRecoveryIssueInPlaceEscalationComment({
|
||||
issue: input.issue,
|
||||
previousStatus: input.previousStatus,
|
||||
latestRun: input.latestRun,
|
||||
prefix,
|
||||
}),
|
||||
{},
|
||||
);
|
||||
|
||||
await logActivity(db, {
|
||||
companyId: input.issue.companyId,
|
||||
actorType: "system",
|
||||
actorId: "system",
|
||||
agentId: null,
|
||||
runId: null,
|
||||
action: "issue.updated",
|
||||
entityType: "issue",
|
||||
entityId: input.issue.id,
|
||||
details: {
|
||||
identifier: input.issue.identifier,
|
||||
status: "blocked",
|
||||
previousStatus: input.previousStatus,
|
||||
source: "recovery.reconcile_stranded_recovery_issue",
|
||||
latestRunId: input.latestRun?.id ?? null,
|
||||
latestRunStatus: input.latestRun?.status ?? null,
|
||||
latestRunErrorCode: input.latestRun?.errorCode ?? null,
|
||||
originKind: input.issue.originKind,
|
||||
originId: input.issue.originId,
|
||||
},
|
||||
});
|
||||
|
||||
return updated;
|
||||
}
|
||||
|
||||
async function existingBlockerIssueIds(companyId: string, issueId: string) {
|
||||
return db
|
||||
.select({ blockerIssueId: issueRelations.issueId })
|
||||
|
|
@ -1357,6 +1450,14 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
latestRun: LatestIssueRun;
|
||||
comment: string;
|
||||
}) {
|
||||
if (isStrandedIssueRecoveryIssue(input.issue)) {
|
||||
return escalateStrandedRecoveryIssueInPlace({
|
||||
issue: input.issue,
|
||||
previousStatus: input.previousStatus,
|
||||
latestRun: input.latestRun,
|
||||
});
|
||||
}
|
||||
|
||||
const recoveryIssue = await ensureStrandedIssueRecoveryIssue({
|
||||
issue: input.issue,
|
||||
previousStatus: input.previousStatus,
|
||||
|
|
@ -1457,6 +1558,21 @@ export function recoveryService(db: Db, deps: { enqueueWakeup: RecoveryWakeup })
|
|||
}
|
||||
|
||||
const latestRun = await getLatestIssueRun(issue.companyId, issue.id);
|
||||
if (isStrandedIssueRecoveryIssue(issue) && isUnsuccessfulTerminalIssueRun(latestRun)) {
|
||||
const updated = await escalateStrandedRecoveryIssueInPlace({
|
||||
issue,
|
||||
previousStatus: issue.status as "todo" | "in_progress",
|
||||
latestRun,
|
||||
});
|
||||
if (updated) {
|
||||
result.escalated += 1;
|
||||
result.issueIds.push(issue.id);
|
||||
} else {
|
||||
result.skipped += 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (issue.status === "todo") {
|
||||
if (!latestRun || latestRun.status === "succeeded") {
|
||||
result.skipped += 1;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue