[codex] Add run liveness continuations (#4083)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Heartbeat runs are the control-plane record of each agent execution
window.
> - Long-running local agents can exhaust context or stop while still
holding useful next-step state.
> - Operators need that stop reason, next action, and continuation path
to be durable and visible.
> - This pull request adds run liveness metadata, continuation
summaries, and UI surfaces for issue run ledgers.
> - The benefit is that interrupted or long-running work can resume with
clearer context instead of losing the agent's last useful handoff.

## What Changed

- Added heartbeat-run liveness fields, continuation attempt tracking,
and an idempotent `0058` migration.
- Added server services and tests for run liveness, continuation
summaries, stop metadata, and activity backfill.
- Wired local and HTTP adapters to surface continuation/liveness context
through shared adapter utilities.
- Added shared constants, validators, and heartbeat types for liveness
continuation state.
- Added issue-detail UI surfaces for continuation handoffs and the run
ledger, with component tests.
- Updated agent runtime docs, heartbeat protocol docs, prompt guidance,
onboarding assets, and skills instructions to explain continuation
behavior.
- Addressed Greptile feedback by scoping document evidence by run,
excluding system continuation-summary documents from liveness evidence,
importing shared liveness types, surfacing hidden ledger run counts,
documenting bounded retry behavior, and moving run-ledger liveness
backfill off the request path.

## Verification

- `pnpm exec vitest run packages/adapter-utils/src/server-utils.test.ts
server/src/__tests__/run-continuations.test.ts
server/src/__tests__/run-liveness.test.ts
server/src/__tests__/activity-service.test.ts
server/src/__tests__/documents-service.test.ts
server/src/__tests__/issue-continuation-summary.test.ts
server/src/services/heartbeat-stop-metadata.test.ts
ui/src/components/IssueRunLedger.test.tsx
ui/src/components/IssueContinuationHandoff.test.tsx
ui/src/components/IssueDocumentsSection.test.tsx`
- `pnpm --filter @paperclipai/db build`
- `pnpm exec vitest run server/src/__tests__/activity-service.test.ts
ui/src/components/IssueRunLedger.test.tsx`
- `pnpm --filter @paperclipai/ui typecheck`
- `pnpm --filter @paperclipai/server typecheck`
- `pnpm exec vitest run server/src/__tests__/activity-service.test.ts
server/src/__tests__/run-continuations.test.ts
ui/src/components/IssueRunLedger.test.tsx`
- `pnpm exec vitest run
server/src/__tests__/heartbeat-process-recovery.test.ts -t "treats a
plan document update"`
- `pnpm exec vitest run server/src/__tests__/activity-service.test.ts
server/src/__tests__/heartbeat-process-recovery.test.ts -t "activity
service|treats a plan document update"`
- Remote PR checks on head `e53b1a1d`: `verify`, `e2e`, `policy`, and
Snyk all passed.
- Confirmed `public-gh/master` is an ancestor of this branch after
fetching `public-gh master`.
- Confirmed `pnpm-lock.yaml` is not included in the branch diff.
- Confirmed migration `0058_wealthy_starbolt.sql` is ordered after
`0057` and uses `IF NOT EXISTS` guards for repeat application.
- Greptile inline review threads are resolved.

## Risks

- Medium risk: this touches heartbeat execution, liveness recovery,
activity rendering, issue routes, shared contracts, docs, and UI.
- Migration risk is mitigated by additive columns/indexes and idempotent
guards.
- Run-ledger liveness backfill is now asynchronous, so the first ledger
response can briefly show historical missing liveness until the
background backfill completes.
- UI screenshot coverage is not included in this packaging pass;
validation is currently through focused component tests.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and
discuss it in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected — check the roadmap
first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5.4, local tool-use coding agent with terminal, git,
GitHub connector, GitHub CLI, and Paperclip API access.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

Screenshot note: no before/after screenshots were captured in this PR
packaging pass; the UI changes are covered by focused component tests
listed above.

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Dotta 2026-04-20 06:01:49 -05:00 committed by GitHub
parent b9a80dcf22
commit 236d11d36f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
71 changed files with 18254 additions and 85 deletions

View file

@ -4,19 +4,25 @@ import { execFile as execFileCallback } from "node:child_process";
import { promisify } from "node:util";
import { and, asc, desc, eq, getTableColumns, gt, inArray, isNull, or, sql } from "drizzle-orm";
import type { Db } from "@paperclipai/db";
import type { BillingType, ExecutionWorkspace, ExecutionWorkspaceConfig } from "@paperclipai/shared";
import { ISSUE_CONTINUATION_SUMMARY_DOCUMENT_KEY } from "@paperclipai/shared";
import type { BillingType, ExecutionWorkspace, ExecutionWorkspaceConfig, RunLivenessState } from "@paperclipai/shared";
import {
agents,
agentRuntimeState,
agentTaskSessions,
agentWakeupRequests,
activityLog,
companySkills as companySkillsTable,
documentRevisions,
issueDocuments,
heartbeatRunEvents,
heartbeatRuns,
issueComments,
issues,
issueWorkProducts,
projects,
projectWorkspaces,
workspaceOperations,
} from "@paperclipai/db";
import { conflict, HttpError, notFound } from "../errors.js";
import { logger } from "../middleware/logger.js";
@ -40,6 +46,14 @@ import {
HEARTBEAT_RUN_SAFE_RESULT_JSON_MAX_BYTES,
mergeHeartbeatRunResultJson,
} from "./heartbeat-run-summary.js";
import {
buildHeartbeatRunStopMetadata,
mergeHeartbeatRunStopMetadata,
} from "./heartbeat-stop-metadata.js";
import {
classifyRunLiveness,
type RunLivenessClassificationInput,
} from "./run-liveness.js";
import { logActivity, type LogActivityInput } from "./activity-log.js";
import {
buildWorkspaceReadyComment,
@ -53,6 +67,10 @@ import {
sanitizeRuntimeServiceBaseEnv,
} from "./workspace-runtime.js";
import { issueService } from "./issues.js";
import {
getIssueContinuationSummaryDocument,
refreshIssueContinuationSummary,
} from "./issue-continuation-summary.js";
import { executionWorkspaceService, mergeExecutionWorkspaceConfig } from "./execution-workspaces.js";
import { workspaceOperationService } from "./workspace-operations.js";
import { isProcessGroupAlive, terminateLocalService } from "./local-service-supervisor.js";
@ -65,6 +83,13 @@ import {
resolveExecutionWorkspaceMode,
} from "./execution-workspace-policy.js";
import { instanceSettingsService } from "./instance-settings.js";
import {
RUN_LIVENESS_CONTINUATION_REASON,
buildRunLivenessContinuationIdempotencyKey,
decideRunLivenessContinuation,
findExistingRunLivenessContinuationWake,
readContinuationAttempt,
} from "./run-continuations.js";
import { redactCurrentUserText, redactCurrentUserValue } from "../log-redaction.js";
import {
hasSessionCompactionThresholds,
@ -397,6 +422,11 @@ const heartbeatRunListColumns = {
processStartedAt: heartbeatRuns.processStartedAt,
retryOfRunId: heartbeatRuns.retryOfRunId,
processLossRetryCount: heartbeatRuns.processLossRetryCount,
livenessState: heartbeatRuns.livenessState,
livenessReason: heartbeatRuns.livenessReason,
continuationAttempt: heartbeatRuns.continuationAttempt,
lastUsefulActionAt: heartbeatRuns.lastUsefulActionAt,
nextAction: heartbeatRuns.nextAction,
createdAt: heartbeatRuns.createdAt,
updatedAt: heartbeatRuns.updatedAt,
} as const;
@ -490,6 +520,11 @@ const heartbeatRunIssueSummaryColumns = {
finishedAt: heartbeatRuns.finishedAt,
createdAt: heartbeatRuns.createdAt,
agentId: heartbeatRuns.agentId,
livenessState: heartbeatRuns.livenessState,
livenessReason: heartbeatRuns.livenessReason,
continuationAttempt: heartbeatRuns.continuationAttempt,
lastUsefulActionAt: heartbeatRuns.lastUsefulActionAt,
nextAction: heartbeatRuns.nextAction,
issueId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'issueId'`.as("issueId"),
} as const;
@ -1204,6 +1239,14 @@ async function buildPaperclipWakePayload(input: {
db: Db;
companyId: string;
contextSnapshot: Record<string, unknown>;
continuationSummary?:
| {
key: string;
title: string | null;
body: string;
updatedAt: Date;
}
| null;
issueSummary?:
| {
id: string;
@ -1217,6 +1260,7 @@ async function buildPaperclipWakePayload(input: {
const executionStage = parseObject(input.contextSnapshot.executionStage);
const commentIds = extractWakeCommentIds(input.contextSnapshot);
const issueId = readNonEmptyString(input.contextSnapshot.issueId);
const continuationSummary = input.continuationSummary ?? null;
const issueSummary =
input.issueSummary ??
(issueId
@ -1309,8 +1353,37 @@ async function buildPaperclipWakePayload(input: {
priority: issueSummary.priority,
}
: null,
childIssueSummaries: Array.isArray(input.contextSnapshot.childIssueSummaries)
? input.contextSnapshot.childIssueSummaries
: [],
childIssueSummaryTruncated: input.contextSnapshot.childIssueSummaryTruncated === true,
livenessContinuation: readNonEmptyString(input.contextSnapshot.livenessContinuationState) ||
readNonEmptyString(input.contextSnapshot.livenessContinuationInstruction) ||
readNonEmptyString(input.contextSnapshot.livenessContinuationSourceRunId) ||
typeof input.contextSnapshot.livenessContinuationAttempt === "number"
? {
attempt: input.contextSnapshot.livenessContinuationAttempt,
maxAttempts: input.contextSnapshot.livenessContinuationMaxAttempts,
sourceRunId: readNonEmptyString(input.contextSnapshot.livenessContinuationSourceRunId),
state: readNonEmptyString(input.contextSnapshot.livenessContinuationState),
reason: readNonEmptyString(input.contextSnapshot.livenessContinuationReason),
instruction: readNonEmptyString(input.contextSnapshot.livenessContinuationInstruction),
}
: null,
checkedOutByHarness: input.contextSnapshot[PAPERCLIP_HARNESS_CHECKOUT_KEY] === true,
executionStage: Object.keys(executionStage).length > 0 ? executionStage : null,
continuationSummary: continuationSummary
? {
key: continuationSummary.key,
title: continuationSummary.title,
body:
continuationSummary.body.length > 4_000
? continuationSummary.body.slice(0, 4_000)
: continuationSummary.body,
bodyTruncated: continuationSummary.body.length > 4_000,
updatedAt: continuationSummary.updatedAt.toISOString(),
}
: null,
commentIds,
latestCommentId: commentIds[commentIds.length - 1] ?? null,
comments,
@ -1643,6 +1716,7 @@ export function heartbeatService(db: Db) {
agent: typeof agents.$inferSelect;
sessionId: string | null;
issueId: string | null;
continuationSummaryBody?: string | null;
}): Promise<SessionCompactionDecision> {
const { agent, sessionId, issueId } = input;
if (!sessionId) {
@ -1746,6 +1820,9 @@ export function heartbeatService(db: Db) {
issueId ? `- Issue: ${issueId}` : "",
`- Rotation reason: ${reason}`,
latestTextSummary ? `- Last run summary: ${latestTextSummary}` : "",
input.continuationSummaryBody
? `- Issue continuation summary: ${input.continuationSummaryBody.slice(0, 1_500)}`
: "",
"Continue from the current task state. Rebuild only the minimum context you need.",
]
.filter(Boolean)
@ -2170,6 +2247,136 @@ export function heartbeatService(db: Db) {
.where(eq(agentWakeupRequests.id, wakeupRequestId));
}
async function addContinuationExhaustedCommentOnce(input: {
run: typeof heartbeatRuns.$inferSelect;
issueId: string;
comment: string;
}) {
const existing = await db
.select({ id: issueComments.id })
.from(issueComments)
.where(
and(
eq(issueComments.companyId, input.run.companyId),
eq(issueComments.issueId, input.issueId),
eq(issueComments.createdByRunId, input.run.id),
sql`${issueComments.body} like 'Bounded liveness continuation exhausted%'`,
),
)
.limit(1)
.then((rows) => rows[0] ?? null);
if (existing) return;
await issuesSvc.addComment(input.issueId, input.comment, {
agentId: input.run.agentId,
runId: input.run.id,
});
}
async function handleRunLivenessContinuation(run: typeof heartbeatRuns.$inferSelect) {
const livenessState = run.livenessState as RunLivenessState | null;
if (livenessState !== "plan_only" && livenessState !== "empty_response") return;
const context = parseObject(run.contextSnapshot);
const issueId = readNonEmptyString(context.issueId);
if (!issueId) return;
const [issue, agent] = await Promise.all([
db
.select({
id: issues.id,
companyId: issues.companyId,
identifier: issues.identifier,
title: issues.title,
status: issues.status,
assigneeAgentId: issues.assigneeAgentId,
executionState: issues.executionState,
projectId: issues.projectId,
})
.from(issues)
.where(and(eq(issues.id, issueId), eq(issues.companyId, run.companyId)))
.then((rows) => rows[0] ?? null),
db
.select({
id: agents.id,
companyId: agents.companyId,
status: agents.status,
})
.from(agents)
.where(eq(agents.id, run.agentId))
.then((rows) => rows[0] ?? null),
]);
const budgetBlock =
issue && agent
? await budgets.getInvocationBlock(issue.companyId, agent.id, {
issueId: issue.id,
projectId: issue.projectId,
})
: null;
const nextAttempt = readContinuationAttempt(run.continuationAttempt) + 1;
const idempotencyKey = issue
? buildRunLivenessContinuationIdempotencyKey({
issueId: issue.id,
sourceRunId: run.id,
livenessState,
nextAttempt,
})
: null;
const existingWake = idempotencyKey
? await findExistingRunLivenessContinuationWake(db, {
companyId: run.companyId,
idempotencyKey,
})
: null;
const decision = decideRunLivenessContinuation({
run,
issue,
agent,
livenessState,
livenessReason: run.livenessReason,
nextAction: run.nextAction,
budgetBlocked: Boolean(budgetBlock),
idempotentWakeExists: Boolean(existingWake),
});
if (decision.kind === "exhausted") {
await setRunStatus(run.id, run.status, {
livenessReason: `${run.livenessReason ?? "Run ended without concrete progress"}; continuation attempts exhausted`,
});
await addContinuationExhaustedCommentOnce({
run,
issueId,
comment: decision.comment,
});
return;
}
if (decision.kind !== "enqueue") return;
const continuationRun = await enqueueWakeup(run.agentId, {
source: "automation",
triggerDetail: "system",
reason: RUN_LIVENESS_CONTINUATION_REASON,
payload: decision.payload,
contextSnapshot: decision.contextSnapshot,
idempotencyKey: decision.idempotencyKey,
requestedByActorType: "system",
requestedByActorId: "heartbeat",
});
if (continuationRun) {
await db
.update(heartbeatRuns)
.set({
continuationAttempt: decision.nextAttempt,
updatedAt: new Date(),
})
.where(eq(heartbeatRuns.id, continuationRun.id));
}
}
async function appendRunEvent(
run: typeof heartbeatRuns.$inferSelect,
seq: number,
@ -2298,6 +2505,47 @@ export function heartbeatService(db: Db) {
.then((rows) => rows[0] ?? null);
}
async function refreshContinuationSummaryForRun(
run: typeof heartbeatRuns.$inferSelect,
agent: typeof agents.$inferSelect,
) {
const contextSnapshot = parseObject(run.contextSnapshot);
const issueId = readNonEmptyString(contextSnapshot.issueId);
if (!issueId) return null;
try {
return await refreshIssueContinuationSummary({
db,
issueId,
run: {
id: run.id,
status: run.status,
error: run.error,
errorCode: run.errorCode,
resultJson: run.resultJson as Record<string, unknown> | null,
stdoutExcerpt: run.stdoutExcerpt,
stderrExcerpt: run.stderrExcerpt,
finishedAt: run.finishedAt,
},
agent: {
id: agent.id,
name: agent.name,
adapterType: agent.adapterType,
},
});
} catch (err) {
logger.warn(
{
err,
runId: run.id,
issueId,
agentId: agent.id,
},
"failed to refresh issue continuation summary",
);
return null;
}
}
async function enqueueMissingIssueCommentRetry(
run: typeof heartbeatRuns.$inferSelect,
agent: typeof agents.$inferSelect,
@ -2737,6 +2985,194 @@ export function heartbeatService(db: Db) {
}
}
function mergeRunStopMetadataForAgent(
agent: Pick<typeof agents.$inferSelect, "adapterType" | "adapterConfig">,
outcome: "succeeded" | "failed" | "cancelled" | "timed_out",
options?: {
resultJson?: Record<string, unknown> | null;
errorCode?: string | null;
errorMessage?: string | null;
},
) {
const stopMetadata = buildHeartbeatRunStopMetadata({
adapterType: agent.adapterType,
adapterConfig: parseObject(agent.adapterConfig),
outcome,
errorCode: options?.errorCode ?? null,
errorMessage: options?.errorMessage ?? null,
});
return mergeHeartbeatRunStopMetadata(options?.resultJson ?? null, stopMetadata);
}
function countValue(value: unknown) {
const parsed = Number(value ?? 0);
return Number.isFinite(parsed) ? Math.max(0, Math.floor(parsed)) : 0;
}
function dateValue(value: unknown) {
if (value instanceof Date) return Number.isNaN(value.getTime()) ? null : value;
if (typeof value === "string" || typeof value === "number") {
const parsed = new Date(value);
return Number.isNaN(parsed.getTime()) ? null : parsed;
}
return null;
}
function latestDate(...values: unknown[]) {
let latest: Date | null = null;
for (const value of values) {
const parsed = dateValue(value);
if (!parsed) continue;
if (!latest || parsed.getTime() > latest.getTime()) latest = parsed;
}
return latest;
}
async function buildRunLivenessInput(
run: typeof heartbeatRuns.$inferSelect,
resultJson: Record<string, unknown> | null | undefined,
): Promise<RunLivenessClassificationInput> {
const context = parseObject(run.contextSnapshot);
const contextIssueId = readNonEmptyString(context.issueId);
const continuationAttempt = asNumber(context.continuationAttempt, run.continuationAttempt ?? 0);
const issue = contextIssueId
? await db
.select({
status: issues.status,
title: issues.title,
description: issues.description,
})
.from(issues)
.where(and(eq(issues.companyId, run.companyId), eq(issues.id, contextIssueId)))
.then((rows) => rows[0] ?? null)
: null;
const [commentStats] = contextIssueId
? await db
.select({
count: sql<number>`count(*)::int`,
latestAt: sql<Date | null>`max(${issueComments.createdAt})`,
})
.from(issueComments)
.where(
and(
eq(issueComments.companyId, run.companyId),
eq(issueComments.issueId, contextIssueId),
eq(issueComments.createdByRunId, run.id),
),
)
: [{ count: 0, latestAt: null }];
const [documentStats] = contextIssueId
? await db
.select({
count: sql<number>`count(*)::int`,
planCount: sql<number>`count(*) filter (where ${issueDocuments.key} = 'plan')::int`,
latestAt: sql<Date | null>`max(${documentRevisions.createdAt})`,
})
.from(documentRevisions)
.innerJoin(issueDocuments, eq(documentRevisions.documentId, issueDocuments.documentId))
.where(
and(
eq(documentRevisions.companyId, run.companyId),
eq(documentRevisions.createdByRunId, run.id),
eq(issueDocuments.companyId, run.companyId),
eq(issueDocuments.issueId, contextIssueId),
sql`${issueDocuments.key} != ${ISSUE_CONTINUATION_SUMMARY_DOCUMENT_KEY}`,
),
)
: [{ count: 0, planCount: 0, latestAt: null }];
const [workProductStats] = contextIssueId
? await db
.select({
count: sql<number>`count(*)::int`,
latestAt: sql<Date | null>`max(${issueWorkProducts.createdAt})`,
})
.from(issueWorkProducts)
.where(
and(
eq(issueWorkProducts.companyId, run.companyId),
eq(issueWorkProducts.issueId, contextIssueId),
eq(issueWorkProducts.createdByRunId, run.id),
),
)
: [{ count: 0, latestAt: null }];
const [workspaceOperationStats] = await db
.select({
count: sql<number>`count(*)::int`,
latestAt: sql<Date | null>`max(${workspaceOperations.startedAt})`,
})
.from(workspaceOperations)
.where(and(eq(workspaceOperations.companyId, run.companyId), eq(workspaceOperations.heartbeatRunId, run.id)));
const [activityStats] = await db
.select({
count: sql<number>`count(*)::int`,
latestAt: sql<Date | null>`max(${activityLog.createdAt})`,
})
.from(activityLog)
.where(and(eq(activityLog.companyId, run.companyId), eq(activityLog.runId, run.id)));
const [eventStats] = await db
.select({
count: sql<number>`count(*) filter (where ${heartbeatRunEvents.eventType} not in ('lifecycle', 'adapter.invoke', 'error'))::int`,
latestAt: sql<Date | null>`max(${heartbeatRunEvents.createdAt}) filter (where ${heartbeatRunEvents.eventType} not in ('lifecycle', 'adapter.invoke', 'error'))`,
})
.from(heartbeatRunEvents)
.where(and(eq(heartbeatRunEvents.companyId, run.companyId), eq(heartbeatRunEvents.runId, run.id)));
return {
runStatus: run.status,
issue,
resultJson: resultJson ?? run.resultJson ?? null,
stdoutExcerpt: run.stdoutExcerpt ?? null,
stderrExcerpt: run.stderrExcerpt ?? null,
error: run.error ?? null,
errorCode: run.errorCode ?? null,
continuationAttempt,
evidence: {
issueCommentsCreated: countValue(commentStats?.count),
documentRevisionsCreated: countValue(documentStats?.count),
planDocumentRevisionsCreated: countValue(documentStats?.planCount),
workProductsCreated: countValue(workProductStats?.count),
workspaceOperationsCreated: countValue(workspaceOperationStats?.count),
activityEventsCreated: countValue(activityStats?.count),
toolOrActionEventsCreated: countValue(eventStats?.count),
latestEvidenceAt: latestDate(
commentStats?.latestAt,
documentStats?.latestAt,
workProductStats?.latestAt,
workspaceOperationStats?.latestAt,
activityStats?.latestAt,
eventStats?.latestAt,
),
},
};
}
async function classifyAndPersistRunLiveness(
run: typeof heartbeatRuns.$inferSelect,
resultJson?: Record<string, unknown> | null,
) {
const classification = classifyRunLiveness(await buildRunLivenessInput(run, resultJson));
return db
.update(heartbeatRuns)
.set({
livenessState: classification.livenessState,
livenessReason: classification.livenessReason,
continuationAttempt: classification.continuationAttempt,
lastUsefulActionAt: classification.lastUsefulActionAt,
nextAction: classification.nextAction,
updatedAt: new Date(),
})
.where(eq(heartbeatRuns.id, run.id))
.returning()
.then((rows) => rows[0] ?? null);
}
async function reapOrphanedRuns(opts?: { staleThresholdMs?: number }) {
const staleThresholdMs = opts?.staleThresholdMs ?? 0;
const now = new Date();
@ -2746,6 +3182,7 @@ export function heartbeatService(db: Db) {
.select({
run: heartbeatRuns,
adapterType: agents.adapterType,
adapterConfig: agents.adapterConfig,
})
.from(heartbeatRuns)
.innerJoin(agents, eq(heartbeatRuns.agentId, agents.id))
@ -2753,7 +3190,7 @@ export function heartbeatService(db: Db) {
const reaped: string[] = [];
for (const { run, adapterType } of activeRuns) {
for (const { run, adapterType, adapterConfig } of activeRuns) {
if (runningProcesses.has(run.id) || activeRunExecutions.has(run.id)) continue;
// Apply staleness threshold to avoid false positives
@ -2803,6 +3240,15 @@ export function heartbeatService(db: Db) {
error: shouldRetry ? `${baseMessage}; retrying once` : baseMessage,
errorCode: "process_lost",
finishedAt: now,
resultJson: mergeRunStopMetadataForAgent(
{ adapterType, adapterConfig },
"failed",
{
resultJson: parseObject(run.resultJson),
errorCode: "process_lost",
errorMessage: shouldRetry ? `${baseMessage}; retrying once` : baseMessage,
},
),
});
await setWakeupStatus(run.wakeupRequestId, "failed", {
finishedAt: now,
@ -2810,6 +3256,7 @@ export function heartbeatService(db: Db) {
});
if (!finalizedRun) finalizedRun = await getRun(run.id);
if (!finalizedRun) continue;
finalizedRun = await classifyAndPersistRunLiveness(finalizedRun, parseObject(finalizedRun.resultJson)) ?? finalizedRun;
let retriedRun: typeof heartbeatRuns.$inferSelect | null = null;
if (shouldRetry) {
@ -3340,10 +3787,24 @@ export function heartbeatService(db: Db) {
executionWorkspacePreference: issueContext.executionWorkspacePreference,
}
: null;
const continuationSummary = issueRef
? await getIssueContinuationSummaryDocument(db, issueRef.id)
: null;
if (continuationSummary) {
context.paperclipContinuationSummary = {
key: continuationSummary.key,
title: continuationSummary.title,
body: continuationSummary.body,
updatedAt: continuationSummary.updatedAt.toISOString(),
};
} else {
delete context.paperclipContinuationSummary;
}
const paperclipWakePayload = await buildPaperclipWakePayload({
db,
companyId: agent.companyId,
contextSnapshot: context,
continuationSummary,
issueSummary: issueRef
? {
id: issueRef.id,
@ -3656,6 +4117,7 @@ export function heartbeatService(db: Db) {
agent,
sessionId: previousSessionDisplayId ?? runtimeSessionIdForAdapter,
issueId,
continuationSummaryBody: continuationSummary?.body ?? null,
});
if (sessionCompaction.rotate) {
context.paperclipSessionHandoffMarkdown = sessionCompaction.handoffMarkdown;
@ -3962,6 +4424,23 @@ export function heartbeatService(db: Db) {
} else {
outcome = "failed";
}
const runErrorMessage =
outcome === "cancelled"
? (latestRun?.error ?? adapterResult.errorMessage ?? "Cancelled")
: outcome === "succeeded"
? null
: redactCurrentUserText(
adapterResult.errorMessage ?? (outcome === "timed_out" ? "Timed out" : "Adapter failed"),
currentUserRedactionOptions,
);
const runErrorCode =
outcome === "timed_out"
? "timeout"
: outcome === "cancelled"
? (latestRun?.errorCode ?? "cancelled")
: outcome === "failed"
? (adapterResult.errorCode ?? "adapter_failed")
: null;
let logSummary: { bytes: number; sha256?: string; compressed: boolean } | null = null;
if (handle) {
@ -4004,27 +4483,18 @@ export function heartbeatService(db: Db) {
: null;
const persistedResultJson = mergeHeartbeatRunResultJson(
adapterResult.resultJson ?? null,
mergeRunStopMetadataForAgent(agent, outcome, {
resultJson: adapterResult.resultJson ?? null,
errorCode: runErrorCode,
errorMessage: runErrorMessage,
}),
adapterResult.summary ?? null,
);
await setRunStatus(run.id, status, {
let persistedRun = await setRunStatus(run.id, status, {
finishedAt: new Date(),
error:
outcome === "succeeded"
? null
: redactCurrentUserText(
adapterResult.errorMessage ?? (outcome === "timed_out" ? "Timed out" : "Adapter failed"),
currentUserRedactionOptions,
),
errorCode:
outcome === "timed_out"
? "timeout"
: outcome === "cancelled"
? "cancelled"
: outcome === "failed"
? (adapterResult.errorCode ?? "adapter_failed")
: null,
error: runErrorMessage,
errorCode: runErrorCode,
exitCode: adapterResult.exitCode,
signal: adapterResult.signal,
usageJson,
@ -4036,13 +4506,16 @@ export function heartbeatService(db: Db) {
logSha256: logSummary?.sha256,
logCompressed: logSummary?.compressed ?? false,
});
if (persistedRun) {
persistedRun = await classifyAndPersistRunLiveness(persistedRun, persistedResultJson) ?? persistedRun;
}
await setWakeupStatus(run.wakeupRequestId, outcome === "succeeded" ? "completed" : status, {
finishedAt: new Date(),
error: adapterResult.errorMessage ?? null,
error: runErrorMessage,
});
const finalizedRun = await getRun(run.id);
const finalizedRun = persistedRun ?? (await getRun(run.id));
if (finalizedRun) {
await appendRunEvent(finalizedRun, seq++, {
eventType: "lifecycle",
@ -4054,13 +4527,15 @@ export function heartbeatService(db: Db) {
exitCode: adapterResult.exitCode,
},
});
const livenessRun = finalizedRun;
await refreshContinuationSummaryForRun(livenessRun, agent);
if (issueId && outcome === "succeeded") {
try {
const existingRunComment = await findRunIssueComment(finalizedRun.id, finalizedRun.companyId, issueId);
const existingRunComment = await findRunIssueComment(livenessRun.id, livenessRun.companyId, issueId);
if (!existingRunComment) {
const issueComment = buildHeartbeatRunIssueComment(persistedResultJson);
if (issueComment) {
await issuesSvc.addComment(issueId, issueComment, { agentId: agent.id, runId: finalizedRun.id });
await issuesSvc.addComment(issueId, issueComment, { agentId: agent.id, runId: livenessRun.id });
}
}
} catch (err) {
@ -4070,8 +4545,9 @@ export function heartbeatService(db: Db) {
);
}
}
await finalizeIssueCommentPolicy(finalizedRun, agent);
await releaseIssueExecutionAndPromote(finalizedRun);
await finalizeIssueCommentPolicy(livenessRun, agent);
await releaseIssueExecutionAndPromote(livenessRun);
await handleRunLivenessContinuation(livenessRun);
}
if (finalizedRun) {
@ -4119,6 +4595,10 @@ export function heartbeatService(db: Db) {
error: message,
errorCode: "adapter_failed",
finishedAt: new Date(),
resultJson: mergeRunStopMetadataForAgent(agent, "failed", {
errorCode: "adapter_failed",
errorMessage: message,
}),
stdoutExcerpt,
stderrExcerpt,
logBytes: logSummary?.bytes,
@ -4137,10 +4617,12 @@ export function heartbeatService(db: Db) {
level: "error",
message,
});
await finalizeIssueCommentPolicy(failedRun, agent);
await releaseIssueExecutionAndPromote(failedRun);
const livenessRun = await classifyAndPersistRunLiveness(failedRun) ?? failedRun;
await refreshContinuationSummaryForRun(livenessRun, agent);
await finalizeIssueCommentPolicy(livenessRun, agent);
await releaseIssueExecutionAndPromote(livenessRun);
await updateRuntimeState(agent, failedRun, {
await updateRuntimeState(agent, livenessRun, {
exitCode: null,
signal: null,
timedOut: false,
@ -4170,10 +4652,17 @@ export function heartbeatService(db: Db) {
// The inner catch did not fire, so we must record the failure here.
const message = outerErr instanceof Error ? outerErr.message : "Unknown setup failure";
logger.error({ err: outerErr, runId }, "heartbeat execution setup failed");
const setupFailureAgent = await getAgent(run.agentId).catch(() => null);
await setRunStatus(runId, "failed", {
error: message,
errorCode: "adapter_failed",
finishedAt: new Date(),
...(setupFailureAgent ? {
resultJson: mergeRunStopMetadataForAgent(setupFailureAgent, "failed", {
errorCode: "adapter_failed",
errorMessage: message,
}),
} : {}),
}).catch(() => undefined);
await setWakeupStatus(run.wakeupRequestId, "failed", {
finishedAt: new Date(),
@ -4189,11 +4678,13 @@ export function heartbeatService(db: Db) {
level: "error",
message,
}).catch(() => undefined);
const failedAgent = await getAgent(run.agentId).catch(() => null);
const livenessRun = await classifyAndPersistRunLiveness(failedRun).catch(() => failedRun);
const failedAgent = setupFailureAgent ?? await getAgent(run.agentId).catch(() => null);
if (failedAgent) {
await finalizeIssueCommentPolicy(failedRun, failedAgent).catch(() => undefined);
await refreshContinuationSummaryForRun(livenessRun, failedAgent).catch(() => undefined);
await finalizeIssueCommentPolicy(livenessRun, failedAgent).catch(() => undefined);
}
await releaseIssueExecutionAndPromote(failedRun).catch(() => undefined);
await releaseIssueExecutionAndPromote(livenessRun).catch(() => undefined);
}
// Ensure the agent is not left stuck in "running" if the inner catch handler's
// DB calls threw (e.g. a transient DB error in finalizeAgentStatus).
@ -4363,6 +4854,9 @@ export function heartbeatService(db: Db) {
const sessionBefore =
readNonEmptyString(promotedContextSnapshot.resumeSessionDisplayId) ??
await resolveSessionBeforeForWakeup(deferredAgent, promotedTaskKey);
const promotedContinuationAttempt = readContinuationAttempt(
promotedContextSnapshot.livenessContinuationAttempt,
);
const now = new Date();
const newRun = await tx
.insert(heartbeatRuns)
@ -4375,6 +4869,7 @@ export function heartbeatService(db: Db) {
wakeupRequestId: deferred.id,
contextSnapshot: promotedContextSnapshot,
sessionIdBefore: sessionBefore,
continuationAttempt: promotedContinuationAttempt,
})
.returning()
.then((rows) => rows[0]);
@ -4473,6 +4968,7 @@ export function heartbeatService(db: Db) {
const sessionBefore =
explicitResumeSession?.sessionDisplayId ??
await resolveSessionBeforeForWakeup(agent, effectiveTaskKey);
const continuationAttempt = readContinuationAttempt(enrichedContextSnapshot.livenessContinuationAttempt);
const writeSkippedRequest = async (skipReason: string) => {
await db.insert(agentWakeupRequests).values({
@ -4771,6 +5267,7 @@ export function heartbeatService(db: Db) {
wakeupRequestId: wakeupRequest.id,
contextSnapshot: enrichedContextSnapshot,
sessionIdBefore: sessionBefore,
continuationAttempt,
})
.returning()
.then((rows) => rows[0]);
@ -4890,6 +5387,7 @@ export function heartbeatService(db: Db) {
wakeupRequestId: wakeupRequest.id,
contextSnapshot: enrichedContextSnapshot,
sessionIdBefore: sessionBefore,
continuationAttempt,
})
.returning()
.then((rows) => rows[0]);
@ -5022,6 +5520,7 @@ export function heartbeatService(db: Db) {
const run = await getRun(runId);
if (!run) throw notFound("Heartbeat run not found");
if (run.status !== "running" && run.status !== "queued") return run;
const agent = await getAgent(run.agentId);
const running = runningProcesses.get(run.id);
if (running) {
@ -5041,6 +5540,13 @@ export function heartbeatService(db: Db) {
finishedAt: new Date(),
error: reason,
errorCode: "cancelled",
...(agent ? {
resultJson: mergeRunStopMetadataForAgent(agent, "cancelled", {
resultJson: parseObject(run.resultJson),
errorCode: "cancelled",
errorMessage: reason,
}),
} : {}),
});
await setWakeupStatus(run.wakeupRequestId, "cancelled", {
@ -5065,6 +5571,7 @@ export function heartbeatService(db: Db) {
}
async function cancelActiveForAgentInternal(agentId: string, reason = "Cancelled due to agent pause") {
const agent = await getAgent(agentId);
const runs = await db
.select()
.from(heartbeatRuns)
@ -5075,6 +5582,13 @@ export function heartbeatService(db: Db) {
finishedAt: new Date(),
error: reason,
errorCode: "cancelled",
...(agent ? {
resultJson: mergeRunStopMetadataForAgent(agent, "cancelled", {
resultJson: parseObject(run.resultJson),
errorCode: "cancelled",
errorMessage: reason,
}),
} : {}),
});
await setWakeupStatus(run.wakeupRequestId, "cancelled", {