[codex] harden heartbeat run summaries and recovery context (#3742)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies
> - Heartbeat runs are the control-plane record of what agents did, why
they woke up, and what operators should see next
> - Run lists, stranded issue comments, and live log polling all depend
on compact but accurate heartbeat summaries
> - The current branch had a focused backend slice that improves how run
result JSON is summarized, how stale process recovery comments are
written, and how live log polling resolves the active run
> - This pull request isolates that heartbeat/runtime reliability work
from the unrelated UI and dev-tooling changes
> - The benefit is more reliable issue context and cheaper run lookups
without dragging unrelated board UI changes into the same review

## What Changed

- Include the latest run failure in stranded issue comments during
orphaned process recovery.
- Bound heartbeat `result_json` payloads for list responses while
preserving the raw stored payloads.
- Narrow heartbeat log endpoint lookups so issue polling resolves the
relevant active run with less unnecessary scanning.
- Add focused tests for heartbeat list summaries, live run polling,
orphaned process recovery, and the run context/result summary helpers.

## Verification

- `pnpm vitest run
server/src/__tests__/heartbeat-context-summary.test.ts
server/src/__tests__/heartbeat-list.test.ts
server/src/__tests__/agent-live-run-routes.test.ts
server/src/__tests__/heartbeat-process-recovery.test.ts`

## Risks

- The main risk is accidentally hiding a field that some client still
expects from summarized `result_json`, or over-constraining the live log
lookup path for edge-case run routing.
- Recovery comments now surface the latest failure more aggressively, so
wording changes may affect downstream expectations if anyone parses
those comments too strictly.

## Model Used

- OpenAI Codex, GPT-5-based coding agent in the Codex CLI environment.
Exact backend model deployment ID was not exposed in-session.
Tool-assisted editing and shell execution were used.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge
This commit is contained in:
Dotta 2026-04-15 09:48:39 -05:00 committed by GitHub
parent c1a02497b0
commit 3fa5d25de1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 498 additions and 24 deletions

View file

@ -1,4 +1,8 @@
function truncateSummaryText(value: unknown, maxLength = 500) {
export const HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS = 500;
export const HEARTBEAT_RUN_RESULT_OUTPUT_MAX_CHARS = 4_096;
export const HEARTBEAT_RUN_SAFE_RESULT_JSON_MAX_BYTES = 64 * 1024;
function truncateSummaryText(value: unknown, maxLength = HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS) {
if (typeof value !== "string") return null;
return value.length > maxLength ? value.slice(0, maxLength) : value;
}

View file

@ -2,7 +2,7 @@ import fs from "node:fs/promises";
import path from "node:path";
import { execFile as execFileCallback } from "node:child_process";
import { promisify } from "node:util";
import { and, asc, desc, eq, gt, inArray, isNull, or, sql } from "drizzle-orm";
import { and, asc, desc, eq, getTableColumns, gt, inArray, isNull, or, sql } from "drizzle-orm";
import type { Db } from "@paperclipai/db";
import type { BillingType, ExecutionWorkspace, ExecutionWorkspaceConfig } from "@paperclipai/shared";
import {
@ -35,8 +35,10 @@ import { secretService } from "./secrets.js";
import { resolveDefaultAgentWorkspaceDir, resolveManagedProjectWorkspaceDir } from "../home-paths.js";
import {
buildHeartbeatRunIssueComment,
HEARTBEAT_RUN_RESULT_OUTPUT_MAX_CHARS,
HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS,
HEARTBEAT_RUN_SAFE_RESULT_JSON_MAX_BYTES,
mergeHeartbeatRunResultJson,
summarizeHeartbeatRunResultJson,
} from "./heartbeat-run-summary.js";
import { logActivity, type LogActivityInput } from "./activity-log.js";
import {
@ -378,7 +380,6 @@ const heartbeatRunListColumns = {
exitCode: heartbeatRuns.exitCode,
signal: heartbeatRuns.signal,
usageJson: heartbeatRuns.usageJson,
resultJson: heartbeatRuns.resultJson,
sessionIdBefore: heartbeatRuns.sessionIdBefore,
sessionIdAfter: heartbeatRuns.sessionIdAfter,
logStore: heartbeatRuns.logStore,
@ -395,11 +396,90 @@ const heartbeatRunListColumns = {
processStartedAt: heartbeatRuns.processStartedAt,
retryOfRunId: heartbeatRuns.retryOfRunId,
processLossRetryCount: heartbeatRuns.processLossRetryCount,
contextSnapshot: heartbeatRuns.contextSnapshot,
createdAt: heartbeatRuns.createdAt,
updatedAt: heartbeatRuns.updatedAt,
} as const;
const heartbeatRunListContextColumns = {
contextIssueId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'issueId'`.as("contextIssueId"),
contextTaskId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'taskId'`.as("contextTaskId"),
contextTaskKey: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'taskKey'`.as("contextTaskKey"),
contextCommentId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'commentId'`.as("contextCommentId"),
contextWakeCommentId: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'wakeCommentId'`.as("contextWakeCommentId"),
contextWakeReason: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'wakeReason'`.as("contextWakeReason"),
contextWakeSource: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'wakeSource'`.as("contextWakeSource"),
contextWakeTriggerDetail: sql<string | null>`${heartbeatRuns.contextSnapshot} ->> 'wakeTriggerDetail'`.as("contextWakeTriggerDetail"),
} as const;
const heartbeatRunListResultColumns = {
resultSummary: sql<string | null>`left(${heartbeatRuns.resultJson} ->> 'summary', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS})`.as("resultSummary"),
resultResult: sql<string | null>`left(${heartbeatRuns.resultJson} ->> 'result', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS})`.as("resultResult"),
resultMessage: sql<string | null>`left(${heartbeatRuns.resultJson} ->> 'message', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS})`.as("resultMessage"),
resultError: sql<string | null>`left(${heartbeatRuns.resultJson} ->> 'error', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS})`.as("resultError"),
resultTotalCostUsd: sql<string | null>`${heartbeatRuns.resultJson} ->> 'total_cost_usd'`.as("resultTotalCostUsd"),
resultCostUsd: sql<string | null>`${heartbeatRuns.resultJson} ->> 'cost_usd'`.as("resultCostUsd"),
resultCostUsdCamel: sql<string | null>`${heartbeatRuns.resultJson} ->> 'costUsd'`.as("resultCostUsdCamel"),
} as const;
const heartbeatRunSafeResultJsonColumn = sql<Record<string, unknown> | null>`
case
when ${heartbeatRuns.resultJson} is null then null
when pg_column_size(${heartbeatRuns.resultJson}) <= ${HEARTBEAT_RUN_SAFE_RESULT_JSON_MAX_BYTES}
then ${heartbeatRuns.resultJson}
else jsonb_strip_nulls(
jsonb_build_object(
'summary', left(${heartbeatRuns.resultJson} ->> 'summary', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS}),
'result', left(${heartbeatRuns.resultJson} ->> 'result', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS}),
'message', left(${heartbeatRuns.resultJson} ->> 'message', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS}),
'error', left(${heartbeatRuns.resultJson} ->> 'error', ${HEARTBEAT_RUN_RESULT_SUMMARY_MAX_CHARS}),
'stdout', left(${heartbeatRuns.resultJson} ->> 'stdout', ${HEARTBEAT_RUN_RESULT_OUTPUT_MAX_CHARS}),
'stderr', left(${heartbeatRuns.resultJson} ->> 'stderr', ${HEARTBEAT_RUN_RESULT_OUTPUT_MAX_CHARS}),
'stdoutTruncated', case
when length(${heartbeatRuns.resultJson} ->> 'stdout') > ${HEARTBEAT_RUN_RESULT_OUTPUT_MAX_CHARS}
then to_jsonb(true)
else null
end,
'stderrTruncated', case
when length(${heartbeatRuns.resultJson} ->> 'stderr') > ${HEARTBEAT_RUN_RESULT_OUTPUT_MAX_CHARS}
then to_jsonb(true)
else null
end,
'costUsd', coalesce(
${heartbeatRuns.resultJson} -> 'costUsd',
${heartbeatRuns.resultJson} -> 'cost_usd',
${heartbeatRuns.resultJson} -> 'total_cost_usd'
),
'cost_usd', coalesce(
${heartbeatRuns.resultJson} -> 'cost_usd',
${heartbeatRuns.resultJson} -> 'costUsd',
${heartbeatRuns.resultJson} -> 'total_cost_usd'
),
'total_cost_usd', coalesce(
${heartbeatRuns.resultJson} -> 'total_cost_usd',
${heartbeatRuns.resultJson} -> 'cost_usd',
${heartbeatRuns.resultJson} -> 'costUsd'
),
'truncated', true,
'truncationReason', 'oversized_result_json',
'originalSizeBytes', pg_column_size(${heartbeatRuns.resultJson})
)
)
end
`.as("resultJson");
const heartbeatRunSafeColumns = {
...getTableColumns(heartbeatRuns),
processGroupId: heartbeatRunProcessGroupIdColumn,
resultJson: heartbeatRunSafeResultJsonColumn,
} as const;
const heartbeatRunLogAccessColumns = {
id: heartbeatRuns.id,
companyId: heartbeatRuns.companyId,
logStore: heartbeatRuns.logStore,
logRef: heartbeatRuns.logRef,
} as const;
const heartbeatRunIssueSummaryColumns = {
id: heartbeatRuns.id,
status: heartbeatRuns.status,
@ -519,6 +599,87 @@ function readNonEmptyString(value: unknown): string | null {
return typeof value === "string" && value.trim().length > 0 ? value : null;
}
export function summarizeHeartbeatRunContextSnapshot(
contextSnapshot: Record<string, unknown> | null | undefined,
): Record<string, unknown> | null {
const summary: Record<string, unknown> = {};
const allowedKeys = [
"issueId",
"taskId",
"taskKey",
"commentId",
"wakeCommentId",
"wakeReason",
"wakeSource",
"wakeTriggerDetail",
] as const;
for (const key of allowedKeys) {
const value = readNonEmptyString(contextSnapshot?.[key]);
if (value) summary[key] = value;
}
return Object.keys(summary).length > 0 ? summary : null;
}
export function summarizeHeartbeatRunListResultJson(input: {
summary?: string | null;
result?: string | null;
message?: string | null;
error?: string | null;
totalCostUsd?: string | null;
costUsd?: string | null;
costUsdCamel?: string | null;
}): Record<string, unknown> | null {
const summary: Record<string, unknown> = {};
for (const [key, value] of [
["summary", input.summary],
["result", input.result],
["message", input.message],
["error", input.error],
] as const) {
const normalized = readNonEmptyString(value);
if (normalized) summary[key] = normalized;
}
for (const [key, value] of [
["total_cost_usd", input.totalCostUsd],
["cost_usd", input.costUsd],
["costUsd", input.costUsdCamel],
] as const) {
const normalized = readNonEmptyString(value);
if (!normalized) continue;
const parsed = Number(normalized);
if (Number.isFinite(parsed)) summary[key] = parsed;
}
return Object.keys(summary).length > 0 ? summary : null;
}
function summarizeRunFailureForIssueComment(
run: Pick<typeof heartbeatRuns.$inferSelect, "error" | "errorCode"> | null | undefined,
) {
if (!run) return null;
const errorCode = readNonEmptyString(run.errorCode)?.trim() ?? null;
const rawError = readNonEmptyString(run.error)?.trim() ?? null;
const apiMessageMatch = rawError?.match(/"message"\s*:\s*"([^"]+)"/);
const firstLine = rawError
?.split(/\r?\n/)
.map((line) => line.trim())
.find(Boolean) ?? null;
const summarySource = apiMessageMatch?.[1] ?? firstLine;
const summary =
summarySource && summarySource.length > 240
? `${summarySource.slice(0, 237)}...`
: summarySource;
if (errorCode && summary) return ` Latest retry failure: \`${errorCode}\` - ${summary}.`;
if (errorCode) return ` Latest retry failure: \`${errorCode}\`.`;
if (summary) return ` Latest retry failure: ${summary}.`;
return null;
}
function normalizeLedgerBillingType(value: unknown): BillingType {
const raw = readNonEmptyString(value);
switch (raw) {
@ -1351,9 +1512,17 @@ export function heartbeatService(db: Db) {
.then((rows) => rows[0] ?? null);
}
async function getRun(runId: string) {
async function getRun(runId: string, opts?: { unsafeFullResultJson?: boolean }) {
return db
.select()
.select(opts?.unsafeFullResultJson ? getTableColumns(heartbeatRuns) : heartbeatRunSafeColumns)
.from(heartbeatRuns)
.where(eq(heartbeatRuns.id, runId))
.then((rows) => rows[0] ?? null);
}
async function getRunLogAccess(runId: string) {
return db
.select(heartbeatRunLogAccessColumns)
.from(heartbeatRuns)
.where(eq(heartbeatRuns.id, runId))
.then((rows) => rows[0] ?? null);
@ -1421,7 +1590,10 @@ export function heartbeatService(db: Db) {
conditions.push(sql`${heartbeatRuns.id} <> ${opts.excludeRunId}`);
}
return db
.select()
.select({
id: heartbeatRuns.id,
usageJson: heartbeatRuns.usageJson,
})
.from(heartbeatRuns)
.where(and(...conditions))
.orderBy(desc(heartbeatRuns.createdAt))
@ -1497,8 +1669,8 @@ export function heartbeatService(db: Db) {
id: heartbeatRuns.id,
createdAt: heartbeatRuns.createdAt,
usageJson: heartbeatRuns.usageJson,
resultJson: heartbeatRuns.resultJson,
error: heartbeatRuns.error,
...heartbeatRunListResultColumns,
})
.from(heartbeatRuns)
.where(and(eq(heartbeatRuns.agentId, agent.id), eq(heartbeatRuns.sessionIdAfter, sessionId)))
@ -1552,7 +1724,15 @@ export function heartbeatService(db: Db) {
};
}
const latestSummary = summarizeHeartbeatRunResultJson(latestRun.resultJson);
const latestSummary = summarizeHeartbeatRunListResultJson({
summary: latestRun?.resultSummary,
result: latestRun?.resultResult,
message: latestRun?.resultMessage,
error: latestRun?.resultError,
totalCostUsd: latestRun?.resultTotalCostUsd,
costUsd: latestRun?.resultCostUsd,
costUsdCamel: latestRun?.resultCostUsdCamel,
});
const latestTextSummary =
readNonEmptyString(latestSummary?.summary) ??
readNonEmptyString(latestSummary?.result) ??
@ -2681,7 +2861,13 @@ export function heartbeatService(db: Db) {
async function getLatestIssueRun(companyId: string, issueId: string) {
return db
.select()
.select({
id: heartbeatRuns.id,
status: heartbeatRuns.status,
error: heartbeatRuns.error,
errorCode: heartbeatRuns.errorCode,
contextSnapshot: heartbeatRuns.contextSnapshot,
})
.from(heartbeatRuns)
.where(
and(
@ -2771,7 +2957,10 @@ export function heartbeatService(db: Db) {
async function escalateStrandedAssignedIssue(input: {
issue: typeof issues.$inferSelect;
previousStatus: "todo" | "in_progress";
latestRun: typeof heartbeatRuns.$inferSelect | null;
latestRun: Pick<
typeof heartbeatRuns.$inferSelect,
"id" | "status" | "error" | "errorCode" | "contextSnapshot"
> | null;
comment: string;
}) {
const updated = await issuesSvc.update(input.issue.id, {
@ -2857,13 +3046,15 @@ export function heartbeatService(db: Db) {
}
if (latestRetryReason === "assignment_recovery") {
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
const updated = await escalateStrandedAssignedIssue({
issue,
previousStatus: "todo",
latestRun,
comment:
"Paperclip automatically retried dispatch for this assigned `todo` issue after a lost wake/run, " +
"but it still has no live execution path. Moving it to `blocked` so it is visible for intervention.",
`but it still has no live execution path.${failureSummary ?? ""} ` +
"Moving it to `blocked` so it is visible for intervention.",
});
if (updated) {
result.escalated += 1;
@ -2892,14 +3083,15 @@ export function heartbeatService(db: Db) {
}
if (latestRetryReason === "issue_continuation_needed") {
const failureSummary = summarizeRunFailureForIssueComment(latestRun);
const updated = await escalateStrandedAssignedIssue({
issue,
previousStatus: "in_progress",
latestRun,
comment:
"Paperclip automatically retried continuation for this assigned `in_progress` issue after its live " +
"execution disappeared, but it still has no live execution path. Moving it to `blocked` so it is " +
"visible for intervention.",
`execution disappeared, but it still has no live execution path.${failureSummary ?? ""} ` +
"Moving it to `blocked` so it is visible for intervention.",
});
if (updated) {
result.escalated += 1;
@ -4940,7 +5132,11 @@ export function heartbeatService(db: Db) {
return {
list: async (companyId: string, agentId?: string, limit?: number) => {
const query = db
.select(heartbeatRunListColumns)
.select({
...heartbeatRunListColumns,
...heartbeatRunListContextColumns,
...heartbeatRunListResultColumns,
})
.from(heartbeatRuns)
.where(
agentId
@ -4950,14 +5146,55 @@ export function heartbeatService(db: Db) {
.orderBy(desc(heartbeatRuns.createdAt));
const rows = limit ? await query.limit(limit) : await query;
return rows.map((row) => ({
...row,
resultJson: summarizeHeartbeatRunResultJson(row.resultJson),
}));
return rows.map((row) => {
const {
contextIssueId,
contextTaskId,
contextTaskKey,
contextCommentId,
contextWakeCommentId,
contextWakeReason,
contextWakeSource,
contextWakeTriggerDetail,
resultSummary,
resultResult,
resultMessage,
resultError,
resultTotalCostUsd,
resultCostUsd,
resultCostUsdCamel,
...rest
} = row;
return {
...rest,
contextSnapshot: summarizeHeartbeatRunContextSnapshot({
issueId: contextIssueId,
taskId: contextTaskId,
taskKey: contextTaskKey,
commentId: contextCommentId,
wakeCommentId: contextWakeCommentId,
wakeReason: contextWakeReason,
wakeSource: contextWakeSource,
wakeTriggerDetail: contextWakeTriggerDetail,
}),
resultJson: summarizeHeartbeatRunListResultJson({
summary: resultSummary,
result: resultResult,
message: resultMessage,
error: resultError,
totalCostUsd: resultTotalCostUsd,
costUsd: resultCostUsd,
costUsdCamel: resultCostUsdCamel,
}),
};
});
},
getRun,
getRunLogAccess,
getRuntimeState: async (agentId: string) => {
const state = await getRuntimeState(agentId);
const agent = await getAgent(agentId);
@ -5031,8 +5268,17 @@ export function heartbeatService(db: Db) {
.orderBy(asc(heartbeatRunEvents.seq))
.limit(Math.max(1, Math.min(limit, 1000))),
readLog: async (runId: string, opts?: { offset?: number; limitBytes?: number }) => {
const run = await getRun(runId);
readLog: async (
runOrLookup: string | {
id: string;
companyId: string;
logStore: string | null;
logRef: string | null;
},
opts?: { offset?: number; limitBytes?: number },
) => {
const run = typeof runOrLookup === "string" ? await getRunLogAccess(runOrLookup) : runOrLookup;
const runId = typeof runOrLookup === "string" ? runOrLookup : runOrLookup.id;
if (!run) throw notFound("Heartbeat run not found");
if (!run.logStore || !run.logRef) throw notFound("Run log not found");