Improve orphaned local heartbeat recovery

Persist child-process metadata for local adapter runs, keep detached runs alive when their pid still exists, queue a single automatic retry when the pid is confirmed dead, and clear detached warnings when the original run reports activity again.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
dotta 2026-03-19 11:20:36 -05:00
parent 7f3fad64b8
commit c844ca1a40
17 changed files with 10924 additions and 22 deletions

View file

@ -8,6 +8,8 @@ export interface RunProcessResult {
timedOut: boolean;
stdout: string;
stderr: string;
pid: number | null;
startedAt: string | null;
}
interface RunningProcess {
@ -423,6 +425,7 @@ export async function runChildProcess(
graceSec: number;
onLog: (stream: "stdout" | "stderr", chunk: string) => Promise<void>;
onLogError?: (err: unknown, runId: string, message: string) => void;
onSpawn?: (meta: { pid: number; startedAt: string }) => Promise<void>;
stdin?: string;
},
): Promise<RunProcessResult> {
@ -455,12 +458,19 @@ export async function runChildProcess(
shell: false,
stdio: [opts.stdin != null ? "pipe" : "ignore", "pipe", "pipe"],
}) as ChildProcessWithEvents;
const startedAt = new Date().toISOString();
if (opts.stdin != null && child.stdin) {
child.stdin.write(opts.stdin);
child.stdin.end();
}
if (typeof child.pid === "number" && child.pid > 0 && opts.onSpawn) {
void opts.onSpawn({ pid: child.pid, startedAt }).catch((err) => {
onLogError(err, runId, "failed to record child process metadata");
});
}
runningProcesses.set(runId, { child, graceSec: opts.graceSec });
let timedOut = false;
@ -519,6 +529,8 @@ export async function runChildProcess(
timedOut,
stdout,
stderr,
pid: child.pid ?? null,
startedAt,
});
});
});

View file

@ -120,6 +120,7 @@ export interface AdapterExecutionContext {
context: Record<string, unknown>;
onLog: (stream: "stdout" | "stderr", chunk: string) => Promise<void>;
onMeta?: (meta: AdapterInvocationMeta) => Promise<void>;
onSpawn?: (meta: { pid: number; startedAt: string }) => Promise<void>;
authToken?: string;
}

View file

@ -303,7 +303,7 @@ export async function runClaudeLogin(input: {
}
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
const promptTemplate = asString(
config.promptTemplate,
@ -455,6 +455,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
stdin: prompt,
timeoutSec,
graceSec,
onSpawn,
onLog,
});

View file

@ -167,7 +167,7 @@ export async function ensureCodexSkillsInjected(
}
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
const promptTemplate = asString(
config.promptTemplate,
@ -454,6 +454,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
stdin: prompt,
timeoutSec,
graceSec,
onSpawn,
onLog: async (stream, chunk) => {
if (stream !== "stderr") {
await onLog(stream, chunk);

View file

@ -152,7 +152,7 @@ export async function ensureCursorSkillsInjected(
}
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
const promptTemplate = asString(
config.promptTemplate,
@ -419,6 +419,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
timeoutSec,
graceSec,
stdin: prompt,
onSpawn,
onLog: async (stream, chunk) => {
if (stream !== "stdout") {
await onLog(stream, chunk);

View file

@ -129,7 +129,7 @@ async function ensureGeminiSkillsInjected(
}
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
const promptTemplate = asString(
config.promptTemplate,
@ -349,6 +349,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
env,
timeoutSec,
graceSec,
onSpawn,
onLog,
});
return {

View file

@ -88,7 +88,7 @@ async function ensureOpenCodeSkillsInjected(onLog: AdapterExecutionContext["onLo
}
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
const promptTemplate = asString(
config.promptTemplate,
@ -301,6 +301,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
stdin: prompt,
timeoutSec,
graceSec,
onSpawn,
onLog,
});
return {

View file

@ -102,7 +102,7 @@ function buildSessionPath(agentId: string, timestamp: string): string {
}
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
const promptTemplate = asString(
config.promptTemplate,
@ -398,6 +398,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
env: runtimeEnv,
timeoutSec,
graceSec,
onSpawn,
onLog: bufferedOnLog,
stdin: buildRpcStdin(),
});

View file

@ -0,0 +1,5 @@
-- Adds process-tracking and retry-linkage columns to "heartbeat_runs".
-- NOTE(review): per the accompanying commit message these support orphaned
-- local heartbeat recovery (persisting child-process metadata and queuing an
-- automatic retry); confirm exact usage against the service code.

-- Nullable integer; presumably the OS pid of the run's child process.
ALTER TABLE "heartbeat_runs" ADD COLUMN "process_pid" integer;--> statement-breakpoint
-- Nullable timestamptz; presumably when that child process was spawned.
ALTER TABLE "heartbeat_runs" ADD COLUMN "process_started_at" timestamp with time zone;--> statement-breakpoint
-- Nullable uuid; made a self-reference to "heartbeat_runs"."id" by the constraint added below.
ALTER TABLE "heartbeat_runs" ADD COLUMN "retry_of_run_id" uuid;--> statement-breakpoint
-- NOT NULL with DEFAULT 0; presumably counts retries triggered by process loss.
ALTER TABLE "heartbeat_runs" ADD COLUMN "process_loss_retry_count" integer DEFAULT 0 NOT NULL;--> statement-breakpoint
-- Self-referencing foreign key: deleting the referenced run sets "retry_of_run_id"
-- to NULL on referencing rows rather than deleting them.
ALTER TABLE "heartbeat_runs" ADD CONSTRAINT "heartbeat_runs_retry_of_run_id_heartbeat_runs_id_fk" FOREIGN KEY ("retry_of_run_id") REFERENCES "public"."heartbeat_runs"("id") ON DELETE set null ON UPDATE no action;

File diff suppressed because it is too large Load diff

View file

@ -267,6 +267,13 @@
"when": 1773756922363,
"tag": "0037_friendly_eddie_brock",
"breakpoints": true
},
{
"idx": 38,
"version": "7",
"when": 1773931592563,
"tag": "0038_careless_iron_monger",
"breakpoints": true
}
]
}

View file

@ -1,4 +1,4 @@
import { pgTable, uuid, text, timestamp, jsonb, index, integer, bigint, boolean } from "drizzle-orm/pg-core";
import { type AnyPgColumn, pgTable, uuid, text, timestamp, jsonb, index, integer, bigint, boolean } from "drizzle-orm/pg-core";
import { companies } from "./companies.js";
import { agents } from "./agents.js";
import { agentWakeupRequests } from "./agent_wakeup_requests.js";
@ -31,6 +31,12 @@ export const heartbeatRuns = pgTable(
stderrExcerpt: text("stderr_excerpt"),
errorCode: text("error_code"),
externalRunId: text("external_run_id"),
processPid: integer("process_pid"),
processStartedAt: timestamp("process_started_at", { withTimezone: true }),
retryOfRunId: uuid("retry_of_run_id").references((): AnyPgColumn => heartbeatRuns.id, {
onDelete: "set null",
}),
processLossRetryCount: integer("process_loss_retry_count").notNull().default(0),
contextSnapshot: jsonb("context_snapshot").$type<Record<string, unknown>>(),
createdAt: timestamp("created_at", { withTimezone: true }).notNull().defaultNow(),
updatedAt: timestamp("updated_at", { withTimezone: true }).notNull().defaultNow(),

View file

@ -33,6 +33,10 @@ export interface HeartbeatRun {
stderrExcerpt: string | null;
errorCode: string | null;
externalRunId: string | null;
processPid: number | null;
processStartedAt: Date | null;
retryOfRunId: string | null;
processLossRetryCount: number;
contextSnapshot: Record<string, unknown> | null;
createdAt: Date;
updatedAt: Date;