mirror of
https://github.com/alkimake/paperclip.git
synced 2026-06-17 19:20:39 +09:00
Improve orphaned local heartbeat recovery
Persist child-process metadata for local adapter runs, keep detached runs alive when their pid still exists, queue a single automatic retry when the pid is confirmed dead, and clear detached warnings when the original run reports activity again. Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
parent
7f3fad64b8
commit
c844ca1a40
17 changed files with 10924 additions and 22 deletions
|
|
@ -8,6 +8,8 @@ export interface RunProcessResult {
|
|||
timedOut: boolean;
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
pid: number | null;
|
||||
startedAt: string | null;
|
||||
}
|
||||
|
||||
interface RunningProcess {
|
||||
|
|
@ -423,6 +425,7 @@ export async function runChildProcess(
|
|||
graceSec: number;
|
||||
onLog: (stream: "stdout" | "stderr", chunk: string) => Promise<void>;
|
||||
onLogError?: (err: unknown, runId: string, message: string) => void;
|
||||
onSpawn?: (meta: { pid: number; startedAt: string }) => Promise<void>;
|
||||
stdin?: string;
|
||||
},
|
||||
): Promise<RunProcessResult> {
|
||||
|
|
@ -455,12 +458,19 @@ export async function runChildProcess(
|
|||
shell: false,
|
||||
stdio: [opts.stdin != null ? "pipe" : "ignore", "pipe", "pipe"],
|
||||
}) as ChildProcessWithEvents;
|
||||
const startedAt = new Date().toISOString();
|
||||
|
||||
if (opts.stdin != null && child.stdin) {
|
||||
child.stdin.write(opts.stdin);
|
||||
child.stdin.end();
|
||||
}
|
||||
|
||||
if (typeof child.pid === "number" && child.pid > 0 && opts.onSpawn) {
|
||||
void opts.onSpawn({ pid: child.pid, startedAt }).catch((err) => {
|
||||
onLogError(err, runId, "failed to record child process metadata");
|
||||
});
|
||||
}
|
||||
|
||||
runningProcesses.set(runId, { child, graceSec: opts.graceSec });
|
||||
|
||||
let timedOut = false;
|
||||
|
|
@ -519,6 +529,8 @@ export async function runChildProcess(
|
|||
timedOut,
|
||||
stdout,
|
||||
stderr,
|
||||
pid: child.pid ?? null,
|
||||
startedAt,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -120,6 +120,7 @@ export interface AdapterExecutionContext {
|
|||
context: Record<string, unknown>;
|
||||
onLog: (stream: "stdout" | "stderr", chunk: string) => Promise<void>;
|
||||
onMeta?: (meta: AdapterInvocationMeta) => Promise<void>;
|
||||
onSpawn?: (meta: { pid: number; startedAt: string }) => Promise<void>;
|
||||
authToken?: string;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -303,7 +303,7 @@ export async function runClaudeLogin(input: {
|
|||
}
|
||||
|
||||
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
|
||||
|
||||
const promptTemplate = asString(
|
||||
config.promptTemplate,
|
||||
|
|
@ -455,6 +455,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
stdin: prompt,
|
||||
timeoutSec,
|
||||
graceSec,
|
||||
onSpawn,
|
||||
onLog,
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -167,7 +167,7 @@ export async function ensureCodexSkillsInjected(
|
|||
}
|
||||
|
||||
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
|
||||
|
||||
const promptTemplate = asString(
|
||||
config.promptTemplate,
|
||||
|
|
@ -454,6 +454,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
stdin: prompt,
|
||||
timeoutSec,
|
||||
graceSec,
|
||||
onSpawn,
|
||||
onLog: async (stream, chunk) => {
|
||||
if (stream !== "stderr") {
|
||||
await onLog(stream, chunk);
|
||||
|
|
|
|||
|
|
@ -152,7 +152,7 @@ export async function ensureCursorSkillsInjected(
|
|||
}
|
||||
|
||||
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
|
||||
|
||||
const promptTemplate = asString(
|
||||
config.promptTemplate,
|
||||
|
|
@ -419,6 +419,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
timeoutSec,
|
||||
graceSec,
|
||||
stdin: prompt,
|
||||
onSpawn,
|
||||
onLog: async (stream, chunk) => {
|
||||
if (stream !== "stdout") {
|
||||
await onLog(stream, chunk);
|
||||
|
|
|
|||
|
|
@ -129,7 +129,7 @@ async function ensureGeminiSkillsInjected(
|
|||
}
|
||||
|
||||
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
|
||||
|
||||
const promptTemplate = asString(
|
||||
config.promptTemplate,
|
||||
|
|
@ -349,6 +349,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
env,
|
||||
timeoutSec,
|
||||
graceSec,
|
||||
onSpawn,
|
||||
onLog,
|
||||
});
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -88,7 +88,7 @@ async function ensureOpenCodeSkillsInjected(onLog: AdapterExecutionContext["onLo
|
|||
}
|
||||
|
||||
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
|
||||
|
||||
const promptTemplate = asString(
|
||||
config.promptTemplate,
|
||||
|
|
@ -301,6 +301,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
stdin: prompt,
|
||||
timeoutSec,
|
||||
graceSec,
|
||||
onSpawn,
|
||||
onLog,
|
||||
});
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -102,7 +102,7 @@ function buildSessionPath(agentId: string, timestamp: string): string {
|
|||
}
|
||||
|
||||
export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExecutionResult> {
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, authToken } = ctx;
|
||||
const { runId, agent, runtime, config, context, onLog, onMeta, onSpawn, authToken } = ctx;
|
||||
|
||||
const promptTemplate = asString(
|
||||
config.promptTemplate,
|
||||
|
|
@ -398,6 +398,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
env: runtimeEnv,
|
||||
timeoutSec,
|
||||
graceSec,
|
||||
onSpawn,
|
||||
onLog: bufferedOnLog,
|
||||
stdin: buildRpcStdin(),
|
||||
});
|
||||
|
|
|
|||
5
packages/db/src/migrations/0038_careless_iron_monger.sql
Normal file
5
packages/db/src/migrations/0038_careless_iron_monger.sql
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
ALTER TABLE "heartbeat_runs" ADD COLUMN "process_pid" integer;--> statement-breakpoint
|
||||
ALTER TABLE "heartbeat_runs" ADD COLUMN "process_started_at" timestamp with time zone;--> statement-breakpoint
|
||||
ALTER TABLE "heartbeat_runs" ADD COLUMN "retry_of_run_id" uuid;--> statement-breakpoint
|
||||
ALTER TABLE "heartbeat_runs" ADD COLUMN "process_loss_retry_count" integer DEFAULT 0 NOT NULL;--> statement-breakpoint
|
||||
ALTER TABLE "heartbeat_runs" ADD CONSTRAINT "heartbeat_runs_retry_of_run_id_heartbeat_runs_id_fk" FOREIGN KEY ("retry_of_run_id") REFERENCES "public"."heartbeat_runs"("id") ON DELETE set null ON UPDATE no action;
|
||||
10301
packages/db/src/migrations/meta/0038_snapshot.json
Normal file
10301
packages/db/src/migrations/meta/0038_snapshot.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -267,6 +267,13 @@
|
|||
"when": 1773756922363,
|
||||
"tag": "0037_friendly_eddie_brock",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 38,
|
||||
"version": "7",
|
||||
"when": 1773931592563,
|
||||
"tag": "0038_careless_iron_monger",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
import { pgTable, uuid, text, timestamp, jsonb, index, integer, bigint, boolean } from "drizzle-orm/pg-core";
|
||||
import { type AnyPgColumn, pgTable, uuid, text, timestamp, jsonb, index, integer, bigint, boolean } from "drizzle-orm/pg-core";
|
||||
import { companies } from "./companies.js";
|
||||
import { agents } from "./agents.js";
|
||||
import { agentWakeupRequests } from "./agent_wakeup_requests.js";
|
||||
|
|
@ -31,6 +31,12 @@ export const heartbeatRuns = pgTable(
|
|||
stderrExcerpt: text("stderr_excerpt"),
|
||||
errorCode: text("error_code"),
|
||||
externalRunId: text("external_run_id"),
|
||||
processPid: integer("process_pid"),
|
||||
processStartedAt: timestamp("process_started_at", { withTimezone: true }),
|
||||
retryOfRunId: uuid("retry_of_run_id").references((): AnyPgColumn => heartbeatRuns.id, {
|
||||
onDelete: "set null",
|
||||
}),
|
||||
processLossRetryCount: integer("process_loss_retry_count").notNull().default(0),
|
||||
contextSnapshot: jsonb("context_snapshot").$type<Record<string, unknown>>(),
|
||||
createdAt: timestamp("created_at", { withTimezone: true }).notNull().defaultNow(),
|
||||
updatedAt: timestamp("updated_at", { withTimezone: true }).notNull().defaultNow(),
|
||||
|
|
|
|||
|
|
@ -33,6 +33,10 @@ export interface HeartbeatRun {
|
|||
stderrExcerpt: string | null;
|
||||
errorCode: string | null;
|
||||
externalRunId: string | null;
|
||||
processPid: number | null;
|
||||
processStartedAt: Date | null;
|
||||
retryOfRunId: string | null;
|
||||
processLossRetryCount: number;
|
||||
contextSnapshot: Record<string, unknown> | null;
|
||||
createdAt: Date;
|
||||
updatedAt: Date;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue