[codex] Add runtime lifecycle recovery and live issue visibility (#4419)

This commit is contained in:
Dotta 2026-04-24 15:50:32 -05:00 committed by GitHub
parent 9a8d219949
commit 5a0c1979cf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
121 changed files with 9625 additions and 2044 deletions

View file

@ -0,0 +1,13 @@
CREATE UNIQUE INDEX IF NOT EXISTS "issues_active_liveness_recovery_incident_uq"
ON "issues" USING btree ("company_id","origin_kind","origin_id")
WHERE "origin_kind" = 'harness_liveness_escalation'
AND "origin_id" IS NOT NULL
AND "hidden_at" IS NULL
AND "status" NOT IN ('done', 'cancelled');
--> statement-breakpoint
CREATE UNIQUE INDEX IF NOT EXISTS "issues_active_liveness_recovery_leaf_uq"
ON "issues" USING btree ("company_id","origin_kind","origin_fingerprint")
WHERE "origin_kind" = 'harness_liveness_escalation'
AND "origin_fingerprint" <> 'default'
AND "hidden_at" IS NULL
AND "status" NOT IN ('done', 'cancelled');

View file

@ -0,0 +1,70 @@
ALTER TABLE "heartbeat_runs" ADD COLUMN IF NOT EXISTS "last_output_at" timestamp with time zone;
--> statement-breakpoint
ALTER TABLE "heartbeat_runs" ADD COLUMN IF NOT EXISTS "last_output_seq" integer DEFAULT 0 NOT NULL;
--> statement-breakpoint
ALTER TABLE "heartbeat_runs" ADD COLUMN IF NOT EXISTS "last_output_stream" text;
--> statement-breakpoint
ALTER TABLE "heartbeat_runs" ADD COLUMN IF NOT EXISTS "last_output_bytes" bigint;
--> statement-breakpoint
CREATE INDEX IF NOT EXISTS "heartbeat_runs_company_status_last_output_idx"
ON "heartbeat_runs" USING btree ("company_id","status","last_output_at");
--> statement-breakpoint
CREATE INDEX IF NOT EXISTS "heartbeat_runs_company_status_process_started_idx"
ON "heartbeat_runs" USING btree ("company_id","status","process_started_at");
--> statement-breakpoint
CREATE TABLE IF NOT EXISTS "heartbeat_run_watchdog_decisions" (
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
"company_id" uuid NOT NULL,
"run_id" uuid NOT NULL,
"evaluation_issue_id" uuid,
"decision" text NOT NULL,
"snoozed_until" timestamp with time zone,
"reason" text,
"created_by_agent_id" uuid,
"created_by_user_id" text,
"created_by_run_id" uuid,
"created_at" timestamp with time zone DEFAULT now() NOT NULL
);
--> statement-breakpoint
DO $$ BEGIN
ALTER TABLE "heartbeat_run_watchdog_decisions" ADD CONSTRAINT "heartbeat_run_watchdog_decisions_company_id_companies_id_fk" FOREIGN KEY ("company_id") REFERENCES "public"."companies"("id") ON DELETE no action ON UPDATE no action;
EXCEPTION
WHEN duplicate_object THEN null;
END $$;
--> statement-breakpoint
DO $$ BEGIN
ALTER TABLE "heartbeat_run_watchdog_decisions" ADD CONSTRAINT "heartbeat_run_watchdog_decisions_run_id_heartbeat_runs_id_fk" FOREIGN KEY ("run_id") REFERENCES "public"."heartbeat_runs"("id") ON DELETE cascade ON UPDATE no action;
EXCEPTION
WHEN duplicate_object THEN null;
END $$;
--> statement-breakpoint
DO $$ BEGIN
ALTER TABLE "heartbeat_run_watchdog_decisions" ADD CONSTRAINT "heartbeat_run_watchdog_decisions_evaluation_issue_id_issues_id_fk" FOREIGN KEY ("evaluation_issue_id") REFERENCES "public"."issues"("id") ON DELETE set null ON UPDATE no action;
EXCEPTION
WHEN duplicate_object THEN null;
END $$;
--> statement-breakpoint
DO $$ BEGIN
ALTER TABLE "heartbeat_run_watchdog_decisions" ADD CONSTRAINT "heartbeat_run_watchdog_decisions_created_by_agent_id_agents_id_fk" FOREIGN KEY ("created_by_agent_id") REFERENCES "public"."agents"("id") ON DELETE set null ON UPDATE no action;
EXCEPTION
WHEN duplicate_object THEN null;
END $$;
--> statement-breakpoint
DO $$ BEGIN
ALTER TABLE "heartbeat_run_watchdog_decisions" ADD CONSTRAINT "heartbeat_run_watchdog_decisions_created_by_run_id_heartbeat_runs_id_fk" FOREIGN KEY ("created_by_run_id") REFERENCES "public"."heartbeat_runs"("id") ON DELETE set null ON UPDATE no action;
EXCEPTION
WHEN duplicate_object THEN null;
END $$;
--> statement-breakpoint
CREATE INDEX IF NOT EXISTS "heartbeat_run_watchdog_decisions_company_run_created_idx"
ON "heartbeat_run_watchdog_decisions" USING btree ("company_id","run_id","created_at");
--> statement-breakpoint
CREATE INDEX IF NOT EXISTS "heartbeat_run_watchdog_decisions_company_run_snooze_idx"
ON "heartbeat_run_watchdog_decisions" USING btree ("company_id","run_id","snoozed_until");
--> statement-breakpoint
CREATE UNIQUE INDEX IF NOT EXISTS "issues_active_stale_run_evaluation_uq"
ON "issues" USING btree ("company_id","origin_kind","origin_id")
WHERE "origin_kind" = 'stale_active_run_evaluation'
AND "origin_id" IS NOT NULL
AND "hidden_at" IS NULL
AND "status" NOT IN ('done', 'cancelled');

View file

@ -484,6 +484,20 @@
"when": 1776959400000,
"tag": "0068_environment_local_driver_unique",
"breakpoints": true
},
{
"idx": 69,
"version": "7",
"when": 1776780003000,
"tag": "0069_liveness_recovery_dedupe",
"breakpoints": true
},
{
"idx": 70,
"version": "7",
"when": 1776780004000,
"tag": "0070_active_run_output_watchdog",
"breakpoints": true
}
]
}

View file

@ -0,0 +1,34 @@
import { index, pgTable, text, timestamp, uuid } from "drizzle-orm/pg-core";
import { agents } from "./agents.js";
import { companies } from "./companies.js";
import { heartbeatRuns } from "./heartbeat_runs.js";
import { issues } from "./issues.js";
export const heartbeatRunWatchdogDecisions = pgTable(
"heartbeat_run_watchdog_decisions",
{
id: uuid("id").primaryKey().defaultRandom(),
companyId: uuid("company_id").notNull().references(() => companies.id),
runId: uuid("run_id").notNull().references(() => heartbeatRuns.id, { onDelete: "cascade" }),
evaluationIssueId: uuid("evaluation_issue_id").references(() => issues.id, { onDelete: "set null" }),
decision: text("decision").notNull(),
snoozedUntil: timestamp("snoozed_until", { withTimezone: true }),
reason: text("reason"),
createdByAgentId: uuid("created_by_agent_id").references(() => agents.id, { onDelete: "set null" }),
createdByUserId: text("created_by_user_id"),
createdByRunId: uuid("created_by_run_id").references(() => heartbeatRuns.id, { onDelete: "set null" }),
createdAt: timestamp("created_at", { withTimezone: true }).notNull().defaultNow(),
},
(table) => ({
companyRunCreatedIdx: index("heartbeat_run_watchdog_decisions_company_run_created_idx").on(
table.companyId,
table.runId,
table.createdAt,
),
companyRunSnoozeIdx: index("heartbeat_run_watchdog_decisions_company_run_snooze_idx").on(
table.companyId,
table.runId,
table.snoozedUntil,
),
}),
);

View file

@ -34,6 +34,10 @@ export const heartbeatRuns = pgTable(
processPid: integer("process_pid"),
processGroupId: integer("process_group_id"),
processStartedAt: timestamp("process_started_at", { withTimezone: true }),
lastOutputAt: timestamp("last_output_at", { withTimezone: true }),
lastOutputSeq: integer("last_output_seq").notNull().default(0),
lastOutputStream: text("last_output_stream"),
lastOutputBytes: bigint("last_output_bytes", { mode: "number" }),
retryOfRunId: uuid("retry_of_run_id").references((): AnyPgColumn => heartbeatRuns.id, {
onDelete: "set null",
}),
@ -64,5 +68,15 @@ export const heartbeatRuns = pgTable(
table.livenessState,
table.createdAt,
),
companyStatusLastOutputIdx: index("heartbeat_runs_company_status_last_output_idx").on(
table.companyId,
table.status,
table.lastOutputAt,
),
companyStatusProcessStartedIdx: index("heartbeat_runs_company_status_process_started_idx").on(
table.companyId,
table.status,
table.processStartedAt,
),
}),
);

View file

@ -53,6 +53,7 @@ export { documentRevisions } from "./document_revisions.js";
export { issueDocuments } from "./issue_documents.js";
export { heartbeatRuns } from "./heartbeat_runs.js";
export { heartbeatRunEvents } from "./heartbeat_run_events.js";
export { heartbeatRunWatchdogDecisions } from "./heartbeat_run_watchdog_decisions.js";
export { costEvents } from "./cost_events.js";
export { financeEvents } from "./finance_events.js";
export { approvals } from "./approvals.js";

View file

@ -91,5 +91,29 @@ export const issues = pgTable(
and ${table.executionRunId} is not null
and ${table.status} in ('backlog', 'todo', 'in_progress', 'in_review', 'blocked')`,
),
activeLivenessRecoveryIncidentIdx: uniqueIndex("issues_active_liveness_recovery_incident_uq")
.on(table.companyId, table.originKind, table.originId)
.where(
sql`${table.originKind} = 'harness_liveness_escalation'
and ${table.originId} is not null
and ${table.hiddenAt} is null
and ${table.status} not in ('done', 'cancelled')`,
),
activeLivenessRecoveryLeafIdx: uniqueIndex("issues_active_liveness_recovery_leaf_uq")
.on(table.companyId, table.originKind, table.originFingerprint)
.where(
sql`${table.originKind} = 'harness_liveness_escalation'
and ${table.originFingerprint} <> 'default'
and ${table.hiddenAt} is null
and ${table.status} not in ('done', 'cancelled')`,
),
activeStaleRunEvaluationIdx: uniqueIndex("issues_active_stale_run_evaluation_uq")
.on(table.companyId, table.originKind, table.originId)
.where(
sql`${table.originKind} = 'stale_active_run_evaluation'
and ${table.originId} is not null
and ${table.hiddenAt} is null
and ${table.status} not in ('done', 'cancelled')`,
),
}),
);

View file

@ -33,77 +33,56 @@ export type EmbeddedPostgresTestDatabase = {
let embeddedPostgresSupportPromise: Promise<EmbeddedPostgresTestSupport> | null = null;
const DEFAULT_PAPERCLIP_EMBEDDED_POSTGRES_PORT = 54329;
function getReservedTestPorts(): Set<number> {
const configuredPorts = [
DEFAULT_PAPERCLIP_EMBEDDED_POSTGRES_PORT,
Number.parseInt(process.env.PAPERCLIP_EMBEDDED_POSTGRES_PORT ?? "", 10),
...String(process.env.PAPERCLIP_TEST_POSTGRES_RESERVED_PORTS ?? "")
.split(",")
.map((value) => Number.parseInt(value.trim(), 10)),
];
return new Set(configuredPorts.filter((port) => Number.isInteger(port) && port > 0 && port <= 65535));
}
async function getEmbeddedPostgresCtor(): Promise<EmbeddedPostgresCtor> {
const mod = await import("embedded-postgres");
return mod.default as EmbeddedPostgresCtor;
}
async function getAvailablePort(): Promise<number> {
return await new Promise((resolve, reject) => {
const server = net.createServer();
server.unref();
server.on("error", reject);
server.listen(0, "127.0.0.1", () => {
const address = server.address();
if (!address || typeof address === "string") {
server.close(() => reject(new Error("Failed to allocate test port")));
return;
}
const { port } = address;
server.close((error) => {
if (error) reject(error);
else resolve(port);
const reservedPorts = getReservedTestPorts();
for (let attempt = 0; attempt < 20; attempt += 1) {
const port = await new Promise<number>((resolve, reject) => {
const server = net.createServer();
server.unref();
server.on("error", reject);
server.listen(0, "127.0.0.1", () => {
const address = server.address();
if (!address || typeof address === "string") {
server.close(() => reject(new Error("Failed to allocate test port")));
return;
}
const { port } = address;
server.close((error) => {
if (error) reject(error);
else resolve(port);
});
});
});
});
}
function formatEmbeddedPostgresError(error: unknown): string {
if (error instanceof Error && error.message.length > 0) return error.message;
if (typeof error === "string" && error.length > 0) return error;
return "embedded Postgres startup failed";
}
async function probeEmbeddedPostgresSupport(): Promise<EmbeddedPostgresTestSupport> {
const dataDir = fs.mkdtempSync(path.join(os.tmpdir(), "paperclip-embedded-postgres-probe-"));
const port = await getAvailablePort();
const EmbeddedPostgres = await getEmbeddedPostgresCtor();
const instance = new EmbeddedPostgres({
databaseDir: dataDir,
user: "paperclip",
password: "paperclip",
port,
persistent: true,
initdbFlags: ["--encoding=UTF8", "--locale=C", "--lc-messages=C"],
onLog: () => {},
onError: () => {},
});
try {
await instance.initialise();
await instance.start();
return { supported: true };
} catch (error) {
return {
supported: false,
reason: formatEmbeddedPostgresError(error),
};
} finally {
await instance.stop().catch(() => {});
fs.rmSync(dataDir, { recursive: true, force: true });
if (!reservedPorts.has(port)) return port;
}
throw new Error(
`Failed to allocate embedded Postgres test port outside reserved Paperclip ports: ${[
...reservedPorts,
].join(", ")}`,
);
}
export async function getEmbeddedPostgresTestSupport(): Promise<EmbeddedPostgresTestSupport> {
if (!embeddedPostgresSupportPromise) {
embeddedPostgresSupportPromise = probeEmbeddedPostgresSupport();
}
return await embeddedPostgresSupportPromise;
}
export async function startEmbeddedPostgresTestDatabase(
tempDirPrefix: string,
): Promise<EmbeddedPostgresTestDatabase> {
async function createEmbeddedPostgresTestInstance(tempDirPrefix: string) {
const dataDir = fs.mkdtempSync(path.join(os.tmpdir(), tempDirPrefix));
const port = await getAvailablePort();
const EmbeddedPostgres = await getEmbeddedPostgresCtor();
@ -118,6 +97,51 @@ export async function startEmbeddedPostgresTestDatabase(
onError: () => {},
});
return { dataDir, port, instance };
}
function cleanupEmbeddedPostgresTestDirs(dataDir: string) {
fs.rmSync(dataDir, { recursive: true, force: true });
}
function formatEmbeddedPostgresError(error: unknown): string {
if (error instanceof Error && error.message.length > 0) return error.message;
if (typeof error === "string" && error.length > 0) return error;
return "embedded Postgres startup failed";
}
async function probeEmbeddedPostgresSupport(): Promise<EmbeddedPostgresTestSupport> {
const { dataDir, instance } = await createEmbeddedPostgresTestInstance(
"paperclip-embedded-postgres-probe-",
);
try {
await instance.initialise();
await instance.start();
return { supported: true };
} catch (error) {
return {
supported: false,
reason: formatEmbeddedPostgresError(error),
};
} finally {
await instance.stop().catch(() => {});
cleanupEmbeddedPostgresTestDirs(dataDir);
}
}
export async function getEmbeddedPostgresTestSupport(): Promise<EmbeddedPostgresTestSupport> {
if (!embeddedPostgresSupportPromise) {
embeddedPostgresSupportPromise = probeEmbeddedPostgresSupport();
}
return await embeddedPostgresSupportPromise;
}
export async function startEmbeddedPostgresTestDatabase(
tempDirPrefix: string,
): Promise<EmbeddedPostgresTestDatabase> {
const { dataDir, port, instance } = await createEmbeddedPostgresTestInstance(tempDirPrefix);
try {
await instance.initialise();
await instance.start();
@ -131,12 +155,12 @@ export async function startEmbeddedPostgresTestDatabase(
connectionString,
cleanup: async () => {
await instance.stop().catch(() => {});
fs.rmSync(dataDir, { recursive: true, force: true });
cleanupEmbeddedPostgresTestDirs(dataDir);
},
};
} catch (error) {
await instance.stop().catch(() => {});
fs.rmSync(dataDir, { recursive: true, force: true });
cleanupEmbeddedPostgresTestDirs(dataDir);
throw new Error(
`Failed to start embedded PostgreSQL test database: ${formatEmbeddedPostgresError(error)}`,
);