[codex] Improve agent runtime recovery and governance (#4086)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - The heartbeat runtime, agent import path, and agent configuration
defaults determine whether work is dispatched safely and predictably.
> - Several accumulated fixes all touched agent execution recovery, wake
routing, import behavior, and runtime concurrency defaults.
> - Those changes need to land together so the heartbeat service and
agent creation defaults stay internally consistent.
> - This pull request groups the runtime/governance changes from the
split branch into one standalone branch.
> - The benefit is safer recovery for stranded runs, bounded high-volume
reads, imported-agent approval correctness, skill-template support, and
a clearer default concurrency policy.

## What Changed

- Fixed stranded continuation recovery so successful automatic retries
are requeued instead of incorrectly blocking the issue.
- Bounded high-volume issue/log reads across issue, heartbeat, agent,
project, and workspace paths.
- Fixed imported-agent approval and instruction-path permission
handling.
- Quarantined seeded worktree execution state during worktree
provisioning.
- Queued approval follow-up wakes and hardened SQL_ASCII heartbeat
output handling.
- Added reusable agent instruction templates for hiring flows.
- Set the default max concurrent agent runs to five and updated related
UI/tests/docs.

## Verification

- `pnpm install --frozen-lockfile`
- `pnpm exec vitest run server/src/__tests__/company-portability.test.ts
server/src/__tests__/heartbeat-process-recovery.test.ts
server/src/__tests__/heartbeat-comment-wake-batching.test.ts
server/src/__tests__/heartbeat-list.test.ts
server/src/__tests__/issues-service.test.ts
server/src/__tests__/agent-permissions-routes.test.ts
packages/adapter-utils/src/server-utils.test.ts
ui/src/lib/new-agent-runtime-config.test.ts`
- Split integration check: merged this branch first, followed by the
other [PAP-1614](/PAP/issues/PAP-1614) branches, with no merge
conflicts.
- Confirmed this branch does not include `pnpm-lock.yaml`.

## Risks

- Medium risk: touches heartbeat recovery, queueing, and issue list
bounds in central runtime paths.
- Imported-agent and concurrency default behavior changes may affect
existing automation that assumes one-at-a-time default runs.
- No database migrations are included.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) and discuss the
proposed change in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected. See `CONTRIBUTING.md`
for details.

## Model Used

- OpenAI Codex, GPT-5.4 tool-enabled coding model, agentic
code-editing/runtime with local shell and GitHub CLI access; exact
context window and reasoning mode are not exposed by the Paperclip
harness.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [x] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Dotta 2026-04-20 06:19:48 -05:00 committed by GitHub
parent 057fee4836
commit 16b2b84d84
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
38 changed files with 1569 additions and 240 deletions

View file

@ -7,6 +7,7 @@ import { and, desc, eq, inArray, not, sql } from "drizzle-orm";
import {
agentSkillSyncSchema,
agentMineInboxQuerySchema,
AGENT_DEFAULT_MAX_CONCURRENT_RUNS,
createAgentKeySchema,
createAgentHireSchema,
createAgentSchema,
@ -37,6 +38,7 @@ import {
companySkillService,
budgetService,
heartbeatService,
ISSUE_LIST_DEFAULT_LIMIT,
issueApprovalService,
issueService,
logActivity,
@ -75,6 +77,15 @@ import {
} from "../services/default-agent-instructions.js";
import { getTelemetryClient } from "../telemetry.js";
/** Byte limit applied to a log read when the caller supplies no usable `limitBytes`. */
const RUN_LOG_DEFAULT_LIMIT_BYTES = 256_000;
/** Upper bound on a single log read (1 MiB), regardless of what the caller requests. */
const RUN_LOG_MAX_LIMIT_BYTES = 1024 * 1024;
/**
 * Normalizes a raw `limitBytes` value (typically an untyped query parameter)
 * into a whole number of bytes within `[1, RUN_LOG_MAX_LIMIT_BYTES]`.
 *
 * @param value Raw, untrusted input; may be missing, blank, or non-numeric.
 * @returns `RUN_LOG_DEFAULT_LIMIT_BYTES` when the value is absent, blank, or
 *   does not coerce to a finite number; otherwise the truncated value clamped
 *   to the allowed range.
 */
function readRunLogLimitBytes(value: unknown): number {
  // `Number("")` and `Number("   ")` are 0, which the clamp below would turn
  // into a 1-byte read. A blank parameter (`?limitBytes=`) means "not
  // provided", so it gets the default just like null/undefined.
  const isBlankString = typeof value === "string" && value.trim() === "";
  if (value == null || isBlankString) return RUN_LOG_DEFAULT_LIMIT_BYTES;

  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return RUN_LOG_DEFAULT_LIMIT_BYTES;

  // Drop any fractional part, then clamp into [1, max].
  return Math.max(1, Math.min(RUN_LOG_MAX_LIMIT_BYTES, Math.trunc(parsed)));
}
export function agentRoutes(db: Db) {
// Legacy hardcoded maps — used as fallback when adapter module does not
// declare capability flags explicitly.
@ -514,6 +525,9 @@ export function agentRoutes(db: Db) {
if (parseBooleanLike(heartbeat.enabled) == null) {
heartbeat.enabled = false;
}
if (parseNumberLike(heartbeat.maxConcurrentRuns) == null) {
heartbeat.maxConcurrentRuns = AGENT_DEFAULT_MAX_CONCURRENT_RUNS;
}
normalizedRuntimeConfig.heartbeat = heartbeat;
return normalizedRuntimeConfig;
@ -1168,6 +1182,7 @@ export function agentRoutes(db: Db) {
assigneeAgentId: req.actor.agentId,
status: "todo,in_progress,blocked",
includeRoutineExecutions: true,
limit: ISSUE_LIST_DEFAULT_LIMIT,
});
res.json(
@ -1198,6 +1213,7 @@ export function agentRoutes(db: Db) {
touchedByUserId: query.userId,
inboxArchivedByUserId: query.userId,
status: query.status,
limit: ISSUE_LIST_DEFAULT_LIMIT,
});
res.json(rows);
@ -1682,6 +1698,10 @@ export function agentRoutes(db: Db) {
});
router.patch("/agents/:id/instructions-path", validate(updateAgentInstructionsPathSchema), async (req, res) => {
if (req.actor.type !== "board") {
throw forbidden("Only board-authenticated callers can manage instructions path or bundle configuration");
}
const id = req.params.id as string;
const existing = await svc.getById(id);
if (!existing) {
@ -2098,6 +2118,42 @@ export function agentRoutes(db: Db) {
res.json(agent);
});
// POST /agents/:id/approve — activates an agent that is waiting for approval.
// Gated by assertBoard (presumably rejects non-board callers — confirm against
// its definition). Responds 409 unless the agent is in "pending_approval",
// 404 if the activation call returns nothing, and otherwise records an
// "agent.approved" activity entry and returns the agent as JSON.
router.post("/agents/:id/approve", async (req, res) => {
  assertBoard(req);

  const agentId = req.params.id as string;
  const respondNotPending = () => {
    res.status(409).json({ error: "Only pending approval agents can be approved" });
  };

  const current = await getAccessibleAgent(req, res, agentId);
  // NOTE(review): a falsy result is assumed to mean getAccessibleAgent has
  // already written the response — verify against the helper.
  if (!current) return;

  if (current.status !== "pending_approval") {
    respondNotPending();
    return;
  }

  const outcome = await svc.activatePendingApproval(agentId);
  if (!outcome) {
    res.status(404).json({ error: "Agent not found" });
    return;
  }
  // The status is checked again on the activation result: the service may
  // report that nothing was activated even though the earlier read said pending.
  if (!outcome.activated) {
    respondNotPending();
    return;
  }

  const approvedAgent = outcome.agent;
  await logActivity(db, {
    companyId: approvedAgent.companyId,
    actorType: "user",
    actorId: req.actor.userId ?? "board",
    action: "agent.approved",
    entityType: "agent",
    entityId: approvedAgent.id,
    details: { source: "agent_detail" },
  });

  res.json(approvedAgent);
});
router.post("/agents/:id/terminate", async (req, res) => {
assertBoard(req);
const id = req.params.id as string;
@ -2492,10 +2548,10 @@ export function agentRoutes(db: Db) {
assertCompanyAccess(req, run.companyId);
const offset = Number(req.query.offset ?? 0);
const limitBytes = Number(req.query.limitBytes ?? 256000);
const limitBytes = readRunLogLimitBytes(req.query.limitBytes);
const result = await heartbeat.readLog(run, {
offset: Number.isFinite(offset) ? offset : 0,
limitBytes: Number.isFinite(limitBytes) ? limitBytes : 256000,
limitBytes,
});
res.set("Cache-Control", "no-cache, no-store");
@ -2527,10 +2583,10 @@ export function agentRoutes(db: Db) {
assertCompanyAccess(req, operation.companyId);
const offset = Number(req.query.offset ?? 0);
const limitBytes = Number(req.query.limitBytes ?? 256000);
const limitBytes = readRunLogLimitBytes(req.query.limitBytes);
const result = await workspaceOperations.readLog(operationId, {
offset: Number.isFinite(offset) ? offset : 0,
limitBytes: Number.isFinite(limitBytes) ? limitBytes : 256000,
limitBytes,
});
res.set("Cache-Control", "no-cache, no-store");