[codex] Add configurable liveness auto-recovery controls (#4587)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Heartbeat liveness recovery decides when stalled issue trees need
manager-visible follow-up.
> - Automatic recovery issue creation is useful, but operators need
instance-level controls for how aggressive it is.
> - Without controls, recovery behavior is harder to tune for local
development, production operations, and noisy edge cases.
> - This pull request adds configurable liveness auto-recovery settings
across shared contracts, API routes, services, and the instance
experimental settings UI.
> - The benefit is that operators can keep liveness findings advisory or
enable bounded recovery automation with explicit intervals and lookback
windows.

## What Changed

- Added shared types and validators for liveness auto-recovery settings.
- Extended instance settings routes and services to persist and validate
the new controls, and added preview and manual-run endpoints for
issue-graph liveness auto-recovery.
- Wired heartbeat/recovery services to honor enablement, minimum
interval, and lookback settings.
- Added UI controls for liveness recovery under instance experimental
settings.
- Covered the new server behavior with instance settings and liveness
escalation tests.

## Verification

- `pnpm exec vitest run --project @paperclipai/server
server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts
server/src/__tests__/instance-settings-routes.test.ts --pool=forks
--poolOptions.forks.isolate=true`
- `pnpm --filter @paperclipai/shared typecheck`
- `pnpm --filter @paperclipai/server typecheck`
- `pnpm --filter @paperclipai/ui typecheck`

## Risks

- Moderate behavioral risk because recovery automation timing changes
when enabled; defaults keep existing advisory behavior unless the
setting is turned on.
- No database migration in this PR; settings are stored through the
existing instance settings path.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) and discuss it
in `#dev` before opening the PR. Feature PRs that overlap with planned
core work may need to be redirected. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, `gpt-5`, coding model with tool use and local command
execution; context window not exposed by the runtime.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Dotta 2026-04-27 08:46:44 -05:00 committed by GitHub
parent f0f9460d1d
commit fda296ee4f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 679 additions and 54 deletions

View file

@ -1,9 +1,13 @@
import { Router, type Request } from "express";
import type { Db } from "@paperclipai/db";
import { patchInstanceExperimentalSettingsSchema, patchInstanceGeneralSettingsSchema } from "@paperclipai/shared";
import {
issueGraphLivenessAutoRecoveryRequestSchema,
patchInstanceExperimentalSettingsSchema,
patchInstanceGeneralSettingsSchema,
} from "@paperclipai/shared";
import { forbidden } from "../errors.js";
import { validate } from "../middleware/validate.js";
import { instanceSettingsService, logActivity } from "../services/index.js";
import { heartbeatService, instanceSettingsService, logActivity } from "../services/index.js";
import { assertBoardOrgAccess, getActorInfo } from "./authz.js";
function assertCanManageInstanceSettings(req: Request) {
@ -19,6 +23,7 @@ function assertCanManageInstanceSettings(req: Request) {
export function instanceSettingsRoutes(db: Db) {
const router = Router();
const svc = instanceSettingsService(db);
const heartbeat = heartbeatService(db);
router.get("/instance/settings/general", async (req, res) => {
// General settings (e.g. keyboardShortcuts) are readable by any
@ -94,5 +99,53 @@ export function instanceSettingsRoutes(db: Db) {
},
);
// POST .../issue-graph-liveness-auto-recovery/preview
// The body is validated against issueGraphLivenessAutoRecoveryRequestSchema
// before the handler runs. The handler then requires instance-settings
// management rights and responds with whatever the heartbeat service's
// preview builder returns for the requested lookback window.
// NOTE(review): presumably the preview does not create escalation issues —
// confirm in heartbeatService.
router.post(
  "/instance/settings/experimental/issue-graph-liveness-auto-recovery/preview",
  validate(issueGraphLivenessAutoRecoveryRequestSchema),
  async (req, res) => {
    assertCanManageInstanceSettings(req);
    const { lookbackHours } = req.body;
    const preview = await heartbeat.buildIssueGraphLivenessAutoRecoveryPreview({ lookbackHours });
    res.json(preview);
  },
);
// POST .../issue-graph-liveness-auto-recovery/run
// The body is validated against issueGraphLivenessAutoRecoveryRequestSchema
// before the handler runs. The handler requires instance-settings management
// rights, triggers a liveness reconciliation with `force: true`, writes one
// activity entry per company id returned by the settings service, and finally
// responds with the reconciliation result.
// NOTE(review): `force: true` presumably bypasses the enablement/interval
// gating applied to scheduled runs — confirm in heartbeatService.
router.post(
  "/instance/settings/experimental/issue-graph-liveness-auto-recovery/run",
  validate(issueGraphLivenessAutoRecoveryRequestSchema),
  async (req, res) => {
    assertCanManageInstanceSettings(req);

    const { actorType, actorId, agentId, runId } = getActorInfo(req);
    const result = await heartbeat.reconcileIssueGraphLiveness({
      runId,
      force: true,
      lookbackHours: req.body.lookbackHours,
    });

    // The action is instance-wide, so the same summary is recorded against
    // every company id the settings service lists.
    const companyIds = await svc.listCompanyIds();
    const activityWrites = companyIds.map((companyId) =>
      logActivity(db, {
        companyId,
        actorType,
        actorId,
        agentId,
        runId,
        action: "instance.settings.issue_graph_liveness_auto_recovery_run",
        entityType: "instance_settings",
        entityId: "default",
        details: {
          lookbackHours: result.lookbackHours,
          escalationsCreated: result.escalationsCreated,
          existingEscalations: result.existingEscalations,
          skippedOutsideLookback: result.skippedOutsideLookback,
          escalationIssueIds: result.escalationIssueIds,
        },
      }),
    );
    // The response is sent only after every activity write has settled successfully.
    await Promise.all(activityWrites);

    res.json(result);
  },
);
return router;
}