[codex] Add configurable liveness auto-recovery controls (#4587)

## Thinking Path

> - Paperclip orchestrates AI agents for zero-human companies.
> - Heartbeat liveness recovery decides when stalled issue trees need
manager-visible follow-up.
> - Automatic recovery issue creation is useful, but operators need
instance-level controls over how aggressively it runs.
> - Without controls, recovery behavior is harder to tune for local
development, production operations, and noisy edge cases.
> - This pull request adds configurable liveness auto-recovery settings
across shared contracts, API routes, services, and the instance
experimental settings UI.
> - The benefit is that operators can keep liveness findings advisory or
enable bounded recovery automation with explicit intervals and lookback
windows.

## What Changed

- Added shared types and validators for liveness auto-recovery settings.
- Extended instance settings routes and services to persist and validate
the new controls.
- Wired heartbeat/recovery services to honor enablement, minimum
interval, and lookback settings.
- Added UI controls for liveness recovery under instance experimental
settings.
- Covered the new server behavior with instance settings and liveness
escalation tests.

## Verification

- `pnpm exec vitest run --project @paperclipai/server
server/src/__tests__/heartbeat-issue-liveness-escalation.test.ts
server/src/__tests__/instance-settings-routes.test.ts --pool=forks
--poolOptions.forks.isolate=true`
- `pnpm --filter @paperclipai/shared typecheck`
- `pnpm --filter @paperclipai/server typecheck`
- `pnpm --filter @paperclipai/ui typecheck`

## Risks

- Moderate behavioral risk because recovery automation timing changes
when enabled; defaults keep existing advisory behavior unless the
setting is turned on.
- No database migration in this PR; settings are stored through the
existing instance settings path.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and
discuss it in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected — check the roadmap
first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, `gpt-5`, coding model with tool use and local command
execution; context window not exposed by the runtime.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge

---------

Co-authored-by: Paperclip <noreply@paperclip.ing>
This commit is contained in:
Dotta 2026-04-27 08:46:44 -05:00 committed by GitHub
parent f0f9460d1d
commit fda296ee4f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 679 additions and 54 deletions

View file

@ -1,6 +1,7 @@
import type {
InstanceExperimentalSettings,
InstanceGeneralSettings,
IssueGraphLivenessAutoRecoveryPreview,
PatchInstanceGeneralSettings,
PatchInstanceExperimentalSettings,
} from "@paperclipai/shared";
@ -15,4 +16,25 @@ export const instanceSettingsApi = {
api.get<InstanceExperimentalSettings>("/instance/settings/experimental"),
updateExperimental: (patch: PatchInstanceExperimentalSettings) =>
api.patch<InstanceExperimentalSettings>("/instance/settings/experimental", patch),
previewIssueGraphLivenessAutoRecovery: (input: { lookbackHours?: number }) =>
api.post<IssueGraphLivenessAutoRecoveryPreview>(
"/instance/settings/experimental/issue-graph-liveness-auto-recovery/preview",
input,
),
runIssueGraphLivenessAutoRecovery: (input: { lookbackHours?: number }) =>
api.post<{
findings: number;
autoRecoveryEnabled: boolean;
lookbackHours: number;
cutoff: string;
escalationsCreated: number;
existingEscalations: number;
skipped: number;
skippedAutoRecoveryDisabled: number;
skippedOutsideLookback: number;
escalationIssueIds: string[];
}>(
"/instance/settings/experimental/issue-graph-liveness-auto-recovery/run",
input,
),
};

View file

@ -1,16 +1,130 @@
import { useEffect, useState } from "react";
import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
import { FlaskConical } from "lucide-react";
import type { PatchInstanceExperimentalSettings } from "@paperclipai/shared";
import { Clock, FlaskConical, Play, Search } from "lucide-react";
import type {
IssueGraphLivenessAutoRecoveryPreview,
PatchInstanceExperimentalSettings,
} from "@paperclipai/shared";
import { instanceSettingsApi } from "@/api/instanceSettings";
import { useBreadcrumbs } from "../context/BreadcrumbContext";
import { queryKeys } from "../lib/queryKeys";
import { ToggleSwitch } from "@/components/ui/toggle-switch";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import {
Dialog,
DialogContent,
DialogDescription,
DialogFooter,
DialogHeader,
DialogTitle,
} from "@/components/ui/dialog";
/**
 * Builds the in-app link for an issue.
 *
 * When no identifier is available the link falls back to the id-based
 * `/issues/<issueId>` route. Otherwise the text before the first "-" of the
 * identifier becomes the route prefix, defaulting to "PAP" when that text is
 * empty.
 *
 * @param identifier Human-readable issue identifier, or null when unknown.
 * @param issueId Issue id used for the fallback route.
 * @returns The relative href for the issue.
 */
function issueHref(identifier: string | null, issueId: string) {
  if (!identifier) {
    return `/issues/${issueId}`;
  }
  const dashAt = identifier.indexOf("-");
  const head = dashAt === -1 ? identifier : identifier.slice(0, dashAt);
  // An identifier that starts with "-" has an empty head; use the default prefix.
  const routePrefix = head === "" ? "PAP" : head;
  return `/${routePrefix}/issues/${identifier}`;
}
/**
 * Converts a snake_case recovery state into display text by replacing every
 * underscore with a single space.
 *
 * @param state Raw recovery state value.
 * @returns The state with all "_" characters replaced by " ".
 */
function formatRecoveryState(state: string) {
  return state.split("_").join(" ");
}
/**
 * Confirmation dialog shown before liveness auto-recovery is enabled.
 *
 * Renders the recovery candidates from `preview` (one card per item, linking
 * to the stalled issue and its recovery target), a note about findings that
 * fall outside the lookback window, and three actions: cancel, enable only,
 * and enable-and-run. While `preview` is null a placeholder description is
 * shown and both enable actions are disabled.
 */
function RecoveryPreviewDialog(props: {
  preview: IssueGraphLivenessAutoRecoveryPreview | null;
  open: boolean;
  onOpenChange: (open: boolean) => void;
  onEnableOnly: () => void;
  onEnableAndRun: () => void;
  isPending: boolean;
}) {
  const { preview, open, onOpenChange, onEnableOnly, onEnableAndRun, isPending } = props;

  const count = preview?.recoverableFindings ?? 0;
  const taskNoun = count === 1 ? "task" : "tasks";
  const summary = preview
    ? `${count} recovery ${taskNoun} match the last ${preview.lookbackHours} hours.`
    : "Checking recovery candidates before enabling.";

  // Enable actions stay locked until a preview has actually been loaded.
  const enableDisabled = isPending || !preview;
  const runLabel = count > 0 ? `Enable and create ${count}` : "Enable";

  const nothingToRecover = preview ? preview.items.length === 0 : false;
  const skippedOutside = preview ? preview.skippedOutsideLookback : 0;
  const skippedVerb = skippedOutside === 1 ? "finding is" : "findings are";

  const dismiss = () => onOpenChange(false);

  const candidateCards = preview?.items.map((candidate) => {
    const issueLabel = candidate.identifier ?? candidate.issueId;
    const recoveryLabel = candidate.recoveryIdentifier ?? candidate.recoveryIssueId;
    return (
      <div key={candidate.incidentKey} className="rounded-md border border-border bg-card px-3 py-3">
        <div className="flex flex-wrap items-center gap-2">
          <a
            href={issueHref(candidate.identifier, candidate.issueId)}
            className="text-sm font-medium text-primary underline-offset-2 hover:underline"
          >
            {issueLabel}
          </a>
          <span className="rounded-sm bg-muted px-1.5 py-0.5 text-xs text-muted-foreground">
            {formatRecoveryState(candidate.state)}
          </span>
        </div>
        <p className="mt-1 text-sm text-foreground">{candidate.title}</p>
        <p className="mt-1 text-xs text-muted-foreground">{candidate.reason}</p>
        <div className="mt-2 text-xs text-muted-foreground">
          Recovery target:{" "}
          <a
            href={issueHref(candidate.recoveryIdentifier, candidate.recoveryIssueId)}
            className="text-primary underline-offset-2 hover:underline"
          >
            {recoveryLabel}
          </a>
        </div>
      </div>
    );
  });

  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent className="sm:max-w-3xl">
        <DialogHeader>
          <DialogTitle>Confirm auto-recovery</DialogTitle>
          <DialogDescription>{summary}</DialogDescription>
        </DialogHeader>

        <div className="max-h-[min(28rem,65vh)] space-y-3 overflow-y-auto pr-1">
          {nothingToRecover ? (
            <div className="rounded-md border border-border bg-muted/30 px-3 py-4 text-sm text-muted-foreground">
              No recovery tasks would be created right now. Auto-recovery can still run for future liveness
              incidents in this window.
            </div>
          ) : null}
          {candidateCards}
        </div>

        {skippedOutside > 0 ? (
          <p className="text-xs text-muted-foreground">
            {skippedOutside} current{" "}
            {skippedVerb} outside the configured lookback and will not be touched.
          </p>
        ) : null}

        <DialogFooter>
          <Button variant="outline" onClick={dismiss} disabled={isPending}>
            Cancel
          </Button>
          <Button variant="outline" onClick={onEnableOnly} disabled={enableDisabled}>
            Enable only
          </Button>
          <Button onClick={onEnableAndRun} disabled={enableDisabled}>
            {runLabel}
          </Button>
        </DialogFooter>
      </DialogContent>
    </Dialog>
  );
}
export function InstanceExperimentalSettings() {
const { setBreadcrumbs } = useBreadcrumbs();
const queryClient = useQueryClient();
const [actionError, setActionError] = useState<string | null>(null);
const [lookbackHoursDraft, setLookbackHoursDraft] = useState("24");
const [previewDialogOpen, setPreviewDialogOpen] = useState(false);
const [pendingPreview, setPendingPreview] = useState<IssueGraphLivenessAutoRecoveryPreview | null>(null);
useEffect(() => {
setBreadcrumbs([
@ -39,6 +153,42 @@ export function InstanceExperimentalSettings() {
},
});
const previewMutation = useMutation({
mutationFn: async (lookbackHours: number) =>
instanceSettingsApi.previewIssueGraphLivenessAutoRecovery({ lookbackHours }),
onSuccess: (preview) => {
setActionError(null);
setPendingPreview(preview);
setPreviewDialogOpen(true);
},
onError: (error) => {
setActionError(error instanceof Error ? error.message : "Failed to preview recovery tasks.");
},
});
const runRecoveryMutation = useMutation({
mutationFn: async (lookbackHours: number) =>
instanceSettingsApi.runIssueGraphLivenessAutoRecovery({ lookbackHours }),
onSuccess: async () => {
setActionError(null);
setPreviewDialogOpen(false);
await Promise.all([
queryClient.invalidateQueries({ queryKey: queryKeys.instance.experimentalSettings }),
queryClient.invalidateQueries({ queryKey: queryKeys.health }),
]);
},
onError: (error) => {
setActionError(error instanceof Error ? error.message : "Failed to create recovery tasks.");
},
});
useEffect(() => {
const next = experimentalQuery.data?.issueGraphLivenessAutoRecoveryLookbackHours;
if (typeof next === "number") {
setLookbackHoursDraft(String(next));
}
}, [experimentalQuery.data?.issueGraphLivenessAutoRecoveryLookbackHours]);
if (experimentalQuery.isLoading) {
return <div className="text-sm text-muted-foreground">Loading experimental settings...</div>;
}
@ -58,6 +208,41 @@ export function InstanceExperimentalSettings() {
const autoRestartDevServerWhenIdle = experimentalQuery.data?.autoRestartDevServerWhenIdle === true;
const enableIssueGraphLivenessAutoRecovery =
experimentalQuery.data?.enableIssueGraphLivenessAutoRecovery === true;
const lookbackHours =
experimentalQuery.data?.issueGraphLivenessAutoRecoveryLookbackHours ?? 24;
const parsedLookbackHours = Number.parseInt(lookbackHoursDraft, 10);
const lookbackHoursIsValid =
Number.isInteger(parsedLookbackHours) && parsedLookbackHours >= 1 && parsedLookbackHours <= 720;
const recoveryActionPending =
toggleMutation.isPending || previewMutation.isPending || runRecoveryMutation.isPending;
function previewForEnable() {
if (!lookbackHoursIsValid) {
setActionError("Lookback hours must be a whole number from 1 to 720.");
return;
}
previewMutation.mutate(parsedLookbackHours);
}
function enableOnly() {
if (!lookbackHoursIsValid) return;
toggleMutation.mutate({
enableIssueGraphLivenessAutoRecovery: true,
issueGraphLivenessAutoRecoveryLookbackHours: parsedLookbackHours,
}, {
onSuccess: () => setPreviewDialogOpen(false),
});
}
function enableAndRun() {
if (!lookbackHoursIsValid) return;
toggleMutation.mutate({
enableIssueGraphLivenessAutoRecovery: true,
issueGraphLivenessAutoRecoveryLookbackHours: parsedLookbackHours,
}, {
onSuccess: () => runRecoveryMutation.mutate(parsedLookbackHours),
});
}
return (
<div className="max-w-4xl space-y-6">
@ -132,26 +317,99 @@ export function InstanceExperimentalSettings() {
</section>
<section className="rounded-xl border border-border bg-card p-5">
<div className="flex items-start justify-between gap-4">
<div className="space-y-1.5">
<h2 className="text-sm font-semibold">Auto-Create Issue Recovery Tasks</h2>
<p className="max-w-2xl text-sm text-muted-foreground">
Let the heartbeat scheduler create recovery issues for issue dependency chains that have been stalled for
at least 24 hours.
</p>
<div className="flex flex-col gap-5">
<div className="flex items-start justify-between gap-4">
<div className="space-y-1.5">
<h2 className="text-sm font-semibold">Auto-Create Issue Recovery Tasks</h2>
<p className="max-w-2xl text-sm text-muted-foreground">
Let the heartbeat scheduler create recovery issues for issue dependency chains found inside the
configured lookback window.
</p>
</div>
<ToggleSwitch
checked={enableIssueGraphLivenessAutoRecovery}
onCheckedChange={() => {
if (enableIssueGraphLivenessAutoRecovery) {
toggleMutation.mutate({ enableIssueGraphLivenessAutoRecovery: false });
return;
}
previewForEnable();
}}
disabled={recoveryActionPending}
aria-label="Toggle issue graph liveness auto-recovery"
/>
</div>
<ToggleSwitch
checked={enableIssueGraphLivenessAutoRecovery}
onCheckedChange={() =>
toggleMutation.mutate({
enableIssueGraphLivenessAutoRecovery: !enableIssueGraphLivenessAutoRecovery,
})
}
disabled={toggleMutation.isPending}
aria-label="Toggle issue graph liveness auto-recovery"
/>
<div className="grid gap-3 sm:grid-cols-[minmax(10rem,14rem)_1fr] sm:items-end">
<label className="space-y-1.5">
<span className="flex items-center gap-1.5 text-xs font-medium text-muted-foreground">
<Clock className="h-3.5 w-3.5" />
Lookback hours
</span>
<Input
type="number"
min={1}
max={720}
step={1}
value={lookbackHoursDraft}
onChange={(event) => setLookbackHoursDraft(event.target.value)}
aria-invalid={!lookbackHoursIsValid}
/>
</label>
<div className="flex flex-wrap gap-2">
<Button
variant="outline"
onClick={() => {
if (!lookbackHoursIsValid) {
setActionError("Lookback hours must be a whole number from 1 to 720.");
return;
}
toggleMutation.mutate({
issueGraphLivenessAutoRecoveryLookbackHours: parsedLookbackHours,
});
}}
disabled={recoveryActionPending || parsedLookbackHours === lookbackHours}
>
Save hours
</Button>
<Button
variant="outline"
onClick={previewForEnable}
disabled={recoveryActionPending}
>
<Search className="h-4 w-4" />
Preview
</Button>
<Button
onClick={() => {
if (!lookbackHoursIsValid) {
setActionError("Lookback hours must be a whole number from 1 to 720.");
return;
}
runRecoveryMutation.mutate(parsedLookbackHours);
}}
disabled={recoveryActionPending || !enableIssueGraphLivenessAutoRecovery}
>
<Play className="h-4 w-4" />
Run now
</Button>
</div>
</div>
<p className="text-xs text-muted-foreground">
Current window: last {lookbackHours} {lookbackHours === 1 ? "hour" : "hours"}.
</p>
</div>
</section>
<RecoveryPreviewDialog
open={previewDialogOpen}
onOpenChange={setPreviewDialogOpen}
preview={pendingPreview}
onEnableOnly={enableOnly}
onEnableAndRun={enableAndRun}
isPending={recoveryActionPending}
/>
</div>
);
}