[codex] Harden heartbeat scheduling and runtime controls (#4223)

## Thinking Path

> - Paperclip orchestrates AI agents through issue checkout, heartbeat runs, routines, and auditable control-plane state
> - The runtime path has to recover from lost local processes, transient adapter failures, blocked dependencies, and routine coalescing without stranding work
> - The existing branch carried several reliability fixes across heartbeat scheduling, issue runtime controls, routine dispatch, and operator-facing run state
> - These changes belong together because they share backend contracts, migrations, and runtime status semantics
> - This pull request groups the control-plane/runtime slice so it can merge independently from board UI polish and adapter sandbox work
> - The benefit is safer heartbeat recovery, clearer runtime controls, and more predictable recurring execution behavior

## What Changed

- Adds bounded heartbeat retry scheduling, scheduled retry state, and Codex transient failure recovery handling.
- Tightens heartbeat process recovery, blocker wake behavior, issue comment wake handling, routine dispatch coalescing, and activity/dashboard bounds.
- Adds runtime-control MCP tools and Paperclip skill docs for issue workspace runtime management.
- Adds migrations `0061_lively_thor_girl.sql` and `0062_routine_run_dispatch_fingerprint.sql`.
- Surfaces retry state in run ledger/agent UI and keeps related shared types synchronized.

## Verification

- `pnpm exec vitest run server/src/__tests__/heartbeat-retry-scheduling.test.ts server/src/__tests__/heartbeat-process-recovery.test.ts server/src/__tests__/routines-service.test.ts`
- `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server`

## Risks

- Medium risk: this touches heartbeat recovery and routine dispatch, which are central execution paths.
- Migration order matters if split branches land out of order: merge this PR before branches that assume the new runtime/routine fields.
- Runtime retry behavior should be watched in CI and in local operator smoke tests because it changes how transient failures are resumed.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use enabled. Exact hosted model build and context window are not exposed in this Paperclip heartbeat environment.

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge
2026-06-16 02:40:39 +09:00 · 2026-04-21 12:24:11 -05:00 · 2026-04-21 12:24:11 -05:00 · 09d0678840
commit 09d0678840
parent ab9051b595
61 changed files with 17622 additions and 456 deletions
--- a/ui/src/components/ActivityCharts.test.tsx
+++ b/ui/src/components/ActivityCharts.test.tsx
@ -65,6 +65,9 @@ function createRun(overrides: Partial<HeartbeatRun> = {}): HeartbeatRun {
    processStartedAt: null,
    retryOfRunId: null,
    processLossRetryCount: 0,
+    scheduledRetryAt: null,
+    scheduledRetryAttempt: 0,
+    scheduledRetryReason: null,
    livenessState: null,
    livenessReason: null,
    continuationAttempt: 0,
--- a/ui/src/components/IssueRunLedger.test.tsx
+++ b/ui/src/components/IssueRunLedger.test.tsx
@ -192,6 +192,40 @@ describe("IssueRunLedger", () => {
    expect(container.textContent).not.toContain("initial attempt");
  });

+  it("surfaces scheduled retry timing and exhaustion state without opening logs", () => {
+    renderLedger({
+      runs: [
+        createRun({
+          runId: "run-scheduled",
+          status: "scheduled_retry",
+          finishedAt: null,
+          livenessState: null,
+          livenessReason: null,
+          retryOfRunId: "run-root",
+          scheduledRetryAt: "2026-04-18T20:15:00.000Z",
+          scheduledRetryAttempt: 2,
+          scheduledRetryReason: "transient_failure",
+        }),
+        createRun({
+          runId: "run-exhausted",
+          status: "failed",
+          createdAt: "2026-04-18T19:57:00.000Z",
+          retryOfRunId: "run-root",
+          scheduledRetryAttempt: 4,
+          scheduledRetryReason: "transient_failure",
+          retryExhaustedReason: "Bounded retry exhausted after 4 scheduled attempts; no further automatic retry will be queued",
+        }),
+      ],
+    });
+
+    expect(container.textContent).toContain("Retry scheduled");
+    expect(container.textContent).toContain("Attempt 2");
+    expect(container.textContent).toContain("Transient failure");
+    expect(container.textContent).toContain("Next retry");
+    expect(container.textContent).toContain("Retry exhausted");
+    expect(container.textContent).toContain("No further automatic retry queued");
+  });
+
  it("shows timeout, cancel, and budget stop reasons without raw logs", () => {
    renderLedger({
      runs: [
--- a/ui/src/components/IssueRunLedger.tsx
+++ b/ui/src/components/IssueRunLedger.tsx
@ -7,6 +7,7 @@ import { heartbeatsApi, type ActiveRunForIssue, type LiveRunForIssue } from "../
 import { cn, relativeTime } from "../lib/utils";
 import { queryKeys } from "../lib/queryKeys";
 import { keepPreviousDataForSameQueryTail } from "../lib/query-placeholder-data";
+import { describeRunRetryState } from "../lib/runRetryState";

 type IssueRunLedgerProps = {
  issueId: string;
@ -80,6 +81,12 @@ const PENDING_LIVENESS_COPY: LivenessCopy = {
  description: "Liveness is evaluated after the run finishes.",
 };

+const RETRY_PENDING_LIVENESS_COPY: LivenessCopy = {
+  label: "Retry pending",
+  tone: "border-cyan-500/30 bg-cyan-500/10 text-cyan-700 dark:text-cyan-300",
+  description: "Paperclip queued an automatic retry that has not started yet.",
+};
+
 const MISSING_LIVENESS_COPY: LivenessCopy = {
  label: "No liveness data",
  tone: "border-border bg-background text-muted-foreground",
@ -174,10 +181,12 @@ function runSummary(run: LedgerRun, agentMap: ReadonlyMap<string, Pick<Agent, "n
  const agentName = compactAgentName(run, agentMap);
  if (run.status === "running") return `Running now by ${agentName}`;
  if (run.status === "queued") return `Queued for ${agentName}`;
+  if (run.status === "scheduled_retry") return `Automatic retry scheduled for ${agentName}`;
  return `${statusLabel(run.status)} by ${agentName}`;
 }

 function livenessCopyForRun(run: LedgerRun) {
+  if (run.status === "scheduled_retry") return RETRY_PENDING_LIVENESS_COPY;
  if (run.livenessState) return LIVENESS_COPY[run.livenessState];
  return isActiveRun(run) ? PENDING_LIVENESS_COPY : MISSING_LIVENESS_COPY;
 }
@ -204,6 +213,7 @@ function stopReasonLabel(run: RunForIssue) {

 function stopStatusLabel(run: LedgerRun, stopReason: string | null) {
  if (stopReason) return stopReason;
+  if (run.status === "scheduled_retry") return "Retry pending";
  if (run.status === "queued") return "Waiting to start";
  if (run.status === "running") return "Still running";
  if (!run.livenessState) return "Unavailable";
@ -211,6 +221,7 @@ function stopStatusLabel(run: LedgerRun, stopReason: string | null) {
 }

 function lastUsefulActionLabel(run: LedgerRun) {
+  if (run.status === "scheduled_retry") return "Waiting for next attempt";
  if (run.lastUsefulActionAt) return relativeTime(run.lastUsefulActionAt);
  if (isActiveRun(run)) return "No action recorded yet";
  if (run.livenessState === "plan_only" || run.livenessState === "needs_followup") {
@ -251,7 +262,7 @@ export function IssueRunLedger({
  const { data: runs } = useQuery({
    queryKey: queryKeys.issues.runs(issueId),
    queryFn: () => activityApi.runsForIssue(issueId),
-    refetchInterval: hasLiveRuns ? 5000 : false,
+    refetchInterval: hasLiveRuns || issueStatus === "in_progress" ? 5000 : false,
    placeholderData: keepPreviousDataForSameQueryTail<RunForIssue[]>(issueId),
  });
  const { data: liveRuns } = useQuery({
@ -361,6 +372,7 @@ export function IssueRunLedgerContent({
            const duration = formatDuration(run.startedAt, run.finishedAt);
            const exhausted = hasExhaustedContinuation(run);
            const continuation = continuationLabel(run);
+            const retryState = describeRunRetryState(run);
            return (
              <article key={run.runId} className="space-y-2 px-3 py-3">
                <div className="flex flex-wrap items-center gap-2">
@ -396,6 +408,16 @@ export function IssueRunLedgerContent({
                  {continuation ? (
                    <span className="text-[11px] text-muted-foreground">{continuation}</span>
                  ) : null}
+                  {retryState ? (
+                    <span
+                      className={cn(
+                        "rounded-md border px-1.5 py-0.5 text-[11px] font-medium",
+                        retryState.tone,
+                      )}
+                    >
+                      {retryState.badgeLabel}
+                    </span>
+                  ) : null}
                </div>

                <div className="grid gap-2 text-xs text-muted-foreground sm:grid-cols-3">
@ -413,6 +435,24 @@ export function IssueRunLedgerContent({
                  </div>
                </div>

+                {retryState ? (
+                  <div className="rounded-md border border-border/70 bg-accent/20 px-2 py-2 text-xs leading-5 text-muted-foreground">
+                    {retryState.detail ? <p>{retryState.detail}</p> : null}
+                    {retryState.secondary ? <p>{retryState.secondary}</p> : null}
+                    {retryState.retryOfRunId ? (
+                      <p>
+                        Retry of{" "}
+                        <Link
+                          to={`/agents/${run.agentId}/runs/${retryState.retryOfRunId}`}
+                          className="font-mono text-foreground hover:underline"
+                        >
+                          {retryState.retryOfRunId.slice(0, 8)}
+                        </Link>
+                      </p>
+                    ) : null}
+                  </div>
+                ) : null}
+
                {run.livenessReason ? (
                  <p className="min-w-0 break-words text-xs leading-5 text-muted-foreground">
                    {run.livenessReason}
--- a/ui/src/components/StatusBadge.tsx
+++ b/ui/src/components/StatusBadge.tsx
@ -9,7 +9,7 @@ export function StatusBadge({ status }: { status: string }) {
        statusBadge[status] ?? statusBadgeDefault
      )}
    >
-      {status.replace("_", " ")}
+      {status.replace(/_/g, " ")}
    </span>
  );
 }