[codex] Harden heartbeat scheduling and runtime controls (#4223)

## Thinking Path

> - Paperclip orchestrates AI agents through issue checkout, heartbeat runs, routines, and auditable control-plane state
> - The runtime path has to recover from lost local processes, transient adapter failures, blocked dependencies, and routine coalescing without stranding work
> - The existing branch carried several reliability fixes across heartbeat scheduling, issue runtime controls, routine dispatch, and operator-facing run state
> - These changes belong together because they share backend contracts, migrations, and runtime status semantics
> - This pull request groups the control-plane/runtime slice so it can merge independently from board UI polish and adapter sandbox work
> - The benefit is safer heartbeat recovery, clearer runtime controls, and more predictable recurring execution behavior

## What Changed

- Adds bounded heartbeat retry scheduling, scheduled retry state, and Codex transient failure recovery handling.
- Tightens heartbeat process recovery, blocker wake behavior, issue comment wake handling, routine dispatch coalescing, and activity/dashboard bounds.
- Adds runtime-control MCP tools and Paperclip skill docs for issue workspace runtime management.
- Adds migrations `0061_lively_thor_girl.sql` and `0062_routine_run_dispatch_fingerprint.sql`.
- Surfaces retry state in run ledger/agent UI and keeps related shared types synchronized.

## Verification

- `pnpm exec vitest run server/src/__tests__/heartbeat-retry-scheduling.test.ts server/src/__tests__/heartbeat-process-recovery.test.ts server/src/__tests__/routines-service.test.ts`
- `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server`

## Risks

- Medium risk: this touches heartbeat recovery and routine dispatch, which are central execution paths.
- Migration order matters if split branches land out of order: merge this PR before branches that assume the new runtime/routine fields.
- Runtime retry behavior should be watched in CI and in local operator smoke tests because it changes how transient failures are resumed.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use enabled. Exact hosted model build and context window are not exposed in this Paperclip heartbeat environment.

## Checklist

- [x] I have included a thinking path that traces from project context to this change
- [x] I have specified the model used (with version and capability details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before requesting merge
2026-06-14 01:50:39 +09:00 · 2026-04-21 12:24:11 -05:00 · 2026-04-21 12:24:11 -05:00 · 09d0678840
commit 09d0678840
parent ab9051b595
61 changed files with 17622 additions and 456 deletions
--- a/packages/adapter-utils/src/server-utils.test.ts
+++ b/packages/adapter-utils/src/server-utils.test.ts
@ -221,16 +221,6 @@ describe("runChildProcess", () => {
  });
 });

-describe("appendWithByteCap", () => {
-  it("keeps valid UTF-8 when trimming through multibyte text", () => {
-    const output = appendWithByteCap("prefix ", "hello — world", 7);
-
-    expect(output).not.toContain("\uFFFD");
-    expect(Buffer.from(output, "utf8").toString("utf8")).toBe(output);
-    expect(Buffer.byteLength(output, "utf8")).toBeLessThanOrEqual(7);
-  });
-});
-
 describe("renderPaperclipWakePrompt", () => {
  it("keeps the default local-agent prompt action-oriented", () => {
    expect(DEFAULT_PAPERCLIP_AGENT_PROMPT_TEMPLATE).toContain("Start actionable work in this heartbeat");
@ -266,6 +256,42 @@ describe("renderPaperclipWakePrompt", () => {
    expect(prompt).toContain("mark blocked work with the unblock owner/action");
  });

+  it("renders dependency-blocked interaction guidance", () => {
+    const prompt = renderPaperclipWakePrompt({
+      reason: "issue_commented",
+      issue: {
+        id: "issue-1",
+        identifier: "PAP-1703",
+        title: "Blocked parent",
+        status: "todo",
+      },
+      dependencyBlockedInteraction: true,
+      unresolvedBlockerIssueIds: ["blocker-1"],
+      unresolvedBlockerSummaries: [
+        {
+          id: "blocker-1",
+          identifier: "PAP-1723",
+          title: "Finish blocker",
+          status: "todo",
+          priority: "medium",
+        },
+      ],
+      commentWindow: {
+        requestedCount: 1,
+        includedCount: 1,
+        missingCount: 0,
+      },
+      commentIds: ["comment-1"],
+      latestCommentId: "comment-1",
+      comments: [{ id: "comment-1", body: "hello" }],
+      fallbackFetchNeeded: false,
+    });
+
+    expect(prompt).toContain("dependency-blocked interaction: yes");
+    expect(prompt).toContain("respond or triage the human comment");
+    expect(prompt).toContain("PAP-1723 Finish blocker (todo)");
+  });
+
  it("includes continuation and child issue summaries in structured wake context", () => {
    const payload = {
      reason: "issue_children_completed",
@ -335,3 +361,13 @@ describe("renderPaperclipWakePrompt", () => {
    expect(prompt).toContain("Added the helper route and tests.");
  });
 });
+
+describe("appendWithByteCap", () => {
+  it("keeps valid UTF-8 when trimming through multibyte text", () => {
+    const output = appendWithByteCap("prefix ", "hello — world", 7);
+
+    expect(output).not.toContain("\uFFFD");
+    expect(Buffer.from(output, "utf8").toString("utf8")).toBe(output);
+    expect(Buffer.byteLength(output, "utf8")).toBeLessThanOrEqual(7);
+  });
+});
--- a/packages/adapter-utils/src/server-utils.ts
+++ b/packages/adapter-utils/src/server-utils.ts
@ -83,6 +83,7 @@ export const DEFAULT_PAPERCLIP_AGENT_PROMPT_TEMPLATE = [
  "- Start actionable work in this heartbeat; do not stop at a plan unless the issue asks for planning.",
  "- Leave durable progress in comments, documents, or work products with a clear next action.",
  "- Use child issues for parallel or long delegated work instead of polling agents, sessions, or processes.",
+  "- If woken by a human comment on a dependency-blocked issue, respond or triage the comment without treating the blocked deliverable work as unblocked.",
  "- If blocked, mark the issue blocked and name the unblock owner and action.",
  "- Respect budget, pause/cancel, approval gates, and company boundaries.",
 ].join("\n");
@ -313,10 +314,21 @@ type PaperclipWakeChildIssueSummary = {
  summary: string | null;
 };

+type PaperclipWakeBlockerSummary = {
+  id: string | null;
+  identifier: string | null;
+  title: string | null;
+  status: string | null;
+  priority: string | null;
+};
+
 type PaperclipWakePayload = {
  reason: string | null;
  issue: PaperclipWakeIssue | null;
  checkedOutByHarness: boolean;
+  dependencyBlockedInteraction: boolean;
+  unresolvedBlockerIssueIds: string[];
+  unresolvedBlockerSummaries: PaperclipWakeBlockerSummary[];
  executionStage: PaperclipWakeExecutionStage | null;
  continuationSummary: PaperclipWakeContinuationSummary | null;
  livenessContinuation: PaperclipWakeLivenessContinuation | null;
@ -409,6 +421,17 @@ function normalizePaperclipWakeChildIssueSummary(value: unknown): PaperclipWakeC
  return { id, identifier, title, status, priority, summary };
 }

+function normalizePaperclipWakeBlockerSummary(value: unknown): PaperclipWakeBlockerSummary | null {
+  const blocker = parseObject(value);
+  const id = asString(blocker.id, "").trim() || null;
+  const identifier = asString(blocker.identifier, "").trim() || null;
+  const title = asString(blocker.title, "").trim() || null;
+  const status = asString(blocker.status, "").trim() || null;
+  const priority = asString(blocker.priority, "").trim() || null;
+  if (!id && !identifier && !title && !status) return null;
+  return { id, identifier, title, status, priority };
+}
+
 function normalizePaperclipWakeExecutionPrincipal(value: unknown): PaperclipWakeExecutionPrincipal | null {
  const principal = parseObject(value);
  const typeRaw = asString(principal.type, "").trim().toLowerCase();
@ -474,8 +497,18 @@ export function normalizePaperclipWakePayload(value: unknown): PaperclipWakePayl
        .map((entry) => normalizePaperclipWakeChildIssueSummary(entry))
        .filter((entry): entry is PaperclipWakeChildIssueSummary => Boolean(entry))
    : [];
+  const unresolvedBlockerIssueIds = Array.isArray(payload.unresolvedBlockerIssueIds)
+    ? payload.unresolvedBlockerIssueIds
+        .map((entry) => asString(entry, "").trim())
+        .filter(Boolean)
+    : [];
+  const unresolvedBlockerSummaries = Array.isArray(payload.unresolvedBlockerSummaries)
+    ? payload.unresolvedBlockerSummaries
+        .map((entry) => normalizePaperclipWakeBlockerSummary(entry))
+        .filter((entry): entry is PaperclipWakeBlockerSummary => Boolean(entry))
+    : [];

-  if (comments.length === 0 && commentIds.length === 0 && childIssueSummaries.length === 0 && !executionStage && !continuationSummary && !livenessContinuation && !normalizePaperclipWakeIssue(payload.issue)) {
+  if (comments.length === 0 && commentIds.length === 0 && childIssueSummaries.length === 0 && unresolvedBlockerIssueIds.length === 0 && unresolvedBlockerSummaries.length === 0 && !executionStage && !continuationSummary && !livenessContinuation && !normalizePaperclipWakeIssue(payload.issue)) {
    return null;
  }

@ -483,6 +516,9 @@ export function normalizePaperclipWakePayload(value: unknown): PaperclipWakePayl
    reason: asString(payload.reason, "").trim() || null,
    issue: normalizePaperclipWakeIssue(payload.issue),
    checkedOutByHarness: asBoolean(payload.checkedOutByHarness, false),
+    dependencyBlockedInteraction: asBoolean(payload.dependencyBlockedInteraction, false),
+    unresolvedBlockerIssueIds,
+    unresolvedBlockerSummaries,
    executionStage,
    continuationSummary,
    livenessContinuation,
@ -563,6 +599,18 @@ export function renderPaperclipWakePrompt(
  if (normalized.checkedOutByHarness) {
    lines.push("- checkout: already claimed by the harness for this run");
  }
+  if (normalized.dependencyBlockedInteraction) {
+    lines.push("- dependency-blocked interaction: yes");
+    lines.push("- execution scope: respond or triage the human comment; do not treat blocker-dependent deliverable work as unblocked");
+    if (normalized.unresolvedBlockerSummaries.length > 0) {
+      const blockers = normalized.unresolvedBlockerSummaries
+        .map((blocker) => `${blocker.identifier ?? blocker.id ?? "unknown"}${blocker.title ? ` ${blocker.title}` : ""}${blocker.status ? ` (${blocker.status})` : ""}`)
+        .join("; ");
+      lines.push(`- unresolved blockers: ${blockers}`);
+    } else if (normalized.unresolvedBlockerIssueIds.length > 0) {
+      lines.push(`- unresolved blocker issue ids: ${normalized.unresolvedBlockerIssueIds.join(", ")}`);
+    }
+  }
  if (normalized.missingCount > 0) {
    lines.push(`- omitted comments: ${normalized.missingCount}`);
  }
--- a/packages/adapters/codex-local/src/server/execute.ts
+++ b/packages/adapters/codex-local/src/server/execute.ts
@ -22,7 +22,11 @@ import {
  joinPromptSections,
  runChildProcess,
 } from "@paperclipai/adapter-utils/server-utils";
-import { parseCodexJsonl, isCodexUnknownSessionError } from "./parse.js";
+import {
+  parseCodexJsonl,
+  isCodexTransientUpstreamError,
+  isCodexUnknownSessionError,
+} from "./parse.js";
 import { pathExists, prepareManagedCodexHome, resolveManagedCodexHomeDir, resolveSharedCodexHomeDir } from "./codex-home.js";
 import { resolveCodexDesiredSkillNames } from "./skills.js";
 import { buildCodexExecArgs } from "./codex-args.js";
@ -149,6 +153,52 @@ type EnsureCodexSkillsInjectedOptions = {
  linkSkill?: (source: string, target: string) => Promise<void>;
 };

+type CodexTransientFallbackMode =
+  | "same_session"
+  | "safer_invocation"
+  | "fresh_session"
+  | "fresh_session_safer_invocation";
+
+function readCodexTransientFallbackMode(context: Record<string, unknown>): CodexTransientFallbackMode | null {
+  const value = asString(context.codexTransientFallbackMode, "").trim();
+  switch (value) {
+    case "same_session":
+    case "safer_invocation":
+    case "fresh_session":
+    case "fresh_session_safer_invocation":
+      return value;
+    default:
+      return null;
+  }
+}
+
+function fallbackModeUsesSaferInvocation(mode: CodexTransientFallbackMode | null): boolean {
+  return mode === "safer_invocation" || mode === "fresh_session_safer_invocation";
+}
+
+function fallbackModeUsesFreshSession(mode: CodexTransientFallbackMode | null): boolean {
+  return mode === "fresh_session" || mode === "fresh_session_safer_invocation";
+}
+
+function buildCodexTransientHandoffNote(input: {
+  previousSessionId: string | null;
+  fallbackMode: CodexTransientFallbackMode;
+  continuationSummaryBody: string | null;
+}): string {
+  return [
+    "Paperclip session handoff:",
+    input.previousSessionId ? `- Previous session: ${input.previousSessionId}` : "",
+    "- Rotation reason: repeated Codex transient remote-compaction failures",
+    `- Fallback mode: ${input.fallbackMode}`,
+    input.continuationSummaryBody
+      ? `- Issue continuation summary: ${input.continuationSummaryBody.slice(0, 1_500)}`
+      : "",
+    "Continue from the current task state. Rebuild only the minimum context you need.",
+  ]
+    .filter(Boolean)
+    .join("\n");
+}
+
 export async function ensureCodexSkillsInjected(
  onLog: AdapterExecutionContext["onLog"],
  options: EnsureCodexSkillsInjectedOptions = {},
@ -397,7 +447,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
  const canResumeSession =
    runtimeSessionId.length > 0 &&
    (runtimeSessionCwd.length === 0 || path.resolve(runtimeSessionCwd) === path.resolve(cwd));
-  const sessionId = canResumeSession ? runtimeSessionId : null;
+  const codexTransientFallbackMode = readCodexTransientFallbackMode(context);
+  const forceSaferInvocation = fallbackModeUsesSaferInvocation(codexTransientFallbackMode);
+  const forceFreshSession = fallbackModeUsesFreshSession(codexTransientFallbackMode);
+  const sessionId = canResumeSession && !forceFreshSession ? runtimeSessionId : null;
  if (runtimeSessionId && !canResumeSession) {
    await onLog(
      "stdout",
@ -444,28 +497,66 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
  const shouldUseResumeDeltaPrompt = Boolean(sessionId) && wakePrompt.length > 0;
  const promptInstructionsPrefix = shouldUseResumeDeltaPrompt ? "" : instructionsPrefix;
  instructionsChars = promptInstructionsPrefix.length;
+  const continuationSummary = parseObject(context.paperclipContinuationSummary);
+  const continuationSummaryBody = asString(continuationSummary.body, "").trim() || null;
+  const codexFallbackHandoffNote =
+    forceFreshSession
+      ? buildCodexTransientHandoffNote({
+          previousSessionId: runtimeSessionId || runtime.sessionId || null,
+          fallbackMode: codexTransientFallbackMode ?? "fresh_session",
+          continuationSummaryBody,
+        })
+      : "";
  const commandNotes = (() => {
    if (!instructionsFilePath) {
-      return [repoAgentsNote];
+      const notes = [repoAgentsNote];
+      if (forceSaferInvocation) {
+        notes.push("Codex transient fallback requested safer invocation settings for this retry.");
+      }
+      if (forceFreshSession) {
+        notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
+      }
+      return notes;
    }
    if (instructionsPrefix.length > 0) {
      if (shouldUseResumeDeltaPrompt) {
-        return [
+        const notes = [
          `Loaded agent instructions from ${instructionsFilePath}`,
          "Skipped stdin instruction reinjection because an existing Codex session is being resumed with a wake delta.",
          repoAgentsNote,
        ];
+        if (forceSaferInvocation) {
+          notes.push("Codex transient fallback requested safer invocation settings for this retry.");
+        }
+        if (forceFreshSession) {
+          notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
+        }
+        return notes;
      }
-      return [
+      const notes = [
        `Loaded agent instructions from ${instructionsFilePath}`,
        `Prepended instructions + path directive to stdin prompt (relative references from ${instructionsDir}).`,
        repoAgentsNote,
      ];
+      if (forceSaferInvocation) {
+        notes.push("Codex transient fallback requested safer invocation settings for this retry.");
+      }
+      if (forceFreshSession) {
+        notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
+      }
+      return notes;
    }
-    return [
+    const notes = [
      `Configured instructionsFilePath ${instructionsFilePath}, but file could not be read; continuing without injected instructions.`,
      repoAgentsNote,
    ];
+    if (forceSaferInvocation) {
+      notes.push("Codex transient fallback requested safer invocation settings for this retry.");
+    }
+    if (forceFreshSession) {
+      notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
+    }
+    return notes;
  })();
  const renderedPrompt = shouldUseResumeDeltaPrompt ? "" : renderTemplate(promptTemplate, templateData);
  const sessionHandoffNote = asString(context.paperclipSessionHandoffMarkdown, "").trim();
@ -473,6 +564,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
    promptInstructionsPrefix,
    renderedBootstrapPrompt,
    wakePrompt,
+    codexFallbackHandoffNote,
    sessionHandoffNote,
    renderedPrompt,
  ]);
@ -486,7 +578,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
  };

  const runAttempt = async (resumeSessionId: string | null) => {
-    const execArgs = buildCodexExecArgs(config, { resumeSessionId });
+    const execArgs = buildCodexExecArgs(
+      forceSaferInvocation ? { ...config, fastMode: false } : config,
+      { resumeSessionId },
+    );
    const args = execArgs.args;
    const commandNotesWithFastMode =
      execArgs.fastModeIgnoredReason == null
@ -540,6 +635,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
  const toResult = (
    attempt: { proc: { exitCode: number | null; signal: string | null; timedOut: boolean; stdout: string; stderr: string }; rawStderr: string; parsed: ReturnType<typeof parseCodexJsonl> },
    clearSessionOnMissingSession = false,
+    isRetry = false,
  ): AdapterExecutionResult => {
    if (attempt.proc.timedOut) {
      return {
@ -551,7 +647,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
      };
    }

-    const resolvedSessionId = attempt.parsed.sessionId ?? runtimeSessionId ?? runtime.sessionId ?? null;
+    const canFallbackToRuntimeSession = !isRetry && !forceFreshSession;
+    const resolvedSessionId =
+      attempt.parsed.sessionId ??
+      (canFallbackToRuntimeSession ? (runtimeSessionId ?? runtime.sessionId ?? null) : null);
    const resolvedSessionParams = resolvedSessionId
      ? ({
        sessionId: resolvedSessionId,
@ -576,6 +675,15 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
        (attempt.proc.exitCode ?? 0) === 0
          ? null
          : fallbackErrorMessage,
+      errorCode:
+        (attempt.proc.exitCode ?? 0) !== 0 &&
+        isCodexTransientUpstreamError({
+          stdout: attempt.proc.stdout,
+          stderr: attempt.proc.stderr,
+          errorMessage: fallbackErrorMessage,
+        })
+          ? "codex_transient_upstream"
+          : null,
      usage: attempt.parsed.usage,
      sessionId: resolvedSessionId,
      sessionParams: resolvedSessionParams,
@ -590,7 +698,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
        stderr: attempt.proc.stderr,
      },
      summary: attempt.parsed.summary,
-      clearSession: Boolean(clearSessionOnMissingSession && !resolvedSessionId),
+      clearSession: Boolean((clearSessionOnMissingSession || forceFreshSession) && !resolvedSessionId),
    };
  };

@ -606,8 +714,8 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
      `[paperclip] Codex resume session "${sessionId}" is unavailable; retrying with a fresh session.\n`,
    );
    const retry = await runAttempt(null);
-    return toResult(retry, true);
+    return toResult(retry, true, true);
  }

-  return toResult(initial);
+  return toResult(initial, false, false);
 }
--- a/packages/adapters/codex-local/src/server/index.ts
+++ b/packages/adapters/codex-local/src/server/index.ts
@ -1,7 +1,7 @@
 export { execute, ensureCodexSkillsInjected } from "./execute.js";
 export { listCodexSkills, syncCodexSkills } from "./skills.js";
 export { testEnvironment } from "./test.js";
-export { parseCodexJsonl, isCodexUnknownSessionError } from "./parse.js";
+export { parseCodexJsonl, isCodexTransientUpstreamError, isCodexUnknownSessionError } from "./parse.js";
 export {
  getQuotaWindows,
  readCodexAuthInfo,
--- a/packages/adapters/codex-local/src/server/parse.test.ts
+++ b/packages/adapters/codex-local/src/server/parse.test.ts
@ -1,5 +1,9 @@
 import { describe, expect, it } from "vitest";
-import { isCodexUnknownSessionError, parseCodexJsonl } from "./parse.js";
+import {
+  isCodexTransientUpstreamError,
+  isCodexUnknownSessionError,
+  parseCodexJsonl,
+} from "./parse.js";

 describe("parseCodexJsonl", () => {
  it("captures session id, assistant summary, usage, and error message", () => {
@ -81,3 +85,36 @@ describe("isCodexUnknownSessionError", () => {
    expect(isCodexUnknownSessionError("", "model overloaded")).toBe(false);
  });
 });
+
+describe("isCodexTransientUpstreamError", () => {
+  it("classifies the remote-compaction high-demand failure as transient upstream", () => {
+    expect(
+      isCodexTransientUpstreamError({
+        errorMessage:
+          "Error running remote compact task: We're currently experiencing high demand, which may cause temporary errors.",
+      }),
+    ).toBe(true);
+    expect(
+      isCodexTransientUpstreamError({
+        stderr: "We're currently experiencing high demand, which may cause temporary errors.",
+      }),
+    ).toBe(true);
+  });
+
+  it("does not classify deterministic compaction errors as transient", () => {
+    expect(
+      isCodexTransientUpstreamError({
+        errorMessage: [
+          "Error running remote compact task: {",
+          '  "error": {',
+          '    "message": "Unknown parameter: \'prompt_cache_retention\'.",',
+          '    "type": "invalid_request_error",',
+          '    "param": "prompt_cache_retention",',
+          '    "code": "unknown_parameter"',
+          "  }",
+          "}",
+        ].join("\n"),
+      }),
+    ).toBe(false);
+  });
+});
--- a/packages/adapters/codex-local/src/server/parse.ts
+++ b/packages/adapters/codex-local/src/server/parse.ts
@ -1,5 +1,9 @@
 import { asString, asNumber, parseObject, parseJson } from "@paperclipai/adapter-utils/server-utils";

+const CODEX_TRANSIENT_UPSTREAM_RE =
+  /(?:we(?:'|’)re\s+currently\s+experiencing\s+high\s+demand|temporary\s+errors|rate[-\s]?limit(?:ed)?|too\s+many\s+requests|\b429\b|server\s+overloaded|service\s+unavailable|try\s+again\s+later)/i;
+const CODEX_REMOTE_COMPACTION_RE = /remote\s+compact\s+task/i;
+
 export function parseCodexJsonl(stdout: string) {
  let sessionId: string | null = null;
  let finalMessage: string | null = null;
@ -71,3 +75,25 @@ export function isCodexUnknownSessionError(stdout: string, stderr: string): bool
    haystack,
  );
 }
+
+export function isCodexTransientUpstreamError(input: {
+  stdout?: string | null;
+  stderr?: string | null;
+  errorMessage?: string | null;
+}): boolean {
+  const haystack = [
+    input.errorMessage ?? "",
+    input.stdout ?? "",
+    input.stderr ?? "",
+  ]
+    .join("\n")
+    .split(/\r?\n/)
+    .map((line) => line.trim())
+    .filter(Boolean)
+    .join("\n");
+
+  if (!CODEX_TRANSIENT_UPSTREAM_RE.test(haystack)) return false;
+  // Keep automatic retries scoped to the observed remote-compaction/high-demand
+  // failure shape; broader 429s may be caused by user or account limits.
+  return CODEX_REMOTE_COMPACTION_RE.test(haystack) || /high\s+demand|temporary\s+errors/i.test(haystack);
+}
--- a/packages/db/src/migrations/0061_lively_thor_girl.sql
+++ b/packages/db/src/migrations/0061_lively_thor_girl.sql
@ -0,0 +1,3 @@
+ALTER TABLE "heartbeat_runs" ADD COLUMN IF NOT EXISTS "scheduled_retry_at" timestamp with time zone;--> statement-breakpoint
+ALTER TABLE "heartbeat_runs" ADD COLUMN IF NOT EXISTS "scheduled_retry_attempt" integer DEFAULT 0 NOT NULL;--> statement-breakpoint
+ALTER TABLE "heartbeat_runs" ADD COLUMN IF NOT EXISTS "scheduled_retry_reason" text;
--- a/packages/db/src/migrations/0062_routine_run_dispatch_fingerprint.sql
+++ b/packages/db/src/migrations/0062_routine_run_dispatch_fingerprint.sql
@ -0,0 +1,9 @@
+ALTER TABLE "routine_runs" ADD COLUMN IF NOT EXISTS "dispatch_fingerprint" text;--> statement-breakpoint
+ALTER TABLE "issues" ADD COLUMN IF NOT EXISTS "origin_fingerprint" text DEFAULT 'default' NOT NULL;--> statement-breakpoint
+DROP INDEX IF EXISTS "issues_open_routine_execution_uq";--> statement-breakpoint
+CREATE UNIQUE INDEX IF NOT EXISTS "issues_open_routine_execution_uq" ON "issues" USING btree ("company_id","origin_kind","origin_id","origin_fingerprint") WHERE "issues"."origin_kind" = 'routine_execution'
+          and "issues"."origin_id" is not null
+          and "issues"."hidden_at" is null
+          and "issues"."execution_run_id" is not null
+          and "issues"."status" in ('backlog', 'todo', 'in_progress', 'in_review', 'blocked');--> statement-breakpoint
+CREATE INDEX IF NOT EXISTS "routine_runs_dispatch_fingerprint_idx" ON "routine_runs" USING btree ("routine_id","dispatch_fingerprint");
--- a/packages/db/src/migrations/meta/0061_snapshot.json
+++ b/packages/db/src/migrations/meta/0061_snapshot.json
--- a/packages/db/src/migrations/meta/_journal.json
+++ b/packages/db/src/migrations/meta/_journal.json
@ -428,6 +428,20 @@
      "when": 1776717606743,
      "tag": "0060_orange_annihilus",
      "breakpoints": true
+    },
+    {
+      "idx": 61,
+      "version": "7",
+      "when": 1776785165389,
+      "tag": "0061_lively_thor_girl",
+      "breakpoints": true
+    },
+    {
+      "idx": 62,
+      "version": "7",
+      "when": 1776780000000,
+      "tag": "0062_routine_run_dispatch_fingerprint",
+      "breakpoints": true
    }
  ]
-}
+}
--- a/packages/db/src/schema/heartbeat_runs.ts
+++ b/packages/db/src/schema/heartbeat_runs.ts
@ -38,6 +38,9 @@ export const heartbeatRuns = pgTable(
      onDelete: "set null",
    }),
    processLossRetryCount: integer("process_loss_retry_count").notNull().default(0),
+    scheduledRetryAt: timestamp("scheduled_retry_at", { withTimezone: true }),
+    scheduledRetryAttempt: integer("scheduled_retry_attempt").notNull().default(0),
+    scheduledRetryReason: text("scheduled_retry_reason"),
    issueCommentStatus: text("issue_comment_status").notNull().default("not_applicable"),
    issueCommentSatisfiedByCommentId: uuid("issue_comment_satisfied_by_comment_id"),
    issueCommentRetryQueuedAt: timestamp("issue_comment_retry_queued_at", { withTimezone: true }),
--- a/packages/db/src/schema/issues.ts
+++ b/packages/db/src/schema/issues.ts
@ -44,6 +44,7 @@ export const issues = pgTable(
    originKind: text("origin_kind").notNull().default("manual"),
    originId: text("origin_id"),
    originRunId: text("origin_run_id"),
+    originFingerprint: text("origin_fingerprint").notNull().default("default"),
    requestDepth: integer("request_depth").notNull().default(0),
    billingCode: text("billing_code"),
    assigneeAdapterOverrides: jsonb("assignee_adapter_overrides").$type<Record<string, unknown>>(),
@ -82,7 +83,7 @@ export const issues = pgTable(
    identifierSearchIdx: index("issues_identifier_search_idx").using("gin", table.identifier.op("gin_trgm_ops")),
    descriptionSearchIdx: index("issues_description_search_idx").using("gin", table.description.op("gin_trgm_ops")),
    openRoutineExecutionIdx: uniqueIndex("issues_open_routine_execution_uq")
-      .on(table.companyId, table.originKind, table.originId)
+      .on(table.companyId, table.originKind, table.originId, table.originFingerprint)
      .where(
        sql`${table.originKind} = 'routine_execution'
          and ${table.originId} is not null
--- a/packages/db/src/schema/routines.ts
+++ b/packages/db/src/schema/routines.ts
@ -96,6 +96,7 @@ export const routineRuns = pgTable(
    triggeredAt: timestamp("triggered_at", { withTimezone: true }).notNull().defaultNow(),
    idempotencyKey: text("idempotency_key"),
    triggerPayload: jsonb("trigger_payload").$type<Record<string, unknown>>(),
+    dispatchFingerprint: text("dispatch_fingerprint"),
    linkedIssueId: uuid("linked_issue_id").references(() => issues.id, { onDelete: "set null" }),
    coalescedIntoRunId: uuid("coalesced_into_run_id"),
    failureReason: text("failure_reason"),
@ -106,6 +107,7 @@ export const routineRuns = pgTable(
  (table) => ({
    companyRoutineIdx: index("routine_runs_company_routine_idx").on(table.companyId, table.routineId, table.createdAt),
    triggerIdx: index("routine_runs_trigger_idx").on(table.triggerId, table.createdAt),
+    dispatchFingerprintIdx: index("routine_runs_dispatch_fingerprint_idx").on(table.routineId, table.dispatchFingerprint),
    linkedIssueIdx: index("routine_runs_linked_issue_idx").on(table.linkedIssueId),
    idempotencyIdx: index("routine_runs_trigger_idempotency_idx").on(table.triggerId, table.idempotencyKey),
  }),
--- a/packages/mcp-server/README.md
+++ b/packages/mcp-server/README.md
@ -47,6 +47,8 @@ Read tools:
 - `paperclipListDocumentRevisions`
 - `paperclipListProjects`
 - `paperclipGetProject`
+- `paperclipGetIssueWorkspaceRuntime`
+- `paperclipWaitForIssueWorkspaceService`
 - `paperclipListGoals`
 - `paperclipGetGoal`
 - `paperclipListApprovals`
@ -63,6 +65,7 @@ Write tools:
 - `paperclipAddComment`
 - `paperclipUpsertIssueDocument`
 - `paperclipRestoreIssueDocumentRevision`
+- `paperclipControlIssueWorkspaceServices`
 - `paperclipCreateApproval`
 - `paperclipLinkIssueApproval`
 - `paperclipUnlinkIssueApproval`
--- a/packages/mcp-server/src/tools.test.ts
+++ b/packages/mcp-server/src/tools.test.ts
@ -107,6 +107,81 @@ describe("paperclip MCP tools", () => {
    });
  });

+  it("controls issue workspace services through the current execution workspace", async () => { // Expects exactly one context lookup followed by one control POST.
+    const fetchMock = vi.fn()
+      .mockResolvedValueOnce(mockJsonResponse({ // 1st fetch: heartbeat-context lookup that yields the workspace id
+        currentExecutionWorkspace: {
+          id: "44444444-4444-4444-8444-444444444444",
+          runtimeServices: [],
+        },
+      }))
+      .mockResolvedValueOnce(mockJsonResponse({ // 2nd fetch: response to the runtime-services control request
+        operation: { id: "operation-1" },
+        workspace: {
+          id: "44444444-4444-4444-8444-444444444444",
+          runtimeServices: [
+            {
+              id: "55555555-5555-4555-8555-555555555555",
+              serviceName: "web",
+              status: "running",
+              url: "http://127.0.0.1:5173",
+            },
+          ],
+        },
+      }));
+    vi.stubGlobal("fetch", fetchMock);
+
+    const tool = getTool("paperclipControlIssueWorkspaceServices");
+    await tool.execute({
+      issueId: "PAP-1135",
+      action: "restart",
+      workspaceCommandId: "web",
+    });
+
+    expect(fetchMock).toHaveBeenCalledTimes(2);
+    const [lookupUrl, lookupInit] = fetchMock.mock.calls[0] as [string, RequestInit];
+    expect(String(lookupUrl)).toBe("http://localhost:3100/api/issues/PAP-1135/heartbeat-context");
+    expect(lookupInit.method).toBe("GET");
+
+    const [controlUrl, controlInit] = fetchMock.mock.calls[1] as [string, RequestInit];
+    expect(String(controlUrl)).toBe( // the workspace id from the lookup and the action both appear in the path
+      "http://localhost:3100/api/execution-workspaces/44444444-4444-4444-8444-444444444444/runtime-services/restart",
+    );
+    expect(controlInit.method).toBe("POST");
+    expect(JSON.parse(String(controlInit.body))).toEqual({ // body carries only the target selector, not issueId or action
+      workspaceCommandId: "web",
+    });
+  });
+
+  it("waits for an issue workspace runtime service URL", async () => { // Service is already running on the first poll, so the tool should return without polling again.
+    const fetchMock = vi.fn()
+      .mockResolvedValueOnce(mockJsonResponse({ // only one response is queued
+        currentExecutionWorkspace: {
+          id: "44444444-4444-4444-8444-444444444444",
+          runtimeServices: [
+            {
+              id: "55555555-5555-4555-8555-555555555555",
+              serviceName: "web",
+              status: "running",
+              healthStatus: "healthy",
+              url: "http://127.0.0.1:5173",
+            },
+          ],
+        },
+      }));
+    vi.stubGlobal("fetch", fetchMock);
+
+    const tool = getTool("paperclipWaitForIssueWorkspaceService");
+    const response = await tool.execute({
+      issueId: "PAP-1135",
+      serviceName: "web",
+      timeoutSeconds: 1,
+    });
+
+    expect(fetchMock).toHaveBeenCalledTimes(1); // a single poll was enough
+    expect(response.content[0]?.text).toContain("http://127.0.0.1:5173"); // the service URL is surfaced in the tool's text output
+  });
+
  it("creates approvals with the expected company-scoped payload", async () => {
    const fetchMock = vi.fn().mockResolvedValue(
      mockJsonResponse({ id: "approval-1" }),
--- a/packages/mcp-server/src/tools.ts
+++ b/packages/mcp-server/src/tools.ts
@ -124,6 +124,66 @@ const apiRequestSchema = z.object({
  jsonBody: z.string().optional(),
 });

+const workspaceRuntimeControlTargetSchema = z.object({ // Optional target selectors for a runtime-control request; each field may be omitted or null.
+  workspaceCommandId: z.string().min(1).optional().nullable(), // non-empty string when present
+  runtimeServiceId: z.string().uuid().optional().nullable(), // UUID-formatted string when present
+  serviceIndex: z.number().int().nonnegative().optional().nullable(), // integer >= 0 when present
+});
+
+const issueWorkspaceRuntimeControlSchema = z.object({ // Input schema passed to the paperclipControlIssueWorkspaceServices tool.
+  issueId: issueIdSchema,
+  action: z.enum(["start", "stop", "restart"]), // the tool interpolates this into the path: .../runtime-services/<action>
+}).merge(workspaceRuntimeControlTargetSchema); // adds the optional target selector fields
+
+const waitForIssueWorkspaceServiceSchema = z.object({ // Input schema passed to the paperclipWaitForIssueWorkspaceService tool.
+  issueId: issueIdSchema,
+  runtimeServiceId: z.string().uuid().optional().nullable(), // when set, selectRuntimeService matches on it and ignores serviceName
+  serviceName: z.string().min(1).optional().nullable(), // compared against each service's `serviceName` field
+  timeoutSeconds: z.number().int().positive().max(300).optional(), // integer 1..300; the tool falls back to 60 when omitted
+});
+
+function sleep(ms: number) { // Returns a promise that resolves once a setTimeout of `ms` milliseconds fires.
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+function readCurrentExecutionWorkspace(context: unknown): Record<string, unknown> | null { // Pulls `currentExecutionWorkspace` out of an untyped value; null when it is missing or not an object.
+  if (!context || typeof context !== "object") return null; // rejects null/undefined and primitives
+  const workspace = (context as { currentExecutionWorkspace?: unknown }).currentExecutionWorkspace;
+  return workspace && typeof workspace === "object" ? workspace as Record<string, unknown> : null; // note: an array would also pass this check
+}
+
+function readWorkspaceRuntimeServices(workspace: Record<string, unknown> | null): Array<Record<string, unknown>> { // Returns the object entries of `workspace.runtimeServices`; [] when that field is not an array.
+  const raw = workspace?.runtimeServices; // undefined when workspace is null
+  return Array.isArray(raw)
+    ? raw.filter((entry): entry is Record<string, unknown> => Boolean(entry) && typeof entry === "object") // drops null and primitive entries
+    : [];
+}
+
+function selectRuntimeService( // Picks one service record: by id, else by name, else the first running/starting one, else the first entry, else null.
+  services: Array<Record<string, unknown>>,
+  input: { runtimeServiceId?: string | null; serviceName?: string | null },
+) {
+  if (input.runtimeServiceId) { // an id selector wins; serviceName is not consulted in this branch
+    return services.find((service) => service.id === input.runtimeServiceId) ?? null; // no fallback when the id is unknown
+  }
+  if (input.serviceName) {
+    return services.find((service) => service.serviceName === input.serviceName) ?? null; // first match by name, no fallback
+  }
+  return services.find((service) => service.status === "running" || service.status === "starting") // no selector: prefer an active service
+    ?? services[0] // otherwise whatever is listed first
+    ?? null; // empty list
+}
+
+async function getIssueWorkspaceRuntime(client: PaperclipApiClient, issueId: string) { // Fetches the issue's heartbeat context and extracts its current execution workspace and runtime services.
+  const context = await client.requestJson("GET", `/issues/${encodeURIComponent(issueId)}/heartbeat-context`);
+  const workspace = readCurrentExecutionWorkspace(context); // null when the context has no workspace object
+  return {
+    context, // raw response, returned unmodified
+    workspace,
+    runtimeServices: readWorkspaceRuntimeServices(workspace), // [] when workspace is null or lists no services
+  };
+}
+
 export function createToolDefinitions(client: PaperclipApiClient): ToolDefinition[] {
  return [
    makeTool(
@ -247,6 +307,55 @@ export function createToolDefinitions(client: PaperclipApiClient): ToolDefinitio
        return client.requestJson("GET", `/projects/${encodeURIComponent(projectId)}${qs}`);
      },
    ),
+    makeTool( // Tool that returns the { context, workspace, runtimeServices } object built by getIssueWorkspaceRuntime.
+      "paperclipGetIssueWorkspaceRuntime",
+      "Get the current execution workspace and runtime services for an issue, including service URLs",
+      z.object({ issueId: issueIdSchema }),
+      async ({ issueId }) => getIssueWorkspaceRuntime(client, issueId),
+    ),
+    makeTool( // Tool that resolves the issue's current execution workspace, then POSTs the requested action to it.
+      "paperclipControlIssueWorkspaceServices",
+      "Start, stop, or restart the current issue execution workspace runtime services",
+      issueWorkspaceRuntimeControlSchema,
+      async ({ issueId, action, ...target }) => { // `target` holds the remaining schema fields (the optional selectors)
+        const runtime = await getIssueWorkspaceRuntime(client, issueId);
+        const workspaceId = typeof runtime.workspace?.id === "string" ? runtime.workspace.id : null; // a missing or non-string id is treated as no workspace
+        if (!workspaceId) {
+          throw new Error("Issue has no current execution workspace");
+        }
+        return client.requestJson(
+          "POST",
+          `/execution-workspaces/${encodeURIComponent(workspaceId)}/runtime-services/${action}`,
+          { body: target }, // issueId and action are not part of the body
+        );
+      },
+    ),
+    makeTool( // Tool that polls the issue's runtime state (at most once per second) until the selected service is ready or the timeout elapses.
+      "paperclipWaitForIssueWorkspaceService",
+      "Wait until an issue execution workspace runtime service is running and has a URL when one is exposed",
+      waitForIssueWorkspaceServiceSchema,
+      async ({ issueId, runtimeServiceId, serviceName, timeoutSeconds }) => {
+        const deadline = Date.now() + (timeoutSeconds ?? 60) * 1000; // 60-second budget when no timeout is given
+        let latest: Awaited<ReturnType<typeof getIssueWorkspaceRuntime>> | null = null;
+        while (Date.now() <= deadline) {
+          latest = await getIssueWorkspaceRuntime(client, issueId);
+          const service = selectRuntimeService(latest.runtimeServices, { runtimeServiceId, serviceName });
+          if (service?.status === "running" && service.healthStatus !== "unhealthy") { // a service without a healthStatus counts as ready
+            return {
+              workspace: latest.workspace,
+              service,
+            };
+          }
+          await sleep(Math.max(0, Math.min(1000, deadline - Date.now()))); // clamp to the time left so the tool never sleeps past its deadline
+        }
+
+        return { // timed out: report the most recent snapshot instead of throwing
+          timedOut: true,
+          latestWorkspace: latest?.workspace ?? null,
+          latestRuntimeServices: latest?.runtimeServices ?? [],
+        };
+      },
+    ),
    makeTool(
      "paperclipListGoals",
      "List goals in a company",
--- a/packages/shared/src/constants.ts
+++ b/packages/shared/src/constants.ts
@ -67,9 +67,7 @@ export const AGENT_ROLE_LABELS: Record<AgentRole, string> = {
 };

 export const AGENT_DEFAULT_MAX_CONCURRENT_RUNS = 5;
-
 export const WORKSPACE_BRANCH_ROUTINE_VARIABLE = "workspaceBranch";
-
 export const AGENT_ICON_NAMES = [
  "bot",
  "cpu",
@ -353,6 +351,7 @@ export type WakeupRequestStatus = (typeof WAKEUP_REQUEST_STATUSES)[number];

 export const HEARTBEAT_RUN_STATUSES = [
  "queued",
+  "scheduled_retry",
  "running",
  "succeeded",
  "failed",
--- a/packages/shared/src/types/heartbeat.ts
+++ b/packages/shared/src/types/heartbeat.ts
@ -39,6 +39,10 @@ export interface HeartbeatRun {
  processStartedAt: Date | null;
  retryOfRunId: string | null;
  processLossRetryCount: number;
+  scheduledRetryAt?: Date | null;
+  scheduledRetryAttempt?: number;
+  scheduledRetryReason?: string | null;
+  retryExhaustedReason?: string | null;
  livenessState: RunLivenessState | null;
  livenessReason: string | null;
  continuationAttempt: number;
--- a/packages/shared/src/types/issue.ts
+++ b/packages/shared/src/types/issue.ts
@ -217,6 +217,7 @@ export interface Issue {
  originKind?: IssueOriginKind;
  originId?: string | null;
  originRunId?: string | null;
+  originFingerprint?: string | null;
  requestDepth: number;
  billingCode: string | null;
  assigneeAdapterOverrides: IssueAssigneeAdapterOverrides | null;
--- a/packages/shared/src/types/routine.ts
+++ b/packages/shared/src/types/routine.ts
@ -95,6 +95,7 @@ export interface RoutineRun {
  triggeredAt: Date;
  idempotencyKey: string | null;
  triggerPayload: Record<string, unknown> | null;
+  dispatchFingerprint: string | null;
  linkedIssueId: string | null;
  coalescedIntoRunId: string | null;
  failureReason: string | null;