[codex] Harden heartbeat scheduling and runtime controls (#4223)

## Thinking Path

> - Paperclip orchestrates AI agents through issue checkout, heartbeat
runs, routines, and auditable control-plane state
> - The runtime path has to recover from lost local processes, transient
adapter failures, blocked dependencies, and routine coalescing without
stranding work
> - The existing branch carried several reliability fixes across
heartbeat scheduling, issue runtime controls, routine dispatch, and
operator-facing run state
> - These changes belong together because they share backend contracts,
migrations, and runtime status semantics
> - This pull request groups the control-plane/runtime slice so it can
merge independently from board UI polish and adapter sandbox work
> - The benefit is safer heartbeat recovery, clearer runtime controls,
and more predictable recurring execution behavior

## What Changed

- Adds bounded heartbeat retry scheduling, scheduled retry state, and
Codex transient failure recovery handling.
- Tightens heartbeat process recovery, blocker wake behavior, issue
comment wake handling, routine dispatch coalescing, and
activity/dashboard bounds.
- Adds runtime-control MCP tools and Paperclip skill docs for issue
workspace runtime management.
- Adds migrations `0061_lively_thor_girl.sql` and
`0062_routine_run_dispatch_fingerprint.sql`.
- Surfaces retry state in run ledger/agent UI and keeps related shared
types synchronized.

## Verification

- `pnpm exec vitest run
server/src/__tests__/heartbeat-retry-scheduling.test.ts
server/src/__tests__/heartbeat-process-recovery.test.ts
server/src/__tests__/routines-service.test.ts`
- `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server`

## Risks

- Medium risk: this touches heartbeat recovery and routine dispatch,
which are central execution paths.
- Migration order matters if split branches land out of order: merge
this PR before branches that assume the new runtime/routine fields.
- Runtime retry behavior should be watched in CI and in local operator
smoke tests because it changes how transient failures are resumed.

> For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and
discuss it in `#dev` before opening the PR. Feature PRs that overlap
with planned core work may need to be redirected — check the roadmap
first. See `CONTRIBUTING.md`.

## Model Used

- OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use
enabled. Exact hosted model build and context window are not exposed in
this Paperclip heartbeat environment.

## Checklist

- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge
This commit is contained in:
Dotta 2026-04-21 12:24:11 -05:00 committed by GitHub
parent ab9051b595
commit 09d0678840
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
61 changed files with 17622 additions and 456 deletions

View file

@ -22,7 +22,11 @@ import {
joinPromptSections,
runChildProcess,
} from "@paperclipai/adapter-utils/server-utils";
import { parseCodexJsonl, isCodexUnknownSessionError } from "./parse.js";
import {
parseCodexJsonl,
isCodexTransientUpstreamError,
isCodexUnknownSessionError,
} from "./parse.js";
import { pathExists, prepareManagedCodexHome, resolveManagedCodexHomeDir, resolveSharedCodexHomeDir } from "./codex-home.js";
import { resolveCodexDesiredSkillNames } from "./skills.js";
import { buildCodexExecArgs } from "./codex-args.js";
@ -149,6 +153,52 @@ type EnsureCodexSkillsInjectedOptions = {
linkSkill?: (source: string, target: string) => Promise<void>;
};
/**
 * Fallback modes that can be requested through `context.codexTransientFallbackMode`
 * (validated by `readCodexTransientFallbackMode`).
 *
 * Two helper predicates classify these values:
 * - `fallbackModeUsesSaferInvocation` is true for `"safer_invocation"` and
 *   `"fresh_session_safer_invocation"`.
 * - `fallbackModeUsesFreshSession` is true for `"fresh_session"` and
 *   `"fresh_session_safer_invocation"`.
 *
 * `"same_session"` satisfies neither predicate.
 */
type CodexTransientFallbackMode =
| "same_session"
| "safer_invocation"
| "fresh_session"
| "fresh_session_safer_invocation";
/**
 * Extracts the requested Codex transient fallback mode from the run context.
 *
 * The raw `codexTransientFallbackMode` entry is read through `asString` (with
 * `""` as the fallback argument) and trimmed; the result is returned only when
 * it is exactly one of the known mode literals.
 *
 * @param context - Run context record that may carry `codexTransientFallbackMode`.
 * @returns The recognised fallback mode, or `null` for anything else.
 */
function readCodexTransientFallbackMode(context: Record<string, unknown>): CodexTransientFallbackMode | null {
  const requested = asString(context.codexTransientFallbackMode, "").trim();
  if (
    requested === "same_session" ||
    requested === "safer_invocation" ||
    requested === "fresh_session" ||
    requested === "fresh_session_safer_invocation"
  ) {
    // Equality checks narrow `requested` to the literal union.
    return requested;
  }
  return null;
}
/**
 * Tells whether a fallback mode is one of the "safer invocation" variants.
 *
 * @param mode - Fallback mode, or `null` when none was requested.
 * @returns `true` for `"safer_invocation"` and `"fresh_session_safer_invocation"`,
 *   `false` otherwise (including `null`).
 */
function fallbackModeUsesSaferInvocation(mode: CodexTransientFallbackMode | null): boolean {
  switch (mode) {
    case "safer_invocation":
    case "fresh_session_safer_invocation":
      return true;
    default:
      return false;
  }
}
/**
 * Tells whether a fallback mode is one of the "fresh session" variants.
 *
 * @param mode - Fallback mode, or `null` when none was requested.
 * @returns `true` for `"fresh_session"` and `"fresh_session_safer_invocation"`,
 *   `false` otherwise (including `null`).
 */
function fallbackModeUsesFreshSession(mode: CodexTransientFallbackMode | null): boolean {
  if (mode === "fresh_session") {
    return true;
  }
  if (mode === "fresh_session_safer_invocation") {
    return true;
  }
  return false;
}
/**
 * Builds the plain-text handoff note describing a Codex session rotation.
 *
 * The note is a newline-joined list: a heading, the previous session id (when
 * present), a fixed rotation reason, the fallback mode, an excerpt of the issue
 * continuation summary (when present), and a closing instruction. Lines whose
 * source value is missing or empty are omitted.
 *
 * The continuation summary is capped at 1,500 UTF-16 code units. If the cap
 * would fall between the two halves of a surrogate pair, the dangling high
 * surrogate is dropped so the excerpt never ends in an unpaired surrogate.
 *
 * @param input.previousSessionId - Id of the session being rotated away from, or `null`.
 * @param input.fallbackMode - Fallback mode recorded in the note.
 * @param input.continuationSummaryBody - Continuation summary text, or `null`.
 * @returns The assembled note.
 */
function buildCodexTransientHandoffNote(input: {
  previousSessionId: string | null;
  fallbackMode: CodexTransientFallbackMode;
  continuationSummaryBody: string | null;
}): string {
  const maxSummaryLength = 1_500;
  const summaryBody = input.continuationSummaryBody ?? "";
  let summaryExcerpt = summaryBody.slice(0, maxSummaryLength);
  if (summaryBody.length > maxSummaryLength) {
    // `slice` counts UTF-16 code units, so the cut can separate a high surrogate
    // from the low surrogate that follows it.
    const lastKeptUnit = summaryExcerpt.charCodeAt(summaryExcerpt.length - 1);
    const firstDroppedUnit = summaryBody.charCodeAt(maxSummaryLength);
    const cutSplitsSurrogatePair =
      lastKeptUnit >= 0xd800 &&
      lastKeptUnit <= 0xdbff &&
      firstDroppedUnit >= 0xdc00 &&
      firstDroppedUnit <= 0xdfff;
    if (cutSplitsSurrogatePair) {
      summaryExcerpt = summaryExcerpt.slice(0, -1);
    }
  }

  return [
    "Paperclip session handoff:",
    input.previousSessionId ? `- Previous session: ${input.previousSessionId}` : "",
    "- Rotation reason: repeated Codex transient remote-compaction failures",
    `- Fallback mode: ${input.fallbackMode}`,
    summaryExcerpt ? `- Issue continuation summary: ${summaryExcerpt}` : "",
    "Continue from the current task state. Rebuild only the minimum context you need.",
  ]
    .filter(Boolean)
    .join("\n");
}
export async function ensureCodexSkillsInjected(
onLog: AdapterExecutionContext["onLog"],
options: EnsureCodexSkillsInjectedOptions = {},
@ -397,7 +447,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
const canResumeSession =
runtimeSessionId.length > 0 &&
(runtimeSessionCwd.length === 0 || path.resolve(runtimeSessionCwd) === path.resolve(cwd));
const sessionId = canResumeSession ? runtimeSessionId : null;
const codexTransientFallbackMode = readCodexTransientFallbackMode(context);
const forceSaferInvocation = fallbackModeUsesSaferInvocation(codexTransientFallbackMode);
const forceFreshSession = fallbackModeUsesFreshSession(codexTransientFallbackMode);
const sessionId = canResumeSession && !forceFreshSession ? runtimeSessionId : null;
if (runtimeSessionId && !canResumeSession) {
await onLog(
"stdout",
@ -444,28 +497,66 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
const shouldUseResumeDeltaPrompt = Boolean(sessionId) && wakePrompt.length > 0;
const promptInstructionsPrefix = shouldUseResumeDeltaPrompt ? "" : instructionsPrefix;
instructionsChars = promptInstructionsPrefix.length;
const continuationSummary = parseObject(context.paperclipContinuationSummary);
const continuationSummaryBody = asString(continuationSummary.body, "").trim() || null;
const codexFallbackHandoffNote =
forceFreshSession
? buildCodexTransientHandoffNote({
previousSessionId: runtimeSessionId || runtime.sessionId || null,
fallbackMode: codexTransientFallbackMode ?? "fresh_session",
continuationSummaryBody,
})
: "";
const commandNotes = (() => {
if (!instructionsFilePath) {
return [repoAgentsNote];
const notes = [repoAgentsNote];
if (forceSaferInvocation) {
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
}
if (forceFreshSession) {
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
}
return notes;
}
if (instructionsPrefix.length > 0) {
if (shouldUseResumeDeltaPrompt) {
return [
const notes = [
`Loaded agent instructions from ${instructionsFilePath}`,
"Skipped stdin instruction reinjection because an existing Codex session is being resumed with a wake delta.",
repoAgentsNote,
];
if (forceSaferInvocation) {
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
}
if (forceFreshSession) {
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
}
return notes;
}
return [
const notes = [
`Loaded agent instructions from ${instructionsFilePath}`,
`Prepended instructions + path directive to stdin prompt (relative references from ${instructionsDir}).`,
repoAgentsNote,
];
if (forceSaferInvocation) {
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
}
if (forceFreshSession) {
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
}
return notes;
}
return [
const notes = [
`Configured instructionsFilePath ${instructionsFilePath}, but file could not be read; continuing without injected instructions.`,
repoAgentsNote,
];
if (forceSaferInvocation) {
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
}
if (forceFreshSession) {
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
}
return notes;
})();
const renderedPrompt = shouldUseResumeDeltaPrompt ? "" : renderTemplate(promptTemplate, templateData);
const sessionHandoffNote = asString(context.paperclipSessionHandoffMarkdown, "").trim();
@ -473,6 +564,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
promptInstructionsPrefix,
renderedBootstrapPrompt,
wakePrompt,
codexFallbackHandoffNote,
sessionHandoffNote,
renderedPrompt,
]);
@ -486,7 +578,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
};
const runAttempt = async (resumeSessionId: string | null) => {
const execArgs = buildCodexExecArgs(config, { resumeSessionId });
const execArgs = buildCodexExecArgs(
forceSaferInvocation ? { ...config, fastMode: false } : config,
{ resumeSessionId },
);
const args = execArgs.args;
const commandNotesWithFastMode =
execArgs.fastModeIgnoredReason == null
@ -540,6 +635,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
const toResult = (
attempt: { proc: { exitCode: number | null; signal: string | null; timedOut: boolean; stdout: string; stderr: string }; rawStderr: string; parsed: ReturnType<typeof parseCodexJsonl> },
clearSessionOnMissingSession = false,
isRetry = false,
): AdapterExecutionResult => {
if (attempt.proc.timedOut) {
return {
@ -551,7 +647,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
};
}
const resolvedSessionId = attempt.parsed.sessionId ?? runtimeSessionId ?? runtime.sessionId ?? null;
const canFallbackToRuntimeSession = !isRetry && !forceFreshSession;
const resolvedSessionId =
attempt.parsed.sessionId ??
(canFallbackToRuntimeSession ? (runtimeSessionId ?? runtime.sessionId ?? null) : null);
const resolvedSessionParams = resolvedSessionId
? ({
sessionId: resolvedSessionId,
@ -576,6 +675,15 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
(attempt.proc.exitCode ?? 0) === 0
? null
: fallbackErrorMessage,
errorCode:
(attempt.proc.exitCode ?? 0) !== 0 &&
isCodexTransientUpstreamError({
stdout: attempt.proc.stdout,
stderr: attempt.proc.stderr,
errorMessage: fallbackErrorMessage,
})
? "codex_transient_upstream"
: null,
usage: attempt.parsed.usage,
sessionId: resolvedSessionId,
sessionParams: resolvedSessionParams,
@ -590,7 +698,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
stderr: attempt.proc.stderr,
},
summary: attempt.parsed.summary,
clearSession: Boolean(clearSessionOnMissingSession && !resolvedSessionId),
clearSession: Boolean((clearSessionOnMissingSession || forceFreshSession) && !resolvedSessionId),
};
};
@ -606,8 +714,8 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
`[paperclip] Codex resume session "${sessionId}" is unavailable; retrying with a fresh session.\n`,
);
const retry = await runAttempt(null);
return toResult(retry, true);
return toResult(retry, true, true);
}
return toResult(initial);
return toResult(initial, false, false);
}

View file

@ -1,7 +1,7 @@
export { execute, ensureCodexSkillsInjected } from "./execute.js";
export { listCodexSkills, syncCodexSkills } from "./skills.js";
export { testEnvironment } from "./test.js";
export { parseCodexJsonl, isCodexUnknownSessionError } from "./parse.js";
export { parseCodexJsonl, isCodexTransientUpstreamError, isCodexUnknownSessionError } from "./parse.js";
export {
getQuotaWindows,
readCodexAuthInfo,

View file

@ -1,5 +1,9 @@
import { describe, expect, it } from "vitest";
import { isCodexUnknownSessionError, parseCodexJsonl } from "./parse.js";
import {
isCodexTransientUpstreamError,
isCodexUnknownSessionError,
parseCodexJsonl,
} from "./parse.js";
describe("parseCodexJsonl", () => {
it("captures session id, assistant summary, usage, and error message", () => {
@ -81,3 +85,36 @@ describe("isCodexUnknownSessionError", () => {
expect(isCodexUnknownSessionError("", "model overloaded")).toBe(false);
});
});
// Covers the transient-upstream classifier: the "high demand" wording must be
// detected, while a deterministic request error must not be.
describe("isCodexTransientUpstreamError", () => {
it("classifies the remote-compaction high-demand failure as transient upstream", () => {
// High-demand wording arriving through the parsed error message, prefixed by
// the remote compact task marker.
expect(
isCodexTransientUpstreamError({
errorMessage:
"Error running remote compact task: We're currently experiencing high demand, which may cause temporary errors.",
}),
).toBe(true);
// The same high-demand wording arriving only on stderr, without the remote
// compact task prefix.
expect(
isCodexTransientUpstreamError({
stderr: "We're currently experiencing high demand, which may cause temporary errors.",
}),
).toBe(true);
});
it("does not classify deterministic compaction errors as transient", () => {
// A remote compact task failure caused by an invalid request parameter: it
// mentions the compaction task but contains none of the transient phrases.
expect(
isCodexTransientUpstreamError({
errorMessage: [
"Error running remote compact task: {",
' "error": {',
' "message": "Unknown parameter: \'prompt_cache_retention\'.",',
' "type": "invalid_request_error",',
' "param": "prompt_cache_retention",',
' "code": "unknown_parameter"',
" }",
"}",
].join("\n"),
}),
).toBe(false);
});
});

View file

@ -1,5 +1,9 @@
import { asString, asNumber, parseObject, parseJson } from "@paperclipai/adapter-utils/server-utils";
// Case-insensitive match for wording that suggests a transient upstream
// condition: the "currently experiencing high demand" notice, "temporary
// errors", rate-limit phrasing, "too many requests", a standalone 429,
// "server overloaded", "service unavailable", and "try again later".
// The `(?:'|)` group makes the apostrophe optional, so both "we're" and "were"
// are accepted. isCodexTransientUpstreamError uses this as a first-pass filter
// and then applies a narrower check.
const CODEX_TRANSIENT_UPSTREAM_RE =
/(?:we(?:'|)re\s+currently\s+experiencing\s+high\s+demand|temporary\s+errors|rate[-\s]?limit(?:ed)?|too\s+many\s+requests|\b429\b|server\s+overloaded|service\s+unavailable|try\s+again\s+later)/i;
// Case-insensitive match for the "remote compact task" wording, tolerating any
// run of whitespace between the words.
const CODEX_REMOTE_COMPACTION_RE = /remote\s+compact\s+task/i;
export function parseCodexJsonl(stdout: string) {
let sessionId: string | null = null;
let finalMessage: string | null = null;
@ -71,3 +75,25 @@ export function isCodexUnknownSessionError(stdout: string, stderr: string): bool
haystack,
);
}
/**
 * Decides whether a failed Codex run looks like a transient upstream failure.
 *
 * The error message, stdout and stderr are combined (in that order), split into
 * lines, trimmed, and stripped of blank lines before matching. The combined
 * text must match `CODEX_TRANSIENT_UPSTREAM_RE`, and must additionally either
 * mention the remote compact task or contain the "high demand" /
 * "temporary errors" wording.
 *
 * @param input.stdout - Captured standard output, if any.
 * @param input.stderr - Captured standard error, if any.
 * @param input.errorMessage - Parsed or fallback error message, if any.
 * @returns `true` when the text matches the transient upstream failure shape.
 */
export function isCodexTransientUpstreamError(input: {
  stdout?: string | null;
  stderr?: string | null;
  errorMessage?: string | null;
}): boolean {
  const combinedText = `${input.errorMessage ?? ""}\n${input.stdout ?? ""}\n${input.stderr ?? ""}`;

  const meaningfulLines: string[] = [];
  for (const rawLine of combinedText.split(/\r?\n/)) {
    const trimmedLine = rawLine.trim();
    if (trimmedLine) {
      meaningfulLines.push(trimmedLine);
    }
  }
  const haystack = meaningfulLines.join("\n");

  if (!CODEX_TRANSIENT_UPSTREAM_RE.test(haystack)) {
    return false;
  }
  // Keep automatic retries scoped to the observed remote-compaction/high-demand
  // failure shape; broader 429s may be caused by user or account limits.
  if (CODEX_REMOTE_COMPACTION_RE.test(haystack)) {
    return true;
  }
  return /high\s+demand|temporary\s+errors/i.test(haystack);
}