mirror of
https://github.com/alkimake/paperclip.git
synced 2026-06-14 01:50:39 +09:00
[codex] Harden heartbeat scheduling and runtime controls (#4223)
## Thinking Path > - Paperclip orchestrates AI agents through issue checkout, heartbeat runs, routines, and auditable control-plane state > - The runtime path has to recover from lost local processes, transient adapter failures, blocked dependencies, and routine coalescing without stranding work > - The existing branch carried several reliability fixes across heartbeat scheduling, issue runtime controls, routine dispatch, and operator-facing run state > - These changes belong together because they share backend contracts, migrations, and runtime status semantics > - This pull request groups the control-plane/runtime slice so it can merge independently from board UI polish and adapter sandbox work > - The benefit is safer heartbeat recovery, clearer runtime controls, and more predictable recurring execution behavior ## What Changed - Adds bounded heartbeat retry scheduling, scheduled retry state, and Codex transient failure recovery handling. - Tightens heartbeat process recovery, blocker wake behavior, issue comment wake handling, routine dispatch coalescing, and activity/dashboard bounds. - Adds runtime-control MCP tools and Paperclip skill docs for issue workspace runtime management. - Adds migrations `0061_lively_thor_girl.sql` and `0062_routine_run_dispatch_fingerprint.sql`. - Surfaces retry state in run ledger/agent UI and keeps related shared types synchronized. ## Verification - `pnpm exec vitest run server/src/__tests__/heartbeat-retry-scheduling.test.ts server/src/__tests__/heartbeat-process-recovery.test.ts server/src/__tests__/routines-service.test.ts` - `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server` ## Risks - Medium risk: this touches heartbeat recovery and routine dispatch, which are central execution paths. - Migration order matters if split branches land out of order: merge this PR before branches that assume the new runtime/routine fields. - Runtime retry behavior should be watched in CI and in local operator smoke tests because it changes how transient failures are resumed. > For core feature work, check [`ROADMAP.md`](ROADMAP.md) first and discuss it in `#dev` before opening the PR. Feature PRs that overlap with planned core work may need to be redirected — check the roadmap first. See `CONTRIBUTING.md`. ## Model Used - OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use enabled. Exact hosted model build and context window are not exposed in this Paperclip heartbeat environment. ## Checklist - [x] I have included a thinking path that traces from project context to this change - [x] I have specified the model used (with version and capability details) - [x] I have checked ROADMAP.md and confirmed this PR does not duplicate planned core work - [x] I have run tests locally and they pass - [x] I have added or updated tests where applicable - [ ] If this change affects the UI, I have included before/after screenshots - [x] I have updated relevant documentation to reflect my changes - [x] I have considered and documented any risks above - [x] I will address all Greptile and reviewer comments before requesting merge
This commit is contained in:
parent
ab9051b595
commit
09d0678840
61 changed files with 17622 additions and 456 deletions
|
|
@ -22,7 +22,11 @@ import {
|
|||
joinPromptSections,
|
||||
runChildProcess,
|
||||
} from "@paperclipai/adapter-utils/server-utils";
|
||||
import { parseCodexJsonl, isCodexUnknownSessionError } from "./parse.js";
|
||||
import {
|
||||
parseCodexJsonl,
|
||||
isCodexTransientUpstreamError,
|
||||
isCodexUnknownSessionError,
|
||||
} from "./parse.js";
|
||||
import { pathExists, prepareManagedCodexHome, resolveManagedCodexHomeDir, resolveSharedCodexHomeDir } from "./codex-home.js";
|
||||
import { resolveCodexDesiredSkillNames } from "./skills.js";
|
||||
import { buildCodexExecArgs } from "./codex-args.js";
|
||||
|
|
@ -149,6 +153,52 @@ type EnsureCodexSkillsInjectedOptions = {
|
|||
linkSkill?: (source: string, target: string) => Promise<void>;
|
||||
};
|
||||
|
||||
type CodexTransientFallbackMode =
|
||||
| "same_session"
|
||||
| "safer_invocation"
|
||||
| "fresh_session"
|
||||
| "fresh_session_safer_invocation";
|
||||
|
||||
function readCodexTransientFallbackMode(context: Record<string, unknown>): CodexTransientFallbackMode | null {
|
||||
const value = asString(context.codexTransientFallbackMode, "").trim();
|
||||
switch (value) {
|
||||
case "same_session":
|
||||
case "safer_invocation":
|
||||
case "fresh_session":
|
||||
case "fresh_session_safer_invocation":
|
||||
return value;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function fallbackModeUsesSaferInvocation(mode: CodexTransientFallbackMode | null): boolean {
|
||||
return mode === "safer_invocation" || mode === "fresh_session_safer_invocation";
|
||||
}
|
||||
|
||||
function fallbackModeUsesFreshSession(mode: CodexTransientFallbackMode | null): boolean {
|
||||
return mode === "fresh_session" || mode === "fresh_session_safer_invocation";
|
||||
}
|
||||
|
||||
function buildCodexTransientHandoffNote(input: {
|
||||
previousSessionId: string | null;
|
||||
fallbackMode: CodexTransientFallbackMode;
|
||||
continuationSummaryBody: string | null;
|
||||
}): string {
|
||||
return [
|
||||
"Paperclip session handoff:",
|
||||
input.previousSessionId ? `- Previous session: ${input.previousSessionId}` : "",
|
||||
"- Rotation reason: repeated Codex transient remote-compaction failures",
|
||||
`- Fallback mode: ${input.fallbackMode}`,
|
||||
input.continuationSummaryBody
|
||||
? `- Issue continuation summary: ${input.continuationSummaryBody.slice(0, 1_500)}`
|
||||
: "",
|
||||
"Continue from the current task state. Rebuild only the minimum context you need.",
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join("\n");
|
||||
}
|
||||
|
||||
export async function ensureCodexSkillsInjected(
|
||||
onLog: AdapterExecutionContext["onLog"],
|
||||
options: EnsureCodexSkillsInjectedOptions = {},
|
||||
|
|
@ -397,7 +447,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
const canResumeSession =
|
||||
runtimeSessionId.length > 0 &&
|
||||
(runtimeSessionCwd.length === 0 || path.resolve(runtimeSessionCwd) === path.resolve(cwd));
|
||||
const sessionId = canResumeSession ? runtimeSessionId : null;
|
||||
const codexTransientFallbackMode = readCodexTransientFallbackMode(context);
|
||||
const forceSaferInvocation = fallbackModeUsesSaferInvocation(codexTransientFallbackMode);
|
||||
const forceFreshSession = fallbackModeUsesFreshSession(codexTransientFallbackMode);
|
||||
const sessionId = canResumeSession && !forceFreshSession ? runtimeSessionId : null;
|
||||
if (runtimeSessionId && !canResumeSession) {
|
||||
await onLog(
|
||||
"stdout",
|
||||
|
|
@ -444,28 +497,66 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
const shouldUseResumeDeltaPrompt = Boolean(sessionId) && wakePrompt.length > 0;
|
||||
const promptInstructionsPrefix = shouldUseResumeDeltaPrompt ? "" : instructionsPrefix;
|
||||
instructionsChars = promptInstructionsPrefix.length;
|
||||
const continuationSummary = parseObject(context.paperclipContinuationSummary);
|
||||
const continuationSummaryBody = asString(continuationSummary.body, "").trim() || null;
|
||||
const codexFallbackHandoffNote =
|
||||
forceFreshSession
|
||||
? buildCodexTransientHandoffNote({
|
||||
previousSessionId: runtimeSessionId || runtime.sessionId || null,
|
||||
fallbackMode: codexTransientFallbackMode ?? "fresh_session",
|
||||
continuationSummaryBody,
|
||||
})
|
||||
: "";
|
||||
const commandNotes = (() => {
|
||||
if (!instructionsFilePath) {
|
||||
return [repoAgentsNote];
|
||||
const notes = [repoAgentsNote];
|
||||
if (forceSaferInvocation) {
|
||||
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
|
||||
}
|
||||
if (forceFreshSession) {
|
||||
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
|
||||
}
|
||||
return notes;
|
||||
}
|
||||
if (instructionsPrefix.length > 0) {
|
||||
if (shouldUseResumeDeltaPrompt) {
|
||||
return [
|
||||
const notes = [
|
||||
`Loaded agent instructions from ${instructionsFilePath}`,
|
||||
"Skipped stdin instruction reinjection because an existing Codex session is being resumed with a wake delta.",
|
||||
repoAgentsNote,
|
||||
];
|
||||
if (forceSaferInvocation) {
|
||||
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
|
||||
}
|
||||
if (forceFreshSession) {
|
||||
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
|
||||
}
|
||||
return notes;
|
||||
}
|
||||
return [
|
||||
const notes = [
|
||||
`Loaded agent instructions from ${instructionsFilePath}`,
|
||||
`Prepended instructions + path directive to stdin prompt (relative references from ${instructionsDir}).`,
|
||||
repoAgentsNote,
|
||||
];
|
||||
if (forceSaferInvocation) {
|
||||
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
|
||||
}
|
||||
if (forceFreshSession) {
|
||||
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
|
||||
}
|
||||
return notes;
|
||||
}
|
||||
return [
|
||||
const notes = [
|
||||
`Configured instructionsFilePath ${instructionsFilePath}, but file could not be read; continuing without injected instructions.`,
|
||||
repoAgentsNote,
|
||||
];
|
||||
if (forceSaferInvocation) {
|
||||
notes.push("Codex transient fallback requested safer invocation settings for this retry.");
|
||||
}
|
||||
if (forceFreshSession) {
|
||||
notes.push("Codex transient fallback forced a fresh session with a continuation handoff.");
|
||||
}
|
||||
return notes;
|
||||
})();
|
||||
const renderedPrompt = shouldUseResumeDeltaPrompt ? "" : renderTemplate(promptTemplate, templateData);
|
||||
const sessionHandoffNote = asString(context.paperclipSessionHandoffMarkdown, "").trim();
|
||||
|
|
@ -473,6 +564,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
promptInstructionsPrefix,
|
||||
renderedBootstrapPrompt,
|
||||
wakePrompt,
|
||||
codexFallbackHandoffNote,
|
||||
sessionHandoffNote,
|
||||
renderedPrompt,
|
||||
]);
|
||||
|
|
@ -486,7 +578,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
};
|
||||
|
||||
const runAttempt = async (resumeSessionId: string | null) => {
|
||||
const execArgs = buildCodexExecArgs(config, { resumeSessionId });
|
||||
const execArgs = buildCodexExecArgs(
|
||||
forceSaferInvocation ? { ...config, fastMode: false } : config,
|
||||
{ resumeSessionId },
|
||||
);
|
||||
const args = execArgs.args;
|
||||
const commandNotesWithFastMode =
|
||||
execArgs.fastModeIgnoredReason == null
|
||||
|
|
@ -540,6 +635,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
const toResult = (
|
||||
attempt: { proc: { exitCode: number | null; signal: string | null; timedOut: boolean; stdout: string; stderr: string }; rawStderr: string; parsed: ReturnType<typeof parseCodexJsonl> },
|
||||
clearSessionOnMissingSession = false,
|
||||
isRetry = false,
|
||||
): AdapterExecutionResult => {
|
||||
if (attempt.proc.timedOut) {
|
||||
return {
|
||||
|
|
@ -551,7 +647,10 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
};
|
||||
}
|
||||
|
||||
const resolvedSessionId = attempt.parsed.sessionId ?? runtimeSessionId ?? runtime.sessionId ?? null;
|
||||
const canFallbackToRuntimeSession = !isRetry && !forceFreshSession;
|
||||
const resolvedSessionId =
|
||||
attempt.parsed.sessionId ??
|
||||
(canFallbackToRuntimeSession ? (runtimeSessionId ?? runtime.sessionId ?? null) : null);
|
||||
const resolvedSessionParams = resolvedSessionId
|
||||
? ({
|
||||
sessionId: resolvedSessionId,
|
||||
|
|
@ -576,6 +675,15 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
(attempt.proc.exitCode ?? 0) === 0
|
||||
? null
|
||||
: fallbackErrorMessage,
|
||||
errorCode:
|
||||
(attempt.proc.exitCode ?? 0) !== 0 &&
|
||||
isCodexTransientUpstreamError({
|
||||
stdout: attempt.proc.stdout,
|
||||
stderr: attempt.proc.stderr,
|
||||
errorMessage: fallbackErrorMessage,
|
||||
})
|
||||
? "codex_transient_upstream"
|
||||
: null,
|
||||
usage: attempt.parsed.usage,
|
||||
sessionId: resolvedSessionId,
|
||||
sessionParams: resolvedSessionParams,
|
||||
|
|
@ -590,7 +698,7 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
stderr: attempt.proc.stderr,
|
||||
},
|
||||
summary: attempt.parsed.summary,
|
||||
clearSession: Boolean(clearSessionOnMissingSession && !resolvedSessionId),
|
||||
clearSession: Boolean((clearSessionOnMissingSession || forceFreshSession) && !resolvedSessionId),
|
||||
};
|
||||
};
|
||||
|
||||
|
|
@ -606,8 +714,8 @@ export async function execute(ctx: AdapterExecutionContext): Promise<AdapterExec
|
|||
`[paperclip] Codex resume session "${sessionId}" is unavailable; retrying with a fresh session.\n`,
|
||||
);
|
||||
const retry = await runAttempt(null);
|
||||
return toResult(retry, true);
|
||||
return toResult(retry, true, true);
|
||||
}
|
||||
|
||||
return toResult(initial);
|
||||
return toResult(initial, false, false);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
export { execute, ensureCodexSkillsInjected } from "./execute.js";
|
||||
export { listCodexSkills, syncCodexSkills } from "./skills.js";
|
||||
export { testEnvironment } from "./test.js";
|
||||
export { parseCodexJsonl, isCodexUnknownSessionError } from "./parse.js";
|
||||
export { parseCodexJsonl, isCodexTransientUpstreamError, isCodexUnknownSessionError } from "./parse.js";
|
||||
export {
|
||||
getQuotaWindows,
|
||||
readCodexAuthInfo,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import { isCodexUnknownSessionError, parseCodexJsonl } from "./parse.js";
|
||||
import {
|
||||
isCodexTransientUpstreamError,
|
||||
isCodexUnknownSessionError,
|
||||
parseCodexJsonl,
|
||||
} from "./parse.js";
|
||||
|
||||
describe("parseCodexJsonl", () => {
|
||||
it("captures session id, assistant summary, usage, and error message", () => {
|
||||
|
|
@ -81,3 +85,36 @@ describe("isCodexUnknownSessionError", () => {
|
|||
expect(isCodexUnknownSessionError("", "model overloaded")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isCodexTransientUpstreamError", () => {
|
||||
it("classifies the remote-compaction high-demand failure as transient upstream", () => {
|
||||
expect(
|
||||
isCodexTransientUpstreamError({
|
||||
errorMessage:
|
||||
"Error running remote compact task: We're currently experiencing high demand, which may cause temporary errors.",
|
||||
}),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isCodexTransientUpstreamError({
|
||||
stderr: "We're currently experiencing high demand, which may cause temporary errors.",
|
||||
}),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("does not classify deterministic compaction errors as transient", () => {
|
||||
expect(
|
||||
isCodexTransientUpstreamError({
|
||||
errorMessage: [
|
||||
"Error running remote compact task: {",
|
||||
' "error": {',
|
||||
' "message": "Unknown parameter: \'prompt_cache_retention\'.",',
|
||||
' "type": "invalid_request_error",',
|
||||
' "param": "prompt_cache_retention",',
|
||||
' "code": "unknown_parameter"',
|
||||
" }",
|
||||
"}",
|
||||
].join("\n"),
|
||||
}),
|
||||
).toBe(false);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
import { asString, asNumber, parseObject, parseJson } from "@paperclipai/adapter-utils/server-utils";
|
||||
|
||||
const CODEX_TRANSIENT_UPSTREAM_RE =
|
||||
/(?:we(?:'|’)re\s+currently\s+experiencing\s+high\s+demand|temporary\s+errors|rate[-\s]?limit(?:ed)?|too\s+many\s+requests|\b429\b|server\s+overloaded|service\s+unavailable|try\s+again\s+later)/i;
|
||||
const CODEX_REMOTE_COMPACTION_RE = /remote\s+compact\s+task/i;
|
||||
|
||||
export function parseCodexJsonl(stdout: string) {
|
||||
let sessionId: string | null = null;
|
||||
let finalMessage: string | null = null;
|
||||
|
|
@ -71,3 +75,25 @@ export function isCodexUnknownSessionError(stdout: string, stderr: string): bool
|
|||
haystack,
|
||||
);
|
||||
}
|
||||
|
||||
export function isCodexTransientUpstreamError(input: {
|
||||
stdout?: string | null;
|
||||
stderr?: string | null;
|
||||
errorMessage?: string | null;
|
||||
}): boolean {
|
||||
const haystack = [
|
||||
input.errorMessage ?? "",
|
||||
input.stdout ?? "",
|
||||
input.stderr ?? "",
|
||||
]
|
||||
.join("\n")
|
||||
.split(/\r?\n/)
|
||||
.map((line) => line.trim())
|
||||
.filter(Boolean)
|
||||
.join("\n");
|
||||
|
||||
if (!CODEX_TRANSIENT_UPSTREAM_RE.test(haystack)) return false;
|
||||
// Keep automatic retries scoped to the observed remote-compaction/high-demand
|
||||
// failure shape; broader 429s may be caused by user or account limits.
|
||||
return CODEX_REMOTE_COMPACTION_RE.test(haystack) || /high\s+demand|temporary\s+errors/i.test(haystack);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue