2026-03-03 08:45:26 -06:00
import { asString , asNumber , parseObject , parseJson } from "@paperclipai/adapter-utils/server-utils" ;
2026-02-18 13:53:03 -06:00
[codex] Harden heartbeat scheduling and runtime controls (#4223)
## Thinking Path
> - Paperclip orchestrates AI agents through issue checkout, heartbeat
runs, routines, and auditable control-plane state
> - The runtime path has to recover from lost local processes, transient
adapter failures, blocked dependencies, and routine coalescing without
stranding work
> - The existing branch carried several reliability fixes across
heartbeat scheduling, issue runtime controls, routine dispatch, and
operator-facing run state
> - These changes belong together because they share backend contracts,
migrations, and runtime status semantics
> - This pull request groups the control-plane/runtime slice so it can
merge independently from board UI polish and adapter sandbox work
> - The benefit is safer heartbeat recovery, clearer runtime controls,
and more predictable recurring execution behavior
## What Changed
- Adds bounded heartbeat retry scheduling, scheduled retry state, and
Codex transient failure recovery handling.
- Tightens heartbeat process recovery, blocker wake behavior, issue
comment wake handling, routine dispatch coalescing, and
activity/dashboard bounds.
- Adds runtime-control MCP tools and Paperclip skill docs for issue
workspace runtime management.
- Adds migrations `0061_lively_thor_girl.sql` and
`0062_routine_run_dispatch_fingerprint.sql`.
- Surfaces retry state in run ledger/agent UI and keeps related shared
types synchronized.
## Verification
- `pnpm exec vitest run
server/src/__tests__/heartbeat-retry-scheduling.test.ts
server/src/__tests__/heartbeat-process-recovery.test.ts
server/src/__tests__/routines-service.test.ts`
- `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server`
## Risks
- Medium risk: this touches heartbeat recovery and routine dispatch,
which are central execution paths.
- Migration order matters if split branches land out of order: merge
this PR before branches that assume the new runtime/routine fields.
- Runtime retry behavior should be watched in CI and in local operator
smoke tests because it changes how transient failures are resumed.
> For core feature work, check [`ROADMAP.md`](ROADMAP.md) and discuss the
change in `#dev` before opening the PR. Feature PRs that overlap with
planned core work may need to be redirected. See `CONTRIBUTING.md`.
## Model Used
- OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use
enabled. Exact hosted model build and context window are not exposed in
this Paperclip heartbeat environment.
## Checklist
- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge
2026-04-21 12:24:11 -05:00
/**
 * Matches output text that looks like a transient upstream failure:
 * "we're currently experiencing high demand", "temporary errors",
 * rate limiting ("rate limit", "rate-limited", "too many requests", a
 * standalone 429), "server overloaded", "service unavailable" and
 * "try again later". Matching is case-insensitive.
 *
 * The apostrophe in "we're" may be either the ASCII quote or the typographic
 * right single quotation mark (U+2019); it is written as an escape so the
 * pattern does not depend on the file's encoding.
 */
const CODEX_TRANSIENT_UPSTREAM_RE =
  /(?:we['\u2019]re\s+currently\s+experiencing\s+high\s+demand|temporary\s+errors|rate[-\s]?limit(?:ed)?|too\s+many\s+requests|\b429\b|server\s+overloaded|service\s+unavailable|try\s+again\s+later)/i;

/** Matches a mention of the "remote compact task" (case-insensitive, any whitespace between words). */
const CODEX_REMOTE_COMPACTION_RE = /remote\s+compact\s+task/i;
2026-02-18 13:53:03 -06:00
/**
 * Scans newline-delimited JSON events in `stdout` and extracts the session
 * id, the last agent message, token usage and the last reported error.
 *
 * Blank lines and lines for which `parseJson` yields a falsy value are
 * skipped. Events are dispatched on their `type` field:
 *
 * - `thread.started`: records `thread_id` as the session id (a missing or
 *   empty id keeps the previous value).
 * - `error`: records the trimmed `message`, if non-empty.
 * - `item.completed`: for items of type `agent_message`, records a non-empty
 *   `text` as the latest message.
 * - `turn.completed`: overwrites the usage counters from `usage`, keeping the
 *   previous value for any field `asNumber` cannot read.
 * - `turn.failed`: records the trimmed `error.message`, if non-empty.
 *
 * Later events win: only the last message and last error are kept.
 *
 * @param stdout Raw process output, one JSON event per line.
 * @returns `sessionId` (or `null`), `summary` (trimmed last agent message, or
 *   `""`), `usage` counters, and `errorMessage` (or `null`).
 */
export function parseCodexJsonl(stdout: string) {
  let threadId: string | null = null;
  let lastAgentMessage: string | null = null;
  let lastError: string | null = null;
  const usage = {
    inputTokens: 0,
    cachedInputTokens: 0,
    outputTokens: 0,
  };

  for (const raw of stdout.split(/\r?\n/)) {
    const trimmed = raw.trim();
    if (trimmed.length === 0) continue;

    const event = parseJson(trimmed);
    if (!event) continue;

    switch (asString(event.type, "")) {
      case "thread.started": {
        // An empty/missing thread_id falls back to whatever id we already have.
        threadId = asString(event.thread_id, threadId ?? "") || threadId;
        break;
      }
      case "error": {
        const message = asString(event.message, "").trim();
        if (message) lastError = message;
        break;
      }
      case "item.completed": {
        const item = parseObject(event.item);
        if (asString(item.type, "") !== "agent_message") break;
        const text = asString(item.text, "");
        if (text) lastAgentMessage = text;
        break;
      }
      case "turn.completed": {
        const reported = parseObject(event.usage);
        usage.inputTokens = asNumber(reported.input_tokens, usage.inputTokens);
        usage.cachedInputTokens = asNumber(reported.cached_input_tokens, usage.cachedInputTokens);
        usage.outputTokens = asNumber(reported.output_tokens, usage.outputTokens);
        break;
      }
      case "turn.failed": {
        const failure = parseObject(event.error);
        const message = asString(failure.message, "").trim();
        if (message) lastError = message;
        break;
      }
      default:
        // Other event types carry nothing this parser records.
        break;
    }
  }

  return {
    sessionId: threadId,
    summary: lastAgentMessage?.trim() ?? "",
    usage,
    errorMessage: lastError,
  };
}
2026-02-19 14:01:58 -06:00
/**
 * Reports whether the combined process output contains a message indicating
 * that the referenced session, thread or conversation could not be found
 * (including the "missing rollout path" / "no rollout found" variants).
 *
 * @param stdout Captured standard output.
 * @param stderr Captured standard error.
 * @returns `true` when any of the known "unknown session" phrases appears
 *   (case-insensitive), otherwise `false`.
 */
export function isCodexUnknownSessionError(stdout: string, stderr: string): boolean {
  // Join the streams with a real newline: `.` does not match line breaks, so
  // the `... .* not found` alternatives can only match within a single line
  // and never across the stdout/stderr boundary.
  const haystack = `${stdout}\n${stderr}`
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean)
    .join("\n");

  return /unknown (session|thread)|session .* not found|thread .* not found|conversation .* not found|missing rollout path for thread|state db missing rollout path|no rollout found for thread id/i.test(
    haystack,
  );
}
[codex] Harden heartbeat scheduling and runtime controls (#4223)
## Thinking Path
> - Paperclip orchestrates AI agents through issue checkout, heartbeat
runs, routines, and auditable control-plane state
> - The runtime path has to recover from lost local processes, transient
adapter failures, blocked dependencies, and routine coalescing without
stranding work
> - The existing branch carried several reliability fixes across
heartbeat scheduling, issue runtime controls, routine dispatch, and
operator-facing run state
> - These changes belong together because they share backend contracts,
migrations, and runtime status semantics
> - This pull request groups the control-plane/runtime slice so it can
merge independently from board UI polish and adapter sandbox work
> - The benefit is safer heartbeat recovery, clearer runtime controls,
and more predictable recurring execution behavior
## What Changed
- Adds bounded heartbeat retry scheduling, scheduled retry state, and
Codex transient failure recovery handling.
- Tightens heartbeat process recovery, blocker wake behavior, issue
comment wake handling, routine dispatch coalescing, and
activity/dashboard bounds.
- Adds runtime-control MCP tools and Paperclip skill docs for issue
workspace runtime management.
- Adds migrations `0061_lively_thor_girl.sql` and
`0062_routine_run_dispatch_fingerprint.sql`.
- Surfaces retry state in run ledger/agent UI and keeps related shared
types synchronized.
## Verification
- `pnpm exec vitest run
server/src/__tests__/heartbeat-retry-scheduling.test.ts
server/src/__tests__/heartbeat-process-recovery.test.ts
server/src/__tests__/routines-service.test.ts`
- `pnpm exec vitest run src/tools.test.ts` from `packages/mcp-server`
## Risks
- Medium risk: this touches heartbeat recovery and routine dispatch,
which are central execution paths.
- Migration order matters if split branches land out of order: merge
this PR before branches that assume the new runtime/routine fields.
- Runtime retry behavior should be watched in CI and in local operator
smoke tests because it changes how transient failures are resumed.
> For core feature work, check [`ROADMAP.md`](ROADMAP.md) and discuss the
change in `#dev` before opening the PR. Feature PRs that overlap with
planned core work may need to be redirected. See `CONTRIBUTING.md`.
## Model Used
- OpenAI Codex, GPT-5-based coding agent runtime, shell/git tool use
enabled. Exact hosted model build and context window are not exposed in
this Paperclip heartbeat environment.
## Checklist
- [x] I have included a thinking path that traces from project context
to this change
- [x] I have specified the model used (with version and capability
details)
- [x] I have checked ROADMAP.md and confirmed this PR does not duplicate
planned core work
- [x] I have run tests locally and they pass
- [x] I have added or updated tests where applicable
- [ ] If this change affects the UI, I have included before/after
screenshots
- [x] I have updated relevant documentation to reflect my changes
- [x] I have considered and documented any risks above
- [x] I will address all Greptile and reviewer comments before
requesting merge
2026-04-21 12:24:11 -05:00
/**
 * Decides whether a failed run looks like a transient upstream failure that
 * is safe to retry automatically.
 *
 * The error message, stdout and stderr are concatenated (in that order),
 * split into lines, trimmed, and stripped of blank lines. The result must
 * match `CODEX_TRANSIENT_UPSTREAM_RE` and, in addition, either mention the
 * remote compaction task or contain "high demand" / "temporary errors".
 *
 * @param input Captured output; every field is optional and `null`/missing
 *   values are treated as empty strings.
 * @returns `true` only for the narrow retryable failure shape described above.
 */
export function isCodexTransientUpstreamError(input: {
  stdout?: string | null;
  stderr?: string | null;
  errorMessage?: string | null;
}): boolean {
  const { errorMessage, stdout, stderr } = input;
  const combined = `${errorMessage ?? ""}\n${stdout ?? ""}\n${stderr ?? ""}`;
  const haystack = combined
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .join("\n");

  if (!CODEX_TRANSIENT_UPSTREAM_RE.test(haystack)) return false;

  // Auto-retry is deliberately limited to the remote-compaction / high-demand
  // failures that were actually observed. A bare 429 or rate-limit message may
  // stem from user or account limits, which a retry would not resolve.
  if (CODEX_REMOTE_COMPACTION_RE.test(haystack)) return true;
  return /high\s+demand|temporary\s+errors/i.test(haystack);
}