watchdog: self-heal FATAL session-state errors + supervise the orchestrator

- heal_session: detect the unrecoverable "thinking/redacted_thinking blocks cannot
  be modified" 400 (recurs every turn, session stays alive so the dead-check misses
  it) and kill+restart the loop fresh (re-orients from repo). Consolidates the
  dead/fatal/limit handling for builder+adversary.
- heal_orchestrator: keep the orchestrator alive too, conflict-safe. Restarts via
  launch-orchestrator.sh ONLY when no orchestrator is alive anywhere — liveness
  detects both a managed cc-ci-orchestrator tmux session AND a hand-launched
  terminal session (any non-loop claude), so it never double-resumes the
  conversation (the likely cause of the thinking-block crashes). Kill+restart if
  the managed session is wedged on the FATAL error. Toggle: WATCH_ORCHESTRATOR=0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-28 21:09:21 +01:00
parent 36a6c9872a
commit 11a2ce652d

View File

@ -45,6 +45,12 @@ SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}" # seconds between HANDOFF checks (ping
BUILDER_SESSION="cc-ci-builder"
ADV_SESSION="cc-ci-adv"
WATCHDOG_SESSION="cc-ci-watchdog"
# Orchestrator (supervisory session) — the watchdog keeps it alive too, via launch-orchestrator.sh.
ORCH_SESSION="${ORCH_SESSION:-cc-ci-orchestrator}"
ORCH_LAUNCHER="${ORCH_LAUNCHER:-$PLAN_DIR/launch-orchestrator.sh}"
# Watchdog supervision of the orchestrator can be disabled (=0) if you run the orchestrator yourself
# and don't want it auto-(re)launched.
WATCH_ORCHESTRATOR="${WATCH_ORCHESTRATOR:-1}"
# Ordered phase sequence: each entry "id|planfile|statusbasename". The watchdog runs them in order,
# auto-transitions on the phase's "## DONE" (in BUILDER_DIR/<statusbasename>), and STOPS after the
@ -143,13 +149,71 @@ ping_session() {
# the next nudge lands and the loop resumes. Gated on the limit text so we NEVER nudge a loop that is
# just legitimately idle-waiting on a handoff.
LIMIT_RE='spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)'
# Limit-only check for one loop session that the caller already knows is alive: when the pane is
# idle (no "esc to interrupt") and its last 25 lines show usage/spend-limit text, nudge the session
# to resume. Does nothing otherwise.
#   $1 - tmux session name
nudge_if_limit_stalled() {
  local s="$1" pane
  pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -25 || true)"
  if grep -q 'esc to interrupt' <<<"$pane"; then return 0; fi   # actively working
  if ! grep -qiE -- "$LIMIT_RE" <<<"$pane"; then return 0; fi   # not a limit stall
  log "limit-stall detected on $s — re-nudging to resume"
  ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing."
}

# FATAL = an unrecoverable session-state API error that recurs on EVERY turn (so the session stays
# alive but wedged — a nudge can't fix it; only a fresh session can). The confirmed case: the
# "thinking/redacted_thinking blocks ... cannot be modified" 400 that has hit the Adversary
# repeatedly (interrupted-mid-thinking corrupts the replayed history). Kill + restart fresh; the loop
# re-orients from the repo.
# NOTE(review): the bare 'cannot be modified' alternative subsumes 'blocks cannot be modified' and is
# the broadest match here; it is only ever applied to an idle pane (see heal_session).
FATAL_RE='redacted_thinking|blocks cannot be modified|cannot be modified'

# Heal one loop session. Checks are made in this order, and the first that applies wins:
#   1. session dead                      -> restart it
#   2. "esc to interrupt" on screen      -> actively working, leave it alone
#   3. pane matches FATAL_RE             -> kill the session and start a fresh one
#   4. pane matches LIMIT_RE             -> nudge it to resume
# Only the last 25 lines of the pane are inspected.
#   $1 - role name passed to start_agent (e.g. builder, adversary)
#   $2 - tmux session name
#   $3 - working directory passed to start_agent
heal_session() {
  local role="$1" s="$2" dir="$3" pane

  if ! session_alive "$s"; then
    log "$role ($s) gone — restarting (phase $(phase_id "$(cur_idx)"))"
    start_agent "$role" "$s" "$dir"
    return 0
  fi

  pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -25 || true)"

  # Actively working — never interfere, even if old error text is still in the scrollback tail.
  if grep -q 'esc to interrupt' <<<"$pane"; then
    return 0
  fi

  # The FATAL check must come before the limit check: a wedged session cannot be nudged back to life.
  if grep -qiE -- "$FATAL_RE" <<<"$pane"; then
    log "FATAL session-state error on $role ($s) — kill + restart fresh (re-orients from repo)"
    tmux kill-session -t "$s" 2>/dev/null || true
    start_agent "$role" "$s" "$dir"
    return 0
  fi

  if grep -qiE -- "$LIMIT_RE" <<<"$pane"; then
    log "limit-stall detected on $role ($s) — re-nudging to resume"
    ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing."
  fi
}
# Is an orchestrator process alive ANYWHERE? Conflict-safety: we must NEVER launch a second
# orchestrator that resumes the same conversation while one is already running (that double-resume is
# the likely cause of the "thinking blocks cannot be modified" crashes). The orchestrator may be
# running as a managed tmux session ($ORCH_SESSION) OR as a plain terminal session the operator
# started by hand (no flags). So: alive iff any `claude` process exists that is NOT one of the two
# loop sessions (identified by their --remote-control name), or the managed tmux session exists.
# Returns: 0 if an orchestrator is considered alive, 1 otherwise.
orchestrator_alive() {
  local pid args
  while IFS= read -r pid; do
    [[ "$pid" =~ ^[0-9]+$ ]] || continue
    # Group the read so that a failed `<` redirection (process already gone) is silenced as well.
    args="$({ tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null || true)"
    # An empty/unreadable cmdline means the process exited after pgrep listed it (or is a zombie):
    # that is not a live orchestrator, so it must not block a restart.
    [[ -n "${args// /}" ]] || continue
    # skip the two loops (matched by their remote-control session NAME, not a stray path mention)
    if grep -qE -- "--remote-control +'?cc-ci-(builder|adv)'?" <<<"$args"; then
      continue
    fi
    return 0   # a non-loop claude process => orchestrator (or operator) is alive
  done < <(pgrep -x claude 2>/dev/null || true)

  if tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
    return 0
  fi
  return 1
}
# Keep the orchestrator alive: restart it (via launch-orchestrator.sh, which resumes its session) ONLY
# when none is running; if it's the managed tmux session and wedged on a FATAL error, kill+restart.
# An orchestrator that is alive but not in the managed tmux session (hand-launched) is never touched.
# No-op when WATCH_ORCHESTRATOR is not "1" or $ORCH_LAUNCHER is not executable.
# Returns: always 0 — a failed launch is logged, not propagated, so the watchdog loop keeps running
# and can retry on a later pass.
heal_orchestrator() {
  [[ "$WATCH_ORCHESTRATOR" == "1" ]] || return 0
  [[ -x "$ORCH_LAUNCHER" ]] || return 0

  local pane
  if orchestrator_alive; then
    # Alive outside the managed tmux session: we cannot inspect it, so leave it alone.
    tmux has-session -t "$ORCH_SESSION" 2>/dev/null || return 0
    pane="$(tmux capture-pane -pt "$ORCH_SESSION" 2>/dev/null | tail -25 || true)"
    if grep -q 'esc to interrupt' <<<"$pane"; then return 0; fi   # working — leave alone
    if ! grep -qiE -- "$FATAL_RE" <<<"$pane"; then return 0; fi   # idle but healthy
    log "FATAL session-state error on orchestrator ($ORCH_SESSION) — kill + restart fresh"
    tmux kill-session -t "$ORCH_SESSION" 2>/dev/null || true
  else
    log "orchestrator not running anywhere — restarting via $ORCH_LAUNCHER"
  fi

  # Launcher output stays suppressed, but a failure is recorded instead of being silently dropped.
  "$ORCH_LAUNCHER" start >/dev/null 2>&1 \
    || log "orchestrator launcher failed (exit $?): $ORCH_LAUNCHER start"
  return 0
}
# Edge-triggered handoff signalling for the CURRENT phase. Reads the loops' local clones.
@ -239,8 +303,9 @@ watchdog_loop() {
exit 0
fi
else
if session_alive "$BUILDER_SESSION"; then nudge_if_limit_stalled "$BUILDER_SESSION"; else log "builder gone — restarting (phase $pid)"; start_agent builder "$BUILDER_SESSION" "$BUILDER_DIR"; fi
if session_alive "$ADV_SESSION"; then nudge_if_limit_stalled "$ADV_SESSION"; else log "adversary gone — restarting (phase $pid)"; start_agent adversary "$ADV_SESSION" "$ADV_DIR"; fi
heal_session builder "$BUILDER_SESSION" "$BUILDER_DIR"
heal_session adversary "$ADV_SESSION" "$ADV_DIR"
heal_orchestrator
fi
fi
sleep "$SIGNAL_INTERVAL"