watchdog: self-heal FATAL session-state errors + supervise the orchestrator
- heal_session: detect the unrecoverable "thinking/redacted_thinking blocks cannot be modified" 400 error (it recurs every turn while the session stays alive, so the dead-check misses it) and kill + restart the loop fresh (it re-orients from the repo). Consolidates the dead/fatal/limit handling for builder + adversary.
- heal_orchestrator: keep the orchestrator alive too, conflict-safe. Restarts via launch-orchestrator.sh ONLY when no orchestrator is alive anywhere — liveness detects both a managed cc-ci-orchestrator tmux session AND a hand-launched terminal session (any non-loop claude), so it never double-resumes the conversation (the likely cause of the thinking-block crashes). Kills + restarts if the managed session is wedged on the FATAL error.

Toggle: WATCH_ORCHESTRATOR=0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -45,6 +45,12 @@ SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}" # seconds between HANDOFF checks (ping
|
||||
BUILDER_SESSION="cc-ci-builder"
|
||||
ADV_SESSION="cc-ci-adv"
|
||||
WATCHDOG_SESSION="cc-ci-watchdog"
|
||||
# Orchestrator (supervisory session) — the watchdog keeps it alive too, via launch-orchestrator.sh.
|
||||
ORCH_SESSION="${ORCH_SESSION:-cc-ci-orchestrator}"
|
||||
ORCH_LAUNCHER="${ORCH_LAUNCHER:-$PLAN_DIR/launch-orchestrator.sh}"
|
||||
# Watchdog supervision of the orchestrator can be disabled (=0) if you run the orchestrator yourself
|
||||
# and don't want it auto-(re)launched.
|
||||
WATCH_ORCHESTRATOR="${WATCH_ORCHESTRATOR:-1}"
|
||||
|
||||
# Ordered phase sequence: each entry "id|planfile|statusbasename". The watchdog runs them in order,
|
||||
# auto-transitions on the phase's "## DONE" (in BUILDER_DIR/<statusbasename>), and STOPS after the
|
||||
@ -143,13 +149,71 @@ ping_session() {
|
||||
# the next nudge lands and the loop resumes. Gated on the limit text so we NEVER nudge a loop that is
|
||||
# just legitimately idle-waiting on a handoff.
|
||||
LIMIT_RE='spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)'
|
||||
nudge_if_limit_stalled() {
|
||||
local s="$1" pane
|
||||
# FATAL = an unrecoverable session-state API error that recurs on EVERY turn (so the session stays
|
||||
# alive but wedged — a nudge can't fix it; only a fresh session can). The confirmed case: the
|
||||
# "thinking/redacted_thinking blocks ... cannot be modified" 400 that has hit the Adversary
|
||||
# repeatedly (interrupted-mid-thinking corrupts the replayed history). Kill + restart fresh; the loop
|
||||
# re-orients from the repo. Matched conservatively so it never fires on transient/working states.
|
||||
FATAL_RE='redacted_thinking|blocks cannot be modified|cannot be modified'
|
||||
|
||||
#######################################
# Heal one loop session. Exactly one of these applies per call:
#   - session is dead                         -> restart it
#   - pane shows "esc to interrupt"           -> actively working, leave alone
#   - pane matches FATAL_RE (wedged session)  -> kill + restart fresh
#   - pane matches LIMIT_RE (limit stall)     -> nudge it once to resume
#   - anything else (e.g. idle-waiting)       -> no-op
# Globals:   FATAL_RE, LIMIT_RE (read)
# Arguments: $1 - role name passed to start_agent / used in log lines
#            $2 - tmux session name
#            $3 - directory passed to start_agent
# Returns:   0 always
#######################################
heal_session() {
  local role="$1" s="$2" dir="$3" pane

  if ! session_alive "$s"; then
    log "$role ($s) gone — restarting (phase $(phase_id "$(cur_idx)"))"
    start_agent "$role" "$s" "$dir"
    return 0
  fi

  # Only the last 25 lines of the pane are inspected, so old scrollback does not re-trigger.
  pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -25 || true)"

  # Actively working — never interfere, even if error text is still visible above.
  if printf '%s\n' "$pane" | grep -q 'esc to interrupt'; then
    return 0
  fi

  # FATAL is checked BEFORE the limit text: a nudge cannot fix a wedged session,
  # only a fresh one can.
  if printf '%s\n' "$pane" | grep -qiE "$FATAL_RE"; then
    log "FATAL session-state error on $role ($s) — kill + restart fresh (re-orients from repo)"
    tmux kill-session -t "$s" 2>/dev/null || true
    start_agent "$role" "$s" "$dir"
    return 0
  fi

  # Gated on the limit text so a loop that is merely idle-waiting is never nudged.
  if printf '%s\n' "$pane" | grep -qiE "$LIMIT_RE"; then
    log "limit-stall detected on $role ($s) — re-nudging to resume"
    ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing."
  fi
  return 0
}
|
||||
|
||||
#######################################
# Is an orchestrator process alive ANYWHERE? Conflict-safety: we must NEVER launch a second
# orchestrator that resumes the same conversation while one is already running (that double-resume
# is the likely cause of the "thinking blocks cannot be modified" crashes). The orchestrator may be
# running as a managed tmux session ($ORCH_SESSION) OR as a plain terminal session the operator
# started by hand (no flags). So: alive iff any `claude` process exists that is NOT one of the two
# loop sessions (identified by their --remote-control name), or the managed tmux session exists.
# Globals:   ORCH_SESSION (read)
# Arguments: none
# Returns:   0 if an orchestrator (or operator session) is alive, 1 otherwise
#######################################
orchestrator_alive() {
  local pid args
  for pid in $(pgrep -x claude 2>/dev/null); do
    # The process may exit between pgrep and this read. Grouping the command puts the failed
    # "<" redirection inside the 2>/dev/null scope, so its diagnostic is silenced as well.
    args="$( { tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null || true )"
    # An empty command line means the process is already gone (or has no argv to inspect):
    # it must not be counted as a live orchestrator.
    [[ -n "${args//[[:space:]]/}" ]] || continue
    # Skip the two loops (matched by their remote-control session NAME, not a stray path mention).
    if printf '%s' "$args" | grep -qE -- "--remote-control +'?cc-ci-(builder|adv)'?"; then
      continue
    fi
    return 0 # a non-loop claude process => orchestrator (or operator) is alive
  done
  if tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
    return 0
  fi
  return 1
}
|
||||
|
||||
#######################################
# Keep the orchestrator alive: (re)start it via $ORCH_LAUNCHER ONLY when none is running; if it is
# the managed tmux session and is wedged on a FATAL error (and not actively working), kill the
# session and relaunch. A failing launcher is logged instead of being silently ignored.
# Globals:   WATCH_ORCHESTRATOR, ORCH_LAUNCHER, ORCH_SESSION, FATAL_RE (read)
# Arguments: none
# Returns:   0 always (best-effort supervision)
#######################################
heal_orchestrator() {
  local pane rc=0

  # Supervision disabled, or no executable launcher available: nothing to do.
  [[ "$WATCH_ORCHESTRATOR" == "1" ]] || return 0
  [[ -x "$ORCH_LAUNCHER" ]] || return 0

  if orchestrator_alive; then
    # Alive but not as the managed tmux session (e.g. hand-launched): never touch it.
    if ! tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
      return 0
    fi
    pane="$(tmux capture-pane -pt "$ORCH_SESSION" 2>/dev/null | tail -25 || true)"
    # Working — leave alone.
    if printf '%s\n' "$pane" | grep -q 'esc to interrupt'; then
      return 0
    fi
    # Healthy (no FATAL text on screen): nothing to heal.
    if ! printf '%s\n' "$pane" | grep -qiE "$FATAL_RE"; then
      return 0
    fi
    log "FATAL session-state error on orchestrator ($ORCH_SESSION) — kill + restart fresh"
    tmux kill-session -t "$ORCH_SESSION" 2>/dev/null || true
  else
    log "orchestrator not running anywhere — restarting via $ORCH_LAUNCHER"
  fi

  # Single launch point for both paths; launcher output stays suppressed, but a non-zero exit
  # status is recorded so a broken launcher is visible in the watchdog log.
  "$ORCH_LAUNCHER" start >/dev/null 2>&1 || rc=$?
  if (( rc != 0 )); then
    log "orchestrator launcher $ORCH_LAUNCHER exited with status $rc"
  fi
  return 0
}
|
||||
|
||||
# Edge-triggered handoff signalling for the CURRENT phase. Reads the loops' local clones.
|
||||
@ -239,8 +303,9 @@ watchdog_loop() {
|
||||
exit 0
|
||||
fi
|
||||
else
|
||||
if session_alive "$BUILDER_SESSION"; then nudge_if_limit_stalled "$BUILDER_SESSION"; else log "builder gone — restarting (phase $pid)"; start_agent builder "$BUILDER_SESSION" "$BUILDER_DIR"; fi
|
||||
if session_alive "$ADV_SESSION"; then nudge_if_limit_stalled "$ADV_SESSION"; else log "adversary gone — restarting (phase $pid)"; start_agent adversary "$ADV_SESSION" "$ADV_DIR"; fi
|
||||
heal_session builder "$BUILDER_SESSION" "$BUILDER_DIR"
|
||||
heal_session adversary "$ADV_SESSION" "$ADV_DIR"
|
||||
heal_orchestrator
|
||||
fi
|
||||
fi
|
||||
sleep "$SIGNAL_INTERVAL"
|
||||
|
||||
Reference in New Issue
Block a user