watchdog: self-heal FATAL session-state errors + supervise the orchestrator

- heal_session: detect the unrecoverable "thinking/redacted_thinking blocks cannot be modified" 400 (recurs every turn, session stays alive so the dead-check misses it) and kill+restart the loop fresh (re-orients from repo). Consolidates the dead/fatal/limit handling for builder+adversary.
- heal_orchestrator: keep the orchestrator alive too, conflict-safe. Restarts via launch-orchestrator.sh ONLY when no orchestrator is alive anywhere — liveness detects both a managed cc-ci-orchestrator tmux session AND a hand-launched terminal session (any non-loop claude), so it never double-resumes the conversation (the likely cause of the thinking-block crashes). Kill+restart if the managed session is wedged on the FATAL error. Toggle: WATCH_ORCHESTRATOR=0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-28 21:09:21 +01:00
parent 36a6c9872a
commit 11a2ce652d
1 changed files with 73 additions and 8 deletions
--- a/cc-ci-plan/launch.sh
+++ b/cc-ci-plan/launch.sh
@ -45,6 +45,12 @@ SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}"  # seconds between HANDOFF checks (ping
 BUILDER_SESSION="cc-ci-builder"
 ADV_SESSION="cc-ci-adv"
 WATCHDOG_SESSION="cc-ci-watchdog"
+# Orchestrator (supervisory session) — the watchdog keeps it alive too, via launch-orchestrator.sh.
+ORCH_SESSION="${ORCH_SESSION:-cc-ci-orchestrator}"
+ORCH_LAUNCHER="${ORCH_LAUNCHER:-$PLAN_DIR/launch-orchestrator.sh}"
+# Watchdog supervision of the orchestrator can be disabled (=0) if you run the orchestrator yourself
+# and don't want it auto-(re)launched.
+WATCH_ORCHESTRATOR="${WATCH_ORCHESTRATOR:-1}"

 # Ordered phase sequence: each entry "id|planfile|statusbasename". The watchdog runs them in order,
 # auto-transitions on the phase's "## DONE" (in BUILDER_DIR/<statusbasename>), and STOPS after the
@ -143,13 +149,71 @@ ping_session() {
 # the next nudge lands and the loop resumes. Gated on the limit text so we NEVER nudge a loop that is
 # just legitimately idle-waiting on a handoff.
 LIMIT_RE='spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)'
-nudge_if_limit_stalled() {
-  local s="$1" pane
+# FATAL = an unrecoverable session-state API error that recurs on EVERY turn (so the session stays
+# alive but wedged — a nudge can't fix it; only a fresh session can). The confirmed case: the
+# "thinking/redacted_thinking blocks ... cannot be modified" 400 that has hit the Adversary
+# repeatedly (interrupted-mid-thinking corrupts the replayed history). Kill + restart fresh; the loop
+# re-orients from the repo. Either fragment of that message counts as a match (used with grep -iE).
+# The pattern itself is loose; the callers' "esc to interrupt" guard keeps working panes out of it.
+FATAL_RE='redacted_thinking|cannot be modified'
+
+# Self-heal a single loop session according to the state it is found in:
+#   session gone            -> start it again
+#   FATAL error on screen   -> kill it and start a fresh one
+#   usage-limit text shown  -> nudge it to resume
+# A pane showing "esc to interrupt" is treated as actively working and is left untouched.
+# Args: $1 role label, $2 tmux session name, $3 directory handed on to start_agent.
+heal_session() {
+  local role="$1" s="$2" dir="$3" pane
+  if ! session_alive "$s"; then
+    log "$role ($s) gone — restarting (phase $(phase_id "$(cur_idx)"))"
+    start_agent "$role" "$s" "$dir"
+    return 0
+  fi
   pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -25 || true)"
-  if printf '%s\n' "$pane" | grep -q 'esc to interrupt'; then return 0; fi    # actively working
-  if ! printf '%s\n' "$pane" | grep -qiE "$LIMIT_RE"; then return 0; fi        # not a limit stall
-  log "limit-stall detected on $s — re-nudging to resume"
-  ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing."
+  if grep -q 'esc to interrupt' <<<"$pane"; then
+    return 0   # busy — hands off
+  fi
+  if grep -qiE "$FATAL_RE" <<<"$pane"; then
+    log "FATAL session-state error on $role ($s) — kill + restart fresh (re-orients from repo)"
+    tmux kill-session -t "$s" 2>/dev/null || true
+    start_agent "$role" "$s" "$dir"
+    return 0
+  elif grep -qiE "$LIMIT_RE" <<<"$pane"; then
+    log "limit-stall detected on $role ($s) — re-nudging to resume"
+    ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing."
+  fi
+}
+
+# Is an orchestrator process alive ANYWHERE? Conflict-safety: we must NEVER launch a second
+# orchestrator that resumes the same conversation while one is already running (that double-resume is
+# the likely cause of the "thinking blocks cannot be modified" crashes). The orchestrator may be
+# running as a managed tmux session (cc-ci-orchestrator) OR as a plain terminal session the operator
+# started by hand (no flags). So: alive iff any live `claude` process exists that is NOT one of the
+# two loop sessions (identified by their --remote-control name), or the managed tmux session exists.
+# Returns 0 when alive, 1 otherwise.
+orchestrator_alive() {
+  local pid args state
+  # The two loops are matched by their remote-control session NAME, not a stray path mention.
+  local loop_re="--remote-control +'?cc-ci-(builder|adv)'?"
+  for pid in $(pgrep -x claude 2>/dev/null); do
+    # Group the command so 2>/dev/null also covers a failing `<` redirection: on a bare command the
+    # redirections apply left to right, so the "No such file" error escapes before stderr is muted.
+    args="$({ tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null || true)"
+    if [[ -z "$args" ]]; then
+      # No command line: the process exited after pgrep listed it, or it is a defunct (zombie)
+      # entry. Neither can resume a conversation, so neither counts as alive. Any other cause of
+      # an empty cmdline (e.g. no procfs) still falls through to the fail-safe "alive" answer.
+      state="$(ps -o stat= -p "$pid" 2>/dev/null || true)"
+      [[ -z "$state" || "$state" == *Z* ]] && continue
+    fi
+    [[ "$args" =~ $loop_re ]] && continue   # builder/adversary loop — not the orchestrator
+    return 0   # a non-loop claude process => orchestrator (or operator) is alive
+  done
+  tmux has-session -t "$ORCH_SESSION" 2>/dev/null && return 0
+  return 1
+}
+
+# Keep the orchestrator alive: restart it (via launch-orchestrator.sh, which resumes its session) ONLY
+# when none is running; if it's the managed tmux session and wedged on a FATAL error, kill+restart.
+# No-op when WATCH_ORCHESTRATOR is not "1" or the launcher is not executable. Always returns 0; a
+# failed launch is logged with its exit status instead of being swallowed silently.
+heal_orchestrator() {
+  [[ "$WATCH_ORCHESTRATOR" == "1" ]] || return 0
+  [[ -x "$ORCH_LAUNCHER" ]] || return 0
+  local pane
+  if ! orchestrator_alive; then
+    log "orchestrator not running anywhere — restarting via $ORCH_LAUNCHER"
+    "$ORCH_LAUNCHER" start >/dev/null 2>&1 || log "orchestrator launch via $ORCH_LAUNCHER failed (exit $?)"
+    return 0
+  fi
+  # Alive somewhere. Only the managed tmux session has a pane we can inspect for a wedge.
+  tmux has-session -t "$ORCH_SESSION" 2>/dev/null || return 0
+  pane="$(tmux capture-pane -pt "$ORCH_SESSION" 2>/dev/null | tail -25 || true)"
+  printf '%s\n' "$pane" | grep -q 'esc to interrupt' && return 0   # working — leave alone
+  if printf '%s\n' "$pane" | grep -qiE "$FATAL_RE"; then
+    log "FATAL session-state error on orchestrator ($ORCH_SESSION) — kill + restart fresh"
+    tmux kill-session -t "$ORCH_SESSION" 2>/dev/null || true
+    "$ORCH_LAUNCHER" start >/dev/null 2>&1 || log "orchestrator launch via $ORCH_LAUNCHER failed (exit $?)"
+  fi
+  return 0
 }

 # Edge-triggered handoff signalling for the CURRENT phase. Reads the loops' local clones.
@ -239,8 +303,9 @@ watchdog_loop() {
          exit 0
        fi
      else
-        if session_alive "$BUILDER_SESSION"; then nudge_if_limit_stalled "$BUILDER_SESSION"; else log "builder gone — restarting (phase $pid)"; start_agent builder   "$BUILDER_SESSION" "$BUILDER_DIR"; fi
-        if session_alive "$ADV_SESSION";     then nudge_if_limit_stalled "$ADV_SESSION";     else log "adversary gone — restarting (phase $pid)"; start_agent adversary "$ADV_SESSION"     "$ADV_DIR"; fi
+        heal_session builder   "$BUILDER_SESSION" "$BUILDER_DIR"
+        heal_session adversary "$ADV_SESSION"     "$ADV_DIR"
+        heal_orchestrator
      fi
    fi
    sleep "$SIGNAL_INTERVAL"