diff --git a/cc-ci-plan/launch.sh b/cc-ci-plan/launch.sh index b27f04d..2b5d44e 100755 --- a/cc-ci-plan/launch.sh +++ b/cc-ci-plan/launch.sh @@ -45,6 +45,12 @@ SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}" # seconds between HANDOFF checks (ping BUILDER_SESSION="cc-ci-builder" ADV_SESSION="cc-ci-adv" WATCHDOG_SESSION="cc-ci-watchdog" +# Orchestrator (supervisory session) — the watchdog keeps it alive too, via launch-orchestrator.sh. +ORCH_SESSION="${ORCH_SESSION:-cc-ci-orchestrator}" +ORCH_LAUNCHER="${ORCH_LAUNCHER:-$PLAN_DIR/launch-orchestrator.sh}" +# Watchdog supervision of the orchestrator can be disabled (=0) if you run the orchestrator yourself +# and don't want it auto-(re)launched. +WATCH_ORCHESTRATOR="${WATCH_ORCHESTRATOR:-1}" # Ordered phase sequence: each entry "id|planfile|statusbasename". The watchdog runs them in order, # auto-transitions on the phase's "## DONE" (in BUILDER_DIR/), and STOPS after the @@ -143,13 +149,71 @@ ping_session() { # the next nudge lands and the loop resumes. Gated on the limit text so we NEVER nudge a loop that is # just legitimately idle-waiting on a handoff. LIMIT_RE='spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)' -nudge_if_limit_stalled() { - local s="$1" pane +# FATAL = an unrecoverable session-state API error that recurs on EVERY turn (so the session stays +# alive but wedged — a nudge can't fix it; only a fresh session can). The confirmed case: the +# "thinking/redacted_thinking blocks ... cannot be modified" 400 that has hit the Adversary +# repeatedly (interrupted-mid-thinking corrupts the replayed history). Kill + restart fresh; the loop +# re-orients from the repo. Matched conservatively so it never fires on transient/working states. +FATAL_RE='redacted_thinking|blocks cannot be modified|cannot be modified' + +# Heal one loop session: dead -> restart; wedged on a FATAL error -> kill + restart fresh; stalled on +# a usage limit -> nudge. 
# No-op while actively working ("esc to interrupt" on screen).
#
# Arguments:
#   $1 - role label handed to start_agent and echoed in log lines
#   $2 - tmux session name to inspect
#   $3 - directory handed through to start_agent
# Globals read: FATAL_RE, LIMIT_RE
heal_session() {
  local who="$1"
  local sess="$2"
  local workdir="$3"
  local screen

  # Session is missing entirely: bring it back and stop here.
  if ! session_alive "$sess"; then
    log "$who ($sess) gone — restarting (phase $(phase_id "$(cur_idx)"))"
    start_agent "$who" "$sess" "$workdir"
    return 0
  fi

  # Only the last 25 lines of the pane are inspected for status markers.
  screen="$(tmux capture-pane -pt "$sess" 2>/dev/null | tail -25 || true)"

  # Actively working — leave alone.
  if grep -q 'esc to interrupt' <<<"$screen"; then
    return 0
  fi

  if grep -qiE "$FATAL_RE" <<<"$screen"; then
    # Wedged on an unrecoverable error: a nudge cannot help, so replace the session.
    log "FATAL session-state error on $who ($sess) — kill + restart fresh (re-orients from repo)"
    tmux kill-session -t "$sess" 2>/dev/null || true
    start_agent "$who" "$sess" "$workdir"
    return 0
  elif grep -qiE "$LIMIT_RE" <<<"$screen"; then
    # Idle with limit text on screen: prompt the loop to pick up again.
    log "limit-stall detected on $who ($sess) — re-nudging to resume"
    ping_session "$sess" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing."
  fi
}

# Is an orchestrator process alive ANYWHERE? Conflict-safety: we must NEVER launch a second
# orchestrator that resumes the same conversation while one is already running (that double-resume is
# the likely cause of the "thinking blocks cannot be modified" crashes). The orchestrator may be
# running as a managed tmux session (cc-ci-orchestrator) OR as a plain terminal session the operator
# started by hand (no flags).
# So: alive iff any `claude` process exists that is NOT one of the two
# loop sessions (identified by their --remote-control name), or the managed tmux session exists.
#
# Globals read: ORCH_SESSION
# Returns: 0 if a non-loop `claude` process or the managed tmux session is found, 1 otherwise.
orchestrator_alive() {
  local pid args
  # pgrep prints one numeric pid per line; word-splitting the output is intentional.
  for pid in $(pgrep -x claude 2>/dev/null); do
    # stderr is redirected BEFORE the input redirection so a missing /proc entry stays quiet.
    args="$(tr '\0' ' ' 2>/dev/null <"/proc/$pid/cmdline" || true)"
    # Nothing readable AND the pid can no longer be signalled => it exited between pgrep and
    # the read. A vanished process must not be counted as a live orchestrator. If the pid is
    # still signalable we stay conservative and fall through (treated as alive below).
    if [[ -z "$args" ]] && ! kill -0 "$pid" 2>/dev/null; then
      continue
    fi
    # skip the two loops (matched by their remote-control session NAME, not a stray path mention)
    printf '%s' "$args" | grep -qE -- "--remote-control +'?cc-ci-(builder|adv)'?" && continue
    return 0   # a non-loop claude process => orchestrator (or operator) is alive
  done
  tmux has-session -t "$ORCH_SESSION" 2>/dev/null && return 0
  return 1
}

# Run the orchestrator launcher's `start` action, best-effort: its output is discarded and a
# failure never propagates to the caller, but a non-zero exit is recorded in the log.
# Globals read: ORCH_LAUNCHER
_orch_launch_logged() {
  local rc=0
  "$ORCH_LAUNCHER" start >/dev/null 2>&1 || rc=$?
  if (( rc != 0 )); then
    log "orchestrator launcher failed (exit $rc): $ORCH_LAUNCHER"
  fi
  return 0
}

# Keep the orchestrator alive: restart it (via launch-orchestrator.sh, which resumes its session) ONLY
# when none is running; if it's the managed tmux session and wedged on a FATAL error, kill+restart.
#
# Globals read: WATCH_ORCHESTRATOR, ORCH_LAUNCHER, ORCH_SESSION, FATAL_RE
# Returns: always 0.
heal_orchestrator() {
  # Supervision switched off, or no executable launcher available: nothing to do.
  [[ "$WATCH_ORCHESTRATOR" == "1" ]] || return 0
  [[ -x "$ORCH_LAUNCHER" ]] || return 0

  if orchestrator_alive; then
    # Only the managed tmux session has a pane we can inspect; a hand-started one is left alone.
    if tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
      local pane
      pane="$(tmux capture-pane -pt "$ORCH_SESSION" 2>/dev/null | tail -25 || true)"
      printf '%s\n' "$pane" | grep -q 'esc to interrupt' && return 0   # working — leave alone
      if printf '%s\n' "$pane" | grep -qiE "$FATAL_RE"; then
        log "FATAL session-state error on orchestrator ($ORCH_SESSION) — kill + restart fresh"
        tmux kill-session -t "$ORCH_SESSION" 2>/dev/null || true
        _orch_launch_logged
      fi
    fi
    return 0
  fi

  log "orchestrator not running anywhere — restarting via $ORCH_LAUNCHER"
  _orch_launch_logged
}

# Edge-triggered handoff signalling for the CURRENT phase. Reads the loops' local clones.
@@ -239,8 +303,9 @@ watchdog_loop() { exit 0 fi else - if session_alive "$BUILDER_SESSION"; then nudge_if_limit_stalled "$BUILDER_SESSION"; else log "builder gone — restarting (phase $pid)"; start_agent builder "$BUILDER_SESSION" "$BUILDER_DIR"; fi - if session_alive "$ADV_SESSION"; then nudge_if_limit_stalled "$ADV_SESSION"; else log "adversary gone — restarting (phase $pid)"; start_agent adversary "$ADV_SESSION" "$ADV_DIR"; fi + heal_session builder "$BUILDER_SESSION" "$BUILDER_DIR" + heal_session adversary "$ADV_SESSION" "$ADV_DIR" + heal_orchestrator fi fi sleep "$SIGNAL_INTERVAL"