cc-ci-orchestrator/cc-ci-plan/launch.sh

#!/usr/bin/env bash
#
# launch.sh — start and supervise the two cc-ci autonomous loops + a phase-aware watchdog.
#
# Model (see plan.md §6 / §6.1): two INDEPENDENT Claude Code sessions —
#   • Builder   (tmux session: cc-ci-builder)   working clone /srv/cc-ci/cc-ci
#   • Adversary (tmux session: cc-ci-adv)        working clone /srv/cc-ci/cc-ci-adv
# coordinating only through the git repo on git.autonomic.zone.
#
# PHASES: the watchdog runs an ordered sequence of sub-phases (default, per PHASES_SPEC below:
# 1c → 1b → 1d → 1e → 2w → 2pc → 2 → 2b → 3 → 4 → 5; 2w = warm-canonical/--quick, interjected;
# Phase 2 pauses for it then resumes).
# Each phase has its own plan + phase-namespaced loop-state files (STATUS-<id>.md etc.). When a phase's
# STATUS-<id>.md shows "## DONE", the watchdog AUTO-TRANSITIONS to the next phase; after the LAST
# phase in the sequence (by default 5, verify-upgrade-flow) it STOPS the loops and exits (end of
# the whole build).
#
# Four jobs: ITERATION (each agent's /loop), RESILIENCE (restart a dead loop), HANDOFF SIGNALLING
# (ping the waiting loop the moment its counterpart hands off), PHASE SEQUENCING (this file).
#
# Usage:
#   ./launch.sh start       # start the sequence at phase 0 + watchdog (stops/relaunches loops)
#   ./launch.sh watchdog    # run only the supervision loop in the foreground
#   ./launch.sh status      # show phase + session + DONE state
#   ./launch.sh logs builder|adversary|watchdog   # tail a session/log
#   ./launch.sh stop        # stop both loops + watchdog

set -euo pipefail

# Canonical absolute path of this script. The watchdog's tmux session re-invokes "$SELF", so it
# has to resolve no matter which directory the caller was in.
SELF="$(readlink -f "${BASH_SOURCE[0]}")"

# ----- config -------------------------------------------------------------
# Each ': "${VAR:=default}"' line keeps a non-empty value supplied via the environment and
# otherwise assigns the default written after ':='.
: "${PLAN_DIR:=/srv/cc-ci/cc-ci-plan}"
: "${CLAUDE_BIN:=claude}"
: "${CLAUDE_FLAGS:=--dangerously-skip-permissions}"
# REMOTE_CONTROL=1 → interactive --remote-control sessions (viewable at claude.ai/code), required
# for /loop. The box must be logged into the claude.ai account. =0 for plain interactive.
: "${REMOTE_CONTROL:=1}"

: "${BUILDER_DIR:=/srv/cc-ci/cc-ci}"       # Builder's repo clone
: "${ADV_DIR:=/srv/cc-ci/cc-ci-adv}"       # Adversary's repo clone
: "${LOG_DIR:=/srv/cc-ci/.cc-ci-logs}"

# Watchdog timing (all in seconds):
#   WATCH_INTERVAL  — between HEAVY checks (phase DONE / restart dead loops)
#   SIGNAL_INTERVAL — between HANDOFF checks (ping the waiting loop)
#   STALL_IDLE      — NO-marker case: how long a loop may sit idle (turn ended without declaring
#                     a wait) before the watchdog reboots it
#   STALL_GRACE     — marker case: how far PAST a loop's WAITING-UNTIL before reboot. The real
#                     ScheduleWakeup fires AT the stated time; grace covers wake+start latency +
#                     marker/scheduler skew so the watchdog never RACES (pre-empts) a healthy
#                     self-wake.
: "${WATCH_INTERVAL:=300}"
: "${SIGNAL_INTERVAL:=30}"
: "${STALL_IDLE:=300}"
: "${STALL_GRACE:=180}"

# tmux session names (fixed, not overridable).
BUILDER_SESSION=cc-ci-builder
ADV_SESSION=cc-ci-adv
WATCHDOG_SESSION=cc-ci-watchdog
# Orchestrator (supervisory session) — the watchdog keeps it alive too, via launch-orchestrator.sh.
: "${ORCH_SESSION:=cc-ci-orchestrator}"
: "${ORCH_LAUNCHER:=$PLAN_DIR/launch-orchestrator.sh}"
# Set WATCH_ORCHESTRATOR=0 if you run the orchestrator yourself and do not want the watchdog to
# auto-(re)launch it.
: "${WATCH_ORCHESTRATOR:=1}"

# Ordered phase sequence. Every entry is "id|planfile|statusbasename"; entries are joined with ';'
# to form PHASES_SPEC. The watchdog runs them in order, auto-transitions on the phase's "## DONE"
# (in BUILDER_DIR/<statusbasename>), and STOPS after the last one (manual gate). Override
# PHASES_SPEC (semicolon-separated) to change the sequence.
_default_phase_entries=(
  '1c|plan-phase1c-full-reproducibility.md|STATUS-1c.md'
  '1b|plan-phase1b-review-lint.md|STATUS-1b.md'
  '1d|plan-phase1d-generic-test-suite.md|STATUS-1d.md'
  '1e|plan-phase1e-harness-corrections.md|STATUS-1e.md'
  '2w|plan-phase2w-warm-canonical-quick.md|STATUS-2w.md'
  '2pc|plan-phase2pc-image-cache.md|STATUS-2pc.md'
  '2|plan-phase2-recipe-tests.md|STATUS-2.md'
  '2b|plan-phase2b-test-performance.md|STATUS-2b.md'
  '3|plan-phase3-results-ux.md|STATUS-3.md'
  '4|plan-phase4-final-review-polish-cleanup.md|STATUS-4.md'
  '5|plan-phase5-verify-upgrade-flow.md|STATUS-5.md'
)
# "${array[*]}" joins on the first character of IFS; the IFS change stays inside the subshell.
_default_phases_spec="$(IFS=';'; printf '%s' "${_default_phase_entries[*]}")"
: "${PHASES_SPEC:=$_default_phases_spec}"
unset _default_phase_entries _default_phases_spec

# PHASES holds one "id|planfile|statusbasename" entry per element.
IFS=';' read -r -a PHASES <<< "$PHASES_SPEC"
: "${PHASE_IDX_FILE:=$LOG_DIR/.phase-idx}"
# --------------------------------------------------------------------------

# Print one timestamped line "[launch HH:MM:SS] <message>" to stdout; all arguments are joined
# with spaces to form the message.
log() {
  local stamp
  printf -v stamp '%(%H:%M:%S)T' -1   # -1 = current time, formatted by the printf builtin
  printf '[launch %s] %s\n' "$stamp" "$*"
}
# Report a fatal error and terminate the script with exit status 1.
# The message is formatted by log() but sent to stderr, so diagnostics never mix into stdout
# that a caller may be capturing.
die() {
  log "ERROR: $*" >&2
  exit 1
}
# Abort via die() unless the command named by $1 can be resolved by the shell.
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "missing dependency: $1"
  fi
}

# ----- phase helpers ------------------------------------------------------
# Echo the current phase index (0-based) stored in $PHASE_IDX_FILE.
# A missing, unreadable, or non-numeric file yields 0. The value is normalised to plain base-10,
# so a zero-padded value such as "08" cannot later be misread as (invalid) octal by $(( ... ))
# or an array subscript.
cur_idx() {
  local i
  i="$(cat "$PHASE_IDX_FILE" 2>/dev/null || echo 0)"
  [[ "$i" =~ ^[0-9]+$ ]] || i=0
  echo "$(( 10#$i ))"
}
# Echo the id (field 1 of "id|planfile|statusbasename") of the phase at index $1.
phase_id() {
  cut -d'|' -f1 <<< "${PHASES[$1]}"
}
# Echo the plan file name (field 2 of "id|planfile|statusbasename") of the phase at index $1.
phase_plan() {
  cut -d'|' -f2 <<< "${PHASES[$1]}"
}
# Echo the status-file basename (field 3 of "id|planfile|statusbasename") of the phase at index $1.
phase_status() {
  cut -d'|' -f3 <<< "${PHASES[$1]}"
}
# Echo the review-file basename "REVIEW-<id>.md" of the phase at index $1.
phase_review() {
  printf 'REVIEW-%s.md\n' "$(phase_id "$1")"
}
# Loop-state files may sit at the repo root OR under machine-docs/ (the 1b RL6 move). Prefer
# machine-docs/ if present, else root — so the watchdog survives the move whenever it happens.
# Echo the path of loop-state file $2 inside clone $1: "<dir>/machine-docs/<base>" when that
# file exists, otherwise "<dir>/<base>" (returned whether or not it exists).
resolve_state() {
  local dir="$1" base="$2"
  local candidate="$dir/machine-docs/$base"
  [[ -f "$candidate" ]] || candidate="$dir/$base"
  echo "$candidate"
}
# Succeed iff the Builder's local copy of status file $1 (a basename, located via resolve_state)
# has a line starting with "##", whitespace, then "DONE". A missing file counts as not done.
phase_done() {
  local status_file
  status_file="$(resolve_state "$BUILDER_DIR" "$1")"
  grep -qE '^##[[:space:]]+DONE' "$status_file" 2>/dev/null
}
# Echo every phase id in sequence order, each followed by one space (no trailing newline).
all_ids() {
  local entry
  for entry in "${PHASES[@]}"; do
    printf '%s ' "${entry%%|*}"   # everything before the first '|'
  done
}

# Verify everything a run needs and die() on the first thing missing: tmux, the claude CLI,
# every phase's plan file under $PLAN_DIR, and both role prompts. Finally ensure $LOG_DIR exists.
preflight() {
  need tmux
  if ! command -v "$CLAUDE_BIN" >/dev/null 2>&1; then
    die "claude CLI not found (set CLAUDE_BIN)"
  fi

  local entry plan_file role
  for entry in "${PHASES[@]}"; do
    plan_file="$(cut -d'|' -f2 <<< "$entry")"
    [[ -f "$PLAN_DIR/$plan_file" ]] || die "missing phase plan $PLAN_DIR/$plan_file"
  done
  for role in builder adversary; do
    [[ -f "$PLAN_DIR/prompts/$role.md" ]] || die "missing $PLAN_DIR/prompts/$role.md"
  done

  mkdir -p "$LOG_DIR"
}

# Succeed iff a tmux session named $1 exists (tmux's own error output is discarded).
session_alive() {
  local target="$1"
  tmux has-session -t "$target" 2>/dev/null
}

# Build the per-session kickoff (phase preamble + base role prompt) and launch claude interactively.
# role ∈ {builder, adversary}. Passed as a POSITIONAL arg via inner $(cat ...) — never stdin
# (piping forces print mode and breaks /loop + remote-control).
# Launch one agent loop in a detached tmux session, unless that session already exists.
#   $1 role     builder | adversary — selects $PLAN_DIR/prompts/<role>.md
#   $2 session  tmux session name (also used as the --remote-control name and log basename)
#   $3 workdir  the agent's repo clone; created if missing and used as the session cwd
# Writes the kickoff text (phase preamble + role prompt) to $LOG_DIR/.kickoff-<session>.txt,
# starts claude with that text as a POSITIONAL argument (never stdin), and pipes the pane
# output to $LOG_DIR/<session>.log.
# The preamble points at ${PLAN_DIR}, the same directory preflight validates and the role prompt
# is read from, so an overridden PLAN_DIR is honoured consistently.
start_agent() {
  local role="$1" session="$2" workdir="$3"
  if session_alive "$session"; then log "$session already running — leaving it"; return 0; fi
  mkdir -p "$workdir"
  local idx pid plan status kf
  idx="$(cur_idx)"; pid="$(phase_id "$idx")"; plan="$(phase_plan "$idx")"; status="$(phase_status "$idx")"
  kf="$LOG_DIR/.kickoff-$session.txt"
  {
    # Unquoted heredoc delimiter: ${pid}, ${plan}, ${status} and ${PLAN_DIR} are expanded here.
    cat <<PREAMBLE
*** cc-ci SUB-PHASE ${pid} ***
SINGLE SOURCE OF TRUTH for THIS phase: ${PLAN_DIR}/${plan} — read it in full now; it defines this phase's mission and Definition of Done.
The general loop protocol still applies and lives in ${PLAN_DIR}/plan.md (§6.1 coordination, §7 pacing, §9 guardrails) — read those sections too.
Track loop state in PHASE-NAMESPACED files in your repo clone: ${status}, BACKLOG-${pid}.md, REVIEW-${pid}.md, JOURNAL-${pid}.md. DECISIONS.md is shared (append).
"Done" for this phase = the Builder writes "## DONE" to ${status} ONLY after every Definition-of-Done item is Adversary-verified with a fresh PASS in REVIEW-${pid}.md (handshake per §6.1).
The repo's Phase-1 STATUS.md / BACKLOG.md / REVIEW.md are HISTORY from the completed Phase 1 — do NOT use them as your state; use the phase-namespaced files above.
Wherever the standing rules below say "plan.md"/"STATUS.md"/"BACKLOG.md"/"REVIEW.md", substitute the phase plan and these phase-namespaced files.

=== standing role & rules ===
PREAMBLE
    cat "$PLAN_DIR/prompts/$role.md"
  } > "$kf"
  log "starting $session (phase=$pid, plan=$plan, cwd=$workdir, rc=$REMOTE_CONTROL)"
  local rc=""
  if [[ "$REMOTE_CONTROL" == "1" ]]; then
    rc="--remote-control '$session'"
  fi
  # The command string is evaluated by the shell tmux spawns; \$(cat ...) is escaped so the
  # kickoff file is read there, at launch time, and passed as a single argument.
  tmux new-session -d -s "$session" -c "$workdir" \
    "$CLAUDE_BIN $rc $CLAUDE_FLAGS \"\$(cat '$kf')\""
  tmux pipe-pane -o -t "$session" "cat >> '$LOG_DIR/$session.log'"
}

# Start both agent loops: the Builder first, then the Adversary.
start_loops() {
  local -a builder_args=(builder "$BUILDER_SESSION" "$BUILDER_DIR")
  local -a adversary_args=(adversary "$ADV_SESSION" "$ADV_DIR")
  start_agent "${builder_args[@]}"
  start_agent "${adversary_args[@]}"
}

# Kill the Builder and Adversary tmux sessions if they exist. A failing kill is ignored.
stop_loops() {
  local -a sessions=("$BUILDER_SESSION" "$ADV_SESSION")
  local name
  for name in "${sessions[@]}"; do
    session_alive "$name" || continue
    log "killing $name"
    tmux kill-session -t "$name" || true
  done
}

# Wake a loop by typing a one-line message into its tmux session (queues if mid-turn).
# Best-effort wake-up: type the one-line message $2 into tmux session $1, then press Enter.
# Always returns 0. Callers invoke this as a plain command under `set -e`, so a session that
# dies between the liveness check and send-keys must not take the watchdog down with it.
ping_session() {
  local s="$1" msg="$2"
  session_alive "$s" || return 0
  # -l sends the text literally (no key-name lookup); "--" protects messages starting with "-".
  if tmux send-keys -t "$s" -l -- "$msg" 2>/dev/null; then
    sleep 0.3   # short pause before Enter, as a separate keystroke
    tmux send-keys -t "$s" Enter 2>/dev/null || true
  fi
  return 0
}

# A loop can stall ALIVE on a usage/spend-limit notice: the claude process stays up (so the
# dead-session restart never fires) but makes no progress, and the /loop self-pacing is dead because
# the limit interrupted the turn that would have scheduled the next tick. Detect that signature
# (limit text present + no active-turn marker) and re-nudge it each heavy tick — once the limit resets
# the next nudge lands and the loop resumes. Gated on the limit text so we NEVER nudge a loop that is
# just legitimately idle-waiting on a handoff.
# Extended regex for usage/spend-limit notices; heal_session matches it case-insensitively
# (grep -iE) against the tail of a session's pane.
LIMIT_RE='spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)'
# FATAL = an unrecoverable session-state API error that recurs on EVERY turn (so the session stays
# alive but wedged — a nudge can't fix it; only a fresh session can). The confirmed case: the
# "thinking/redacted_thinking blocks ... cannot be modified" 400 that has hit the Adversary
# repeatedly (interrupted-mid-thinking corrupts the replayed history). Kill + restart fresh; the loop
# re-orients from the repo. Matched conservatively so it never fires on transient/working states.
# NOTE(review): the middle alternative is redundant — any text containing "blocks cannot be
# modified" also contains "cannot be modified" — so the effective match is
# "redacted_thinking" OR "cannot be modified" (case-insensitive where used with grep -iE).
FATAL_RE='redacted_thinking|blocks cannot be modified|cannot be modified'

# Heal one loop session: dead -> restart; wedged on a FATAL error -> kill + restart fresh; stalled on
# a usage limit -> nudge. No-op while actively working ("esc to interrupt" on screen).
# Heal one loop session. $1 = role (builder|adversary), $2 = tmux session, $3 = working clone.
#   session missing                      -> start it
#   "esc to interrupt" on screen         -> actively working, leave it alone
#   FATAL_RE on screen                   -> kill the session and start a fresh one
#   LIMIT_RE on screen                   -> type a resume nudge into it
# Only the last 25 lines of the pane are inspected.
heal_session() {
  local role="$1" s="$2" dir="$3"
  local screen

  if ! session_alive "$s"; then
    log "$role ($s) gone — restarting (phase $(phase_id "$(cur_idx)"))"
    start_agent "$role" "$s" "$dir"
    return 0
  fi

  screen="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -25 || true)"

  if grep -q 'esc to interrupt' <<< "$screen"; then
    return 0
  fi

  if grep -qiE "$FATAL_RE" <<< "$screen"; then
    log "FATAL session-state error on $role ($s) — kill + restart fresh (re-orients from repo)"
    tmux kill-session -t "$s" 2>/dev/null || true
    start_agent "$role" "$s" "$dir"
    return 0
  elif grep -qiE "$LIMIT_RE" <<< "$screen"; then
    log "limit-stall detected on $role ($s) — re-nudging to resume"
    ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing."
  fi
}

# --- Idle-wedge detection (complements heal_session's dead/FATAL/limit cases) ----------------------
# A loop can sit ALIVE but wedged — e.g. garbled output at the context limit — showing none of the
# heal_session signals (not dead, no FATAL string, no limit notice). The loops therefore DECLARE every
# wait with a final-line marker `WAITING-UNTIL: <ISO-8601 UTC>` and cap each wait at 10 min (plan §7).
# A healthy idle loop ALWAYS has a current marker as its last message; a wedge does not (or has one
# whose time has already passed). So: reboot a loop that has been idle (no "esc to interrupt") for
# >= STALL_IDLE seconds AND (has no WAITING-UNTIL marker OR is now past the time that marker named).
# Runs every signal tick (30 s) for fine resolution; rebooting is safe — the loop re-orients from
# git + its phase STATUS/REVIEW files.
# Idle bookkeeping used by stall_check_one, keyed by tmux session name.
declare -A _wd_idle_since   # session -> epoch first seen idle this stretch (0/unset = working)

# Echo the epoch seconds named by the LAST "WAITING-UNTIL: <timestamp>" marker in the pane text
# $1, or nothing when there is no marker or `date` cannot parse the timestamp. Always returns 0.
# The grep pipeline is guarded with `|| true`: under pipefail a pane without a marker makes grep
# (and so the whole assignment) fail, which would abort the caller under errexit unless it
# happens to run inside a command substitution.
_parse_waiting_until() {
  local marker ts
  marker="$(printf '%s\n' "$1" | grep -oE 'WAITING-UNTIL:[[:space:]]*[0-9][0-9T:Z+-]+' | tail -1 || true)"
  [[ -n "$marker" ]] || return 0
  ts="${marker#WAITING-UNTIL:}"
  ts="${ts//[[:space:]]/}"          # drop any whitespace between the colon and the timestamp
  date -u -d "$ts" +%s 2>/dev/null || true
}

# Reboot one loop session that is alive but wedged. $1 = role, $2 = tmux session, $3 = clone dir.
# Inspects the last 40 pane lines:
#   - session missing, or "esc to interrupt" visible -> reset the idle clock, do nothing;
#   - a WAITING-UNTIL marker is present -> reboot only once now is more than STALL_GRACE seconds
#     past the marker's time (never at or before it, so a healthy self-wake is not pre-empted);
#   - no marker -> reboot once the session has been seen idle for at least STALL_IDLE seconds.
# "Reboot" = kill the tmux session, start_agent again, reset the idle clock.
stall_check_one() {
  local role="$1" s="$2" dir="$3"
  local screen now_epoch wake_at first_idle idle_for why

  if ! session_alive "$s"; then
    _wd_idle_since[$s]=0          # dead => heal_session handles it
    return 0
  fi

  printf -v now_epoch '%(%s)T' -1
  screen="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -40 || true)"
  if grep -q 'esc to interrupt' <<< "$screen"; then
    _wd_idle_since[$s]=0          # actively working — not idle
    return 0
  fi

  # Start the idle clock on the first idle observation of this stretch.
  first_idle="${_wd_idle_since[$s]:-0}"
  if [[ "$first_idle" == 0 ]]; then
    first_idle="$now_epoch"
    _wd_idle_since[$s]="$now_epoch"
  fi
  idle_for=$(( now_epoch - first_idle ))

  wake_at="$(_parse_waiting_until "$screen")"
  if [[ -z "$wake_at" ]]; then
    # No declared wait: a turn ended without scheduling/declaring one.
    if (( idle_for < STALL_IDLE )); then return 0; fi
    why="idle ${idle_for}s with no WAITING-UNTIL marker"
  else
    # Declared wait: the loop's own wake-up is due AT wake_at; act only well after it.
    if (( now_epoch <= wake_at + STALL_GRACE )); then return 0; fi
    why="past its WAITING-UNTIL by $(( now_epoch - wake_at ))s — self-wake did not fire"
  fi

  log "stall: $role ($s) $why — kill + reboot (re-orients from repo)"
  tmux kill-session -t "$s" 2>/dev/null || true
  start_agent "$role" "$s" "$dir"
  _wd_idle_since[$s]=0
}

# Run the idle-wedge check on both loops: the Builder first, then the Adversary.
stall_check() {
  local -a builder_args=(builder "$BUILDER_SESSION" "$BUILDER_DIR")
  local -a adversary_args=(adversary "$ADV_SESSION" "$ADV_DIR")
  stall_check_one "${builder_args[@]}"
  stall_check_one "${adversary_args[@]}"
}

# Is an orchestrator process alive ANYWHERE? Conflict-safety: we must NEVER launch a second
# orchestrator that resumes the same conversation while one is already running (that double-resume is
# the likely cause of the "thinking blocks cannot be modified" crashes). The orchestrator may be
# running as a managed tmux session (cc-ci-orchestrator) OR as a plain terminal session the operator
# started by hand (no flags). So: alive iff any `claude` process exists that is NOT one of the two
# loop sessions (identified by their --remote-control name), or the managed tmux session exists.
# Succeed iff an orchestrator appears to be running: either some process named exactly "claude"
# whose command line does NOT carry a --remote-control name of cc-ci-builder / cc-ci-adv /
# cc-ci-upgrader (the loops and the one-shot upgrader are matched by that session NAME, not by a
# stray path mention), or the managed tmux session $ORCH_SESSION exists.
orchestrator_alive() {
  local pid cmdline
  local loop_re="--remote-control +'?cc-ci-(builder|adv|upgrader)'?"

  while read -r pid; do
    # /proc/<pid>/cmdline is NUL-separated; turn it into a space-separated string.
    cmdline="$(tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null || true)"
    if [[ "$cmdline" =~ $loop_re ]]; then
      continue                      # a loop / upgrader process — not the orchestrator
    fi
    return 0                        # a non-loop claude process => orchestrator (or operator)
  done < <(pgrep -x claude 2>/dev/null)

  if tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
    return 0
  fi
  return 1
}

# Keep the orchestrator alive: restart it (via launch-orchestrator.sh, which resumes its session) ONLY
# when none is running; if it's the managed tmux session and wedged on a FATAL error, kill+restart.
heal_orchestrator() {
  [[ "$WATCH_ORCHESTRATOR" == "1" ]] || return 0
  [[ -x "$ORCH_LAUNCHER" ]] || return 0
  if orchestrator_alive; then
    if tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
      local pane; pane="$(tmux capture-pane -pt "$ORCH_SESSION" 2>/dev/null | tail -25 || true)"
      printf '%s\n' "$pane" | grep -q 'esc to interrupt' && return 0   # working — leave alone
      if printf '%s\n' "$pane" | grep -qiE "$FATAL_RE"; then
        log "FATAL session-state error on orchestrator ($ORCH_SESSION) — kill + restart fresh"
        tmux kill-session -t "$ORCH_SESSION" 2>/dev/null || true
        "$ORCH_LAUNCHER" start >/dev/null 2>&1 || true
      fi
    fi
    return 0
  fi
  log "orchestrator not running anywhere — restarting via $ORCH_LAUNCHER"
  "$ORCH_LAUNCHER" start >/dev/null 2>&1 || true
}

# Detect handoffs against the PUSHED origin/main — i.e. exactly what the RECEIVER will pull — NOT the
# writer's local working tree. (Reading the working tree fired on a claim/verdict the writer hadn't
# pushed yet; the receiver then pulled a stale remote, saw "no formal gate", and a clarifying
# inbox round-trip ensued. Mirroring origin/main eliminates that race.) origin/main is the shared
# branch, so all four files are read from one clone's origin/main after a single best-effort fetch.
# Best-effort quiet `git fetch origin` in the clone at $1; failures are ignored (always returns 0).
_wd_fetch_origin() {
  local clone="$1"
  if ! git -C "$clone" fetch -q origin 2>/dev/null; then
    :   # best-effort: the watchdog keeps working from the last fetched state
  fi
}
# Print file $2 as it exists on origin/main of the clone at $1, trying machine-docs/<file> first
# and then <file> at the repo root. Prints nothing if neither exists; always returns 0.
_wd_show_pushed() {
  local clone="$1" base="$2" ref
  for ref in "origin/main:machine-docs/$base" "origin/main:$base"; do
    if git -C "$clone" show "$ref" 2>/dev/null; then
      return 0
    fi
  done
  return 0
}

# Clear the handoff-detection state: the last observed origin/main SHA and the last seen hash of
# each INBOX file. Called on phase transition.
handoff_reset() {
  _wd_last_sha=""
  _wd_adv_inbox_seen=""
  _wd_builder_inbox_seen=""
}
handoff_reset   # initialise the three state variables at load time
# Signal handoffs off the loops' CONVENTIONAL COMMIT PREFIXES on origin/main — NOT by parsing
# free-form markdown prose (brittle). The loops consistently prefix every gate claim `claim(...)`
# and every verdict/finding `review(...)`. So: a new `claim(` commit pushed => ping the Adversary;
# a new `review(` commit => ping the Builder. Edge-triggered on the origin/main SHA (append-only —
# the loops never force-push), so it can't double-fire or mis-route. INBOX files are detected
# separately (which file changed routes the ping). All reads are of the PUSHED state (what the
# receiver pulls).
# Detect handoffs on the PUSHED origin/main (read through the Builder's clone) and ping the
# receiving loop.
#   1. Commit prefixes: when origin/main has moved since the last observation, the subjects of
#      the new commits are scanned; a subject starting with "claim" pings the Adversary, one
#      starting with "review" pings the Builder (case-insensitive). The very first observation
#      only records the SHA as a baseline.
#   2. INBOX files: ADVERSARY-INBOX.md / BUILDER-INBOX.md are hashed; a new or changed hash pings
#      the matching receiver once, and an absent/empty file re-arms the detector.
handoff_check() {
  local tip subjects inbox_adv inbox_builder sum

  _wd_fetch_origin "$BUILDER_DIR"
  tip="$(git -C "$BUILDER_DIR" rev-parse origin/main 2>/dev/null || true)"

  # Edge-triggered: act only when there is a baseline AND the tip differs from it.
  if [[ -n "$tip" && -n "$_wd_last_sha" && "$tip" != "$_wd_last_sha" ]]; then
    subjects="$(git -C "$BUILDER_DIR" log --format='%s' "${_wd_last_sha}..origin/main" 2>/dev/null || true)"
    if grep -qiE '^claim' <<< "$subjects"; then
      log "handoff: new claim(...) commit on origin/main -> pinging Adversary"
      ping_session "$ADV_SESSION" "watchdog ping: the Builder pushed a gate CLAIM (claim(...) commit). Pull and verify the claimed gate now."
    fi
    if grep -qiE '^review' <<< "$subjects"; then
      log "handoff: new review(...) commit on origin/main -> pinging Builder"
      ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary pushed a verdict/finding (review(...) commit). Pull REVIEW and act — proceed if it PASSes your gate, address it if it's a finding."
    fi
  fi
  # Record the observed tip (this is also the silent baseline on first observation / restart).
  if [[ -n "$tip" ]]; then
    _wd_last_sha="$tip"
  fi

  inbox_adv="$(_wd_show_pushed "$BUILDER_DIR" "ADVERSARY-INBOX.md")"
  inbox_builder="$(_wd_show_pushed "$BUILDER_DIR" "BUILDER-INBOX.md")"

  # INBOX side-channel (§6.1), detected on the pushed state. Receiver deletes after consuming =>
  # absent on origin/main => re-arm so the next write re-pings.
  if [[ -z "$inbox_adv" ]]; then
    _wd_adv_inbox_seen=""
  else
    sum="$(printf '%s' "$inbox_adv" | md5sum)"
    sum="${sum%% *}"               # keep only the hash column of md5sum's output
    if [[ "$sum" != "$_wd_adv_inbox_seen" ]]; then
      log "handoff: ADVERSARY-INBOX.md new/changed (pushed) -> pinging Adversary"
      ping_session "$ADV_SESSION" "watchdog ping: the Builder pushed machine-docs/ADVERSARY-INBOX.md — pull, read it, act, then delete the file (commit + push) to mark it consumed."
      _wd_adv_inbox_seen="$sum"
    fi
  fi

  if [[ -z "$inbox_builder" ]]; then
    _wd_builder_inbox_seen=""
  else
    sum="$(printf '%s' "$inbox_builder" | md5sum)"
    sum="${sum%% *}"
    if [[ "$sum" != "$_wd_builder_inbox_seen" ]]; then
      log "handoff: BUILDER-INBOX.md new/changed (pushed) -> pinging Builder"
      ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary pushed machine-docs/BUILDER-INBOX.md — pull, read it, act, then delete the file (commit + push) to mark it consumed."
      _wd_builder_inbox_seen="$sum"
    fi
  fi
}

# Supervision loop (runs until the phase sequence completes, then exits 0).
# Every SIGNAL_INTERVAL seconds: handoff_check + stall_check.
# Every WATCH_INTERVAL seconds (and on the very first pass): the HEAVY check —
#   - current phase shows "## DONE" and a next phase exists -> stop the loops, persist the new
#     index to $PHASE_IDX_FILE, reset handoff state, start the loops on the next phase;
#   - current phase is the LAST one and is DONE -> stop the loops, write
#     $LOG_DIR/SEQUENCE-COMPLETE, exit 0;
#   - otherwise -> heal both loop sessions and the orchestrator.
# The completion message names the actual first→last phase ids of the configured sequence
# rather than a hard-coded range.
watchdog_loop() {
  local idx pid status next
  idx="$(cur_idx)"; pid="$(phase_id "$idx")"
  log "watchdog up (phase=$pid [$((idx+1))/${#PHASES[@]}], seq='$(all_ids)', signal=${SIGNAL_INTERVAL}s, heavy=${WATCH_INTERVAL}s)"
  # Start "due" so the first iteration performs a heavy check immediately.
  local elapsed="$WATCH_INTERVAL"
  while true; do
    handoff_check
    stall_check
    if (( elapsed >= WATCH_INTERVAL )); then
      elapsed=0
      # Re-read the phase each heavy tick: the index file is the source of truth.
      idx="$(cur_idx)"; pid="$(phase_id "$idx")"; status="$(phase_status "$idx")"
      if phase_done "$status"; then
        next=$((idx + 1))
        if (( next < ${#PHASES[@]} )); then
          log "PHASE $pid DONE (## DONE in $status) — auto-transitioning to $(phase_id "$next")."
          # Order matters: stop the old loops before the index changes, so the restarted loops
          # build their kickoff from the NEW phase.
          stop_loops
          echo "$next" > "$PHASE_IDX_FILE"
          handoff_reset
          start_loops
        else
          log "PHASE SEQUENCE COMPLETE (last phase $pid DONE). Stopping loops — entire build ($(phase_id 0)→$pid) finished."
          stop_loops
          printf 'cc-ci phase sequence complete %(%F %T)T. Phases: %s. Loops stopped; entire build finished.\n' -1 "$(all_ids)" > "$LOG_DIR/SEQUENCE-COMPLETE"
          log "watchdog exiting."
          exit 0
        fi
      else
        heal_session builder   "$BUILDER_SESSION" "$BUILDER_DIR"
        heal_session adversary "$ADV_SESSION"     "$ADV_DIR"
        heal_orchestrator
      fi
    fi
    sleep "$SIGNAL_INTERVAL"
    elapsed=$(( elapsed + SIGNAL_INTERVAL ))
  done
}

# Start the watchdog in its own detached tmux session (cwd $PLAN_DIR) unless it already exists.
# The session runs "$SELF watchdog" with stdout and stderr appended to $LOG_DIR/watchdog.log.
start_watchdog() {
  if session_alive "$WATCHDOG_SESSION"; then
    log "watchdog already running"
    return 0
  fi
  log "starting watchdog"
  local inner="exec >>'$LOG_DIR/watchdog.log' 2>&1; '$SELF' watchdog"
  tmux new-session -d -s "$WATCHDOG_SESSION" -c "$PLAN_DIR" "$inner"
}

# Print a status report to stdout: current phase (id, position, plan, status file), whether each
# of the three tmux sessions exists, whether the current phase is DONE, and — if present — the
# contents of $LOG_DIR/SEQUENCE-COMPLETE.
# Always returns 0. The optional SEQUENCE-COMPLETE line is an `if`, not a trailing
# `[[ ... ]] && echo`, which would make the function (and `launch.sh status`) fail whenever the
# file is absent.
cmd_status() {
  local idx pid s
  idx="$(cur_idx)"; pid="$(phase_id "$idx")"
  echo "  phase: $pid [$((idx+1))/${#PHASES[@]}]  plan=$(phase_plan "$idx")  status=$(phase_status "$idx")"
  for s in "$BUILDER_SESSION" "$ADV_SESSION" "$WATCHDOG_SESSION"; do
    if session_alive "$s"; then echo "  $s: RUNNING"; else echo "  $s: stopped"; fi
  done
  if phase_done "$(phase_status "$idx")"; then echo "  phase $pid: ## DONE"; else echo "  phase $pid: in progress"; fi
  if [[ -f "$LOG_DIR/SEQUENCE-COMPLETE" ]]; then
    echo "  >>> $(cat "$LOG_DIR/SEQUENCE-COMPLETE")"
  fi
  return 0
}

# Print the CLI help: subcommands, the configured phase sequence, and the effective settings.
usage() {
  cat <<EOF
cc-ci loop launcher (phase-aware)

  $0 start    start the phase sequence at phase 0 + watchdog (stops any running loops first)
  $0 status   show phase + session + DONE state
  $0 logs builder|adversary|watchdog   tail a log
  $0 stop     stop both loops + watchdog
  $0 watchdog run supervision loop in foreground

Phase sequence (auto-transition on per-phase ## DONE; STOP after the last = manual gate):
  $(all_ids)
Env: CLAUDE_BIN=$CLAUDE_BIN  REMOTE_CONTROL=$REMOTE_CONTROL  WATCH_INTERVAL=${WATCH_INTERVAL}s  SIGNAL_INTERVAL=${SIGNAL_INTERVAL}s
       PHASES_SPEC='$PHASES_SPEC'
       RESUME_PHASE=1 to keep the current phase index instead of resetting to 0.
EOF
}

# Entry point: dispatch on the first CLI argument; anything unrecognised (or nothing) prints usage.
main() {
  local subcommand="${1:-}"
  case "$subcommand" in
    start)
      preflight
      # Fresh sequence: stop any running loops, then reset to phase 0 — unless RESUME_PHASE=1,
      # which keeps the stored index.
      stop_loops
      if [[ "${RESUME_PHASE:-}" != "1" ]]; then
        echo 0 > "$PHASE_IDX_FILE"
      fi
      rm -f "$LOG_DIR/SEQUENCE-COMPLETE"
      start_loops
      start_watchdog
      log "started at phase $(phase_id "$(cur_idx)"). status: ./launch.sh status | attach: tmux attach -t $BUILDER_SESSION"
      ;;
    watchdog)
      preflight
      watchdog_loop
      ;;
    status)
      cmd_status
      ;;
    logs)
      # Follow the log of the requested component.
      case "${2:-}" in
        builder)   tail -f "$LOG_DIR/$BUILDER_SESSION.log" ;;
        adversary) tail -f "$LOG_DIR/$ADV_SESSION.log" ;;
        watchdog)  tail -f "$LOG_DIR/watchdog.log" ;;
        *) die "usage: $0 logs builder|adversary|watchdog" ;;
      esac
      ;;
    stop)
      stop_loops
      if session_alive "$WATCHDOG_SESSION"; then
        log "killing $WATCHDOG_SESSION"
        tmux kill-session -t "$WATCHDOG_SESSION" || true
      fi
      log "stopped."
      ;;
    *)
      usage
      ;;
  esac
}

main "$@"