#!/usr/bin/env bash
#
# launch.sh — start and supervise the two cc-ci autonomous loops + a phase-aware watchdog.
#
# Model (see plan.md §6 / §6.1): two INDEPENDENT Claude Code sessions —
#   • Builder   (tmux session: cc-ci-builder)  working clone /srv/cc-ci/cc-ci
#   • Adversary (tmux session: cc-ci-adv)      working clone /srv/cc-ci/cc-ci-adv
# coordinating only through the git repo on git.autonomic.zone.
#
# PHASES: the watchdog runs an ordered sequence of sub-phases
#   (default: 1c → 1b → 1d → 1e → 2w → 2 → 2b → 3 → 4;
#    2w = warm-canonical/--quick, interjected; Phase 2 pauses for it then resumes).
# Each phase has its own plan + phase-namespaced loop-state files (STATUS-<id>.md etc.).
# When a phase's STATUS-<id>.md shows "## DONE", the watchdog AUTO-TRANSITIONS to the next
# phase; after the LAST phase (4, final review/polish/cleanup) it STOPS the loops and exits
# (end of the whole build).
#
# Four jobs:
#   ITERATION          — each agent's /loop
#   RESILIENCE         — restart a dead loop
#   HANDOFF SIGNALLING — ping the waiting loop the moment its counterpart hands off
#   PHASE SEQUENCING   — this file
#
# Usage:
#   ./launch.sh start                            # start the sequence at phase 0 + watchdog (stops/relaunches loops)
#   ./launch.sh watchdog                         # run only the supervision loop in the foreground
#   ./launch.sh status                           # show phase + session + DONE state
#   ./launch.sh logs builder|adversary|watchdog  # tail a session/log
#   ./launch.sh stop                             # stop both loops + watchdog

set -euo pipefail

# Resolve this script to an absolute path once, so the watchdog can re-invoke it
# correctly no matter what the current directory is.
SELF=$(readlink -f "${BASH_SOURCE[0]}")

# ----- config -------------------------------------------------------------
# Each setting keeps a value already present in the environment and otherwise
# falls back to the default shown.
: "${PLAN_DIR:=/srv/cc-ci/cc-ci-plan}"
: "${CLAUDE_BIN:=claude}"
: "${CLAUDE_FLAGS:=--dangerously-skip-permissions}"
# REMOTE_CONTROL=1 → interactive --remote-control sessions (viewable at claude.ai/code), required
# for /loop. The box must be logged into the claude.ai account. =0 for plain interactive.
: "${REMOTE_CONTROL:=1}"
: "${BUILDER_DIR:=/srv/cc-ci/cc-ci}"        # Builder's repo clone
: "${ADV_DIR:=/srv/cc-ci/cc-ci-adv}"        # Adversary's repo clone
: "${LOG_DIR:=/srv/cc-ci/.cc-ci-logs}"
: "${WATCH_INTERVAL:=300}"                  # seconds between HEAVY checks (phase DONE / restart dead loops)
: "${SIGNAL_INTERVAL:=30}"                  # seconds between HANDOFF checks (ping the waiting loop)

# tmux session names (fixed, not overridable from the environment).
BUILDER_SESSION="cc-ci-builder"
ADV_SESSION="cc-ci-adv"
WATCHDOG_SESSION="cc-ci-watchdog"

# Orchestrator (supervisory session) — the watchdog keeps it alive too, via launch-orchestrator.sh.
: "${ORCH_SESSION:=cc-ci-orchestrator}"
: "${ORCH_LAUNCHER:=$PLAN_DIR/launch-orchestrator.sh}"
# Watchdog supervision of the orchestrator can be disabled (=0) if you run the orchestrator yourself
# and don't want it auto-(re)launched.
: "${WATCH_ORCHESTRATOR:=1}"

# Ordered phase sequence: each entry "id|planfile|statusbasename". The watchdog runs them in order,
# auto-transitions on the phase's "## DONE" (in BUILDER_DIR/<statusbasename>), and STOPS after the
# last one (manual gate). Override PHASES_SPEC (semicolon-separated) to change the sequence.
PHASES_SPEC="${PHASES_SPEC:-1c|plan-phase1c-full-reproducibility.md|STATUS-1c.md;1b|plan-phase1b-review-lint.md|STATUS-1b.md;1d|plan-phase1d-generic-test-suite.md|STATUS-1d.md;1e|plan-phase1e-harness-corrections.md|STATUS-1e.md;2w|plan-phase2w-warm-canonical-quick.md|STATUS-2w.md;2|plan-phase2-recipe-tests.md|STATUS-2.md;2b|plan-phase2b-test-performance.md|STATUS-2b.md;3|plan-phase3-results-ux.md|STATUS-3.md;4|plan-phase4-final-review-polish-cleanup.md|STATUS-4.md}"
# Split the spec on ';' into the PHASES array (one "id|planfile|statusbasename" entry per phase).
IFS=';' read -r -a PHASES <<< "$PHASES_SPEC"
# File that persists the index (0-based, into PHASES) of the phase currently being run.
PHASE_IDX_FILE="${PHASE_IDX_FILE:-$LOG_DIR/.phase-idx}"
# --------------------------------------------------------------------------

# log MSG... — print a timestamped "[launch HH:MM:SS] MSG" line on stdout.
log() { printf '[launch %(%H:%M:%S)T] %s\n' -1 "$*"; }

# die MSG... — report an error and terminate the script with status 1.
# The message goes to stderr so it is not mixed into captured stdout.
die() { log "ERROR: $*" >&2; exit 1; }

# need CMD — die unless CMD is found by `command -v`.
need() { command -v "$1" >/dev/null 2>&1 || die "missing dependency: $1"; }

# ----- phase helpers ------------------------------------------------------

# cur_idx — print the current phase index read from PHASE_IDX_FILE.
# Prints 0 when the file is missing/unreadable, is not a plain decimal number, or
# names an index beyond the end of PHASES (such an index would otherwise make every
# "${PHASES[idx]}" lookup abort under `set -u`). Leading zeros are stripped so
# callers can use the value in $(( )) without it being parsed as octal.
cur_idx() {
  local raw
  raw="$(cat "$PHASE_IDX_FILE" 2>/dev/null || true)"
  if [[ "$raw" =~ ^0*([0-9]{1,9})$ ]] && (( 10#${BASH_REMATCH[1]} < ${#PHASES[@]} )); then
    echo "$(( 10#${BASH_REMATCH[1]} ))"
  else
    echo 0
  fi
}

# _phase_field IDX N — print the N-th (0-based) '|'-separated field of PHASES[IDX];
# prints an empty line when the entry has no such field.
_phase_field() {
  local -a fields=()
  IFS='|' read -r -a fields <<< "${PHASES[$1]}"
  printf '%s\n' "${fields[$2]:-}"
}

# phase_id IDX     — the phase's id (first field).
phase_id()     { _phase_field "$1" 0; }
# phase_plan IDX   — the phase's plan file name (second field).
phase_plan()   { _phase_field "$1" 1; }
# phase_status IDX — the phase's status basename (third field).
phase_status() { _phase_field "$1" 2; }
# phase_review IDX — the phase's review basename, "REVIEW-<id>.md".
phase_review() { echo "REVIEW-$(phase_id "$1").md"; }

# Loop-state files may sit at the repo root OR under machine-docs/ (the 1b RL6 move). Prefer
# machine-docs/ if present, else root — so the watchdog survives the move whenever it happens.
# resolve_state DIR BASE — print the path of loop-state file BASE inside clone DIR:
# DIR/machine-docs/BASE when that file exists, otherwise DIR/BASE (whether or not it exists).
resolve_state() {
  local dir="$1" base="$2"
  if [[ -f "$dir/machine-docs/$base" ]]; then
    echo "$dir/machine-docs/$base"
  else
    echo "$dir/$base"
  fi
}

# phase_done STATUS_BASENAME — succeed iff the status file (resolved in the Builder's
# local clone) contains a line starting with "## DONE". A missing file counts as not done.
phase_done() {
  grep -qE '^##[[:space:]]+DONE' "$(resolve_state "$BUILDER_DIR" "$1")" 2>/dev/null
}

# all_ids — print the ids of all configured phases, in order, separated by single spaces.
# No trailing separator is emitted, so the value can be embedded in a sentence or quoted
# log field as-is.
all_ids() {
  local IFS=' ' entry
  local -a ids=()
  for entry in "${PHASES[@]}"; do
    ids+=("${entry%%|*}")   # id = text before the first '|'
  done
  printf '%s\n' "${ids[*]}"
}

# preflight — verify everything a run needs, dying with a message on the first problem:
# tmux and the claude CLI are installed, every phase's plan file exists under PLAN_DIR,
# and both role prompts exist. Finally make sure LOG_DIR exists.
preflight() {
  need tmux
  command -v "$CLAUDE_BIN" >/dev/null 2>&1 || die "claude CLI not found (set CLAUDE_BIN)"
  local entry plan
  for entry in "${PHASES[@]}"; do
    plan="$(echo "$entry" | cut -d'|' -f2)"
    [[ -f "$PLAN_DIR/$plan" ]] || die "missing phase plan $PLAN_DIR/$plan"
  done
  [[ -f "$PLAN_DIR/prompts/builder.md" ]] || die "missing $PLAN_DIR/prompts/builder.md"
  [[ -f "$PLAN_DIR/prompts/adversary.md" ]] || die "missing $PLAN_DIR/prompts/adversary.md"
  mkdir -p "$LOG_DIR"
}

# session_alive NAME — succeed iff a tmux session called NAME exists.
session_alive() { tmux has-session -t "$1" 2>/dev/null; }

# Build the per-session kickoff (phase preamble + base role prompt) and launch claude interactively.
# role ∈ {builder, adversary}. Passed as a POSITIONAL arg via inner $(cat ...) — never stdin
# (piping forces print mode and breaks /loop + remote-control).
# start_agent ROLE SESSION WORKDIR — launch one agent loop in a detached tmux session.
#   ROLE     builder | adversary (selects $PLAN_DIR/prompts/ROLE.md)
#   SESSION  tmux session name
#   WORKDIR  the agent's repo clone (created if missing; used as the session cwd)
# No-op (status 0) when SESSION already exists. Otherwise writes the kickoff text to
# $LOG_DIR/.kickoff-SESSION.txt, starts claude with that text as a positional argument,
# and appends the pane output to $LOG_DIR/SESSION.log.
start_agent() {
  local role="$1" session="$2" workdir="$3"
  if session_alive "$session"; then
    log "$session already running — leaving it"
    return 0
  fi
  mkdir -p "$workdir"

  local idx pid plan status kf
  idx="$(cur_idx)"
  pid="$(phase_id "$idx")"
  plan="$(phase_plan "$idx")"
  status="$(phase_status "$idx")"
  kf="$LOG_DIR/.kickoff-$session.txt"

  # Kickoff = phase preamble + base role prompt.
  # NOTE(review): the original preamble here-doc was lost from this copy of the file
  # (only `{ cat < "$kf"` survived, which does not parse). The three lines below are a
  # minimal stand-in built from the values this function computes for it; restore the
  # original wording from version control before relying on it.
  {
    printf 'CURRENT PHASE: %s\n' "$pid"
    printf 'Phase plan: %s\n' "$PLAN_DIR/$plan"
    printf 'Phase status file: %s\n\n' "$status"
    cat "$PLAN_DIR/prompts/$role.md"
  } > "$kf"

  log "starting $session (phase=$pid, plan=$plan, cwd=$workdir, rc=$REMOTE_CONTROL)"
  local rc=""
  if [[ "$REMOTE_CONTROL" == "1" ]]; then
    rc="--remote-control '$session'"
  fi
  # The command string is evaluated by the shell tmux spawns, so the kickoff file is
  # read there and handed to claude as one positional argument (never via stdin).
  tmux new-session -d -s "$session" -c "$workdir" \
    "$CLAUDE_BIN $rc $CLAUDE_FLAGS \"\$(cat '$kf')\""
  tmux pipe-pane -o -t "$session" "cat >> '$LOG_DIR/$session.log'"
}

# start_loops — start the Builder and the Adversary (each is skipped if already running).
start_loops() {
  start_agent builder "$BUILDER_SESSION" "$BUILDER_DIR"
  start_agent adversary "$ADV_SESSION" "$ADV_DIR"
}

# stop_loops — kill the Builder and Adversary tmux sessions that exist; kill failures are ignored.
stop_loops() {
  local s
  for s in "$BUILDER_SESSION" "$ADV_SESSION"; do
    if session_alive "$s"; then
      log "killing $s"
      tmux kill-session -t "$s" || true
    fi
  done
}

# Wake a loop by typing a one-line message into its tmux session (queues if mid-turn).
# ping_session SESSION MSG — best-effort: always returns 0. Callers invoke this as a plain
# statement under `set -e`, so a failed send-keys (e.g. the session vanished after the
# liveness check) must not surface as a non-zero status and terminate the watchdog.
ping_session() {
  local s="$1" msg="$2"
  session_alive "$s" || return 0
  if tmux send-keys -t "$s" -l -- "$msg" 2>/dev/null; then
    sleep 0.3   # brief pause before submitting the typed text with Enter
    tmux send-keys -t "$s" Enter 2>/dev/null || true
  fi
  return 0
}

# A loop can stall ALIVE on a usage/spend-limit notice: the claude process stays up (so the
# dead-session restart never fires) but makes no progress, and the /loop self-pacing is dead because
# the limit interrupted the turn that would have scheduled the next tick. Detect that signature
# (limit text present + no active-turn marker) and re-nudge it each heavy tick — once the limit resets
# the next nudge lands and the loop resumes. Gated on the limit text so we NEVER nudge a loop that is
# just legitimately idle-waiting on a handoff.
# Case-insensitive ERE for the usage/spend-limit notices that leave a loop alive but stalled.
LIMIT_RE='spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)'

# FATAL = an unrecoverable session-state API error that recurs on EVERY turn (so the session stays
# alive but wedged — a nudge can't fix it; only a fresh session can). The confirmed case: the
# "thinking/redacted_thinking blocks ... cannot be modified" 400 that has hit the Adversary
# repeatedly (interrupted-mid-thinking corrupts the replayed history). Kill + restart fresh; the loop
# re-orients from the repo. Matched conservatively so it never fires on transient/working states.
FATAL_RE='redacted_thinking|blocks cannot be modified|cannot be modified'

# Heal one loop session: dead -> restart; wedged on a FATAL error -> kill + restart fresh; stalled on
# a usage limit -> nudge. No-op while actively working ("esc to interrupt" on screen).
#
# heal_session ROLE SESSION DIR — always returns 0. The watchdog calls this as a plain statement
# under `set -e`, so a failed nudge or failed restart is logged/ignored here instead of being
# propagated (which would terminate the supervisor); the next heavy tick simply tries again.
heal_session() {
  local role="$1" s="$2" dir="$3" pane

  if ! session_alive "$s"; then
    log "$role ($s) gone — restarting (phase $(phase_id "$(cur_idx)"))"
    start_agent "$role" "$s" "$dir" \
      || log "restart of $role ($s) failed — will retry on the next heavy check"
    return 0
  fi

  # Only the last 25 lines of the visible pane are inspected.
  pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -25 || true)"

  # actively working — leave alone
  if grep -q 'esc to interrupt' <<< "$pane"; then
    return 0
  fi

  if grep -qiE "$FATAL_RE" <<< "$pane"; then
    log "FATAL session-state error on $role ($s) — kill + restart fresh (re-orients from repo)"
    tmux kill-session -t "$s" 2>/dev/null || true
    start_agent "$role" "$s" "$dir" \
      || log "restart of $role ($s) failed — will retry on the next heavy check"
    return 0
  fi

  if grep -qiE "$LIMIT_RE" <<< "$pane"; then
    log "limit-stall detected on $role ($s) — re-nudging to resume"
    ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing." || true
  fi
  return 0
}

# Is an orchestrator process alive ANYWHERE?
# Conflict-safety: we must NEVER launch a second
# orchestrator that resumes the same conversation while one is already running (that double-resume is
# the likely cause of the "thinking blocks cannot be modified" crashes). The orchestrator may be
# running as a managed tmux session (cc-ci-orchestrator) OR as a plain terminal session the operator
# started by hand (no flags). So: alive iff any `claude` process exists that is NOT one of the two
# loop sessions (identified by their --remote-control name), or the managed tmux session exists.
#
# orchestrator_alive — status 0 when an orchestrator is considered alive, 1 otherwise.
# Reads /proc/<pid>/cmdline for each process that `pgrep -x claude` reports.
orchestrator_alive() {
  local pid args
  while IFS= read -r pid; do
    [[ -n "$pid" ]] || continue
    # The process may have exited since pgrep listed it; without this check the empty
    # command line would fall through and be reported as a live orchestrator.
    [[ -r "/proc/$pid/cmdline" ]] || continue
    args="$(tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null || true)"
    [[ -n "$args" ]] || continue   # empty cmdline: nothing to classify
    # skip the two loops (matched by their remote-control session NAME, not a stray path mention)
    if grep -qE -- "--remote-control +'?cc-ci-(builder|adv)'?" <<< "$args"; then
      continue
    fi
    return 0   # a non-loop claude process => orchestrator (or operator) is alive
  done < <(pgrep -x claude 2>/dev/null || true)

  if tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
    return 0
  fi
  return 1
}

# Keep the orchestrator alive: restart it (via launch-orchestrator.sh, which resumes its session) ONLY
# when none is running; if it's the managed tmux session and wedged on a FATAL error, kill+restart.
# heal_orchestrator — supervise the orchestrator session; always returns 0.
#   * Does nothing when WATCH_ORCHESTRATOR is not "1" or ORCH_LAUNCHER is not executable.
#   * No orchestrator alive anywhere  -> run "$ORCH_LAUNCHER start".
#   * Alive as the managed tmux session, not mid-turn, and showing a FATAL_RE match in the
#     last 25 pane lines -> kill that session and run "$ORCH_LAUNCHER start".
#   * Any other alive state (including a hand-started one) is left untouched.
# Launcher output is discarded and launcher/kill failures are ignored.
heal_orchestrator() {
  local screen

  if [[ "$WATCH_ORCHESTRATOR" != "1" || ! -x "$ORCH_LAUNCHER" ]]; then
    return 0
  fi

  if ! orchestrator_alive; then
    log "orchestrator not running anywhere — restarting via $ORCH_LAUNCHER"
    "$ORCH_LAUNCHER" start >/dev/null 2>&1 || true
    return 0
  fi

  # Alive, but only the managed tmux session can be inspected (and restarted) from here.
  if ! tmux has-session -t "$ORCH_SESSION" 2>/dev/null; then
    return 0
  fi

  screen="$(tmux capture-pane -pt "$ORCH_SESSION" 2>/dev/null | tail -25 || true)"

  # working — leave alone
  if grep -q 'esc to interrupt' <<< "$screen"; then
    return 0
  fi

  if grep -qiE "$FATAL_RE" <<< "$screen"; then
    log "FATAL session-state error on orchestrator ($ORCH_SESSION) — kill + restart fresh"
    tmux kill-session -t "$ORCH_SESSION" 2>/dev/null || true
    "$ORCH_LAUNCHER" start >/dev/null 2>&1 || true
  fi
  return 0
}

# Edge-triggered handoff signalling for the CURRENT phase. Reads the loops' local clones.
# Ping the Adversary only when a gate id NEWLY appears on a "CLAIMED … awaiting" line (never on
# the baseline / restart / a passed-but-kept line). Ping the Builder when the phase REVIEW changes.
# Watchdog-private handoff state (lives only in the watchdog process):
#   _wd_awaiting           gate ids seen on "CLAIMED … awaiting" lines at the previous check
#   _wd_baselined          non-empty once the first status snapshot has been taken
#   _wd_last_review        md5 of the phase REVIEW file at the previous check
#   _wd_*_inbox_seen       md5 of the last inbox content that was already pinged about
_wd_awaiting=""; _wd_baselined=""; _wd_last_review=""
_wd_adv_inbox_seen=""; _wd_builder_inbox_seen=""

# handoff_reset — forget all handoff state; call on phase transition.
handoff_reset() {
  _wd_awaiting=""; _wd_baselined=""; _wd_last_review=""
  _wd_adv_inbox_seen=""; _wd_builder_inbox_seen=""
}

# handoff_check — one edge-triggered signalling pass for the current phase.
# Reads the loops' local clones and pings (via ping_session) when:
#   * a gate id newly appears on a "CLAIMED … awaiting" line of the phase status file
#     (Builder clone) -> Adversary; the first snapshot only sets the baseline;
#   * the phase REVIEW file (Adversary clone) changes after it was first seen -> Builder;
#   * an inbox file appears or changes content -> its receiver.
# Pings are best-effort: `|| true` keeps a failed ping from terminating the watchdog
# under `set -e`.
handoff_check() {
  local idx sf rf cur now added
  idx="$(cur_idx)"
  sf="$(resolve_state "$BUILDER_DIR" "$(phase_status "$idx")")"
  rf="$(resolve_state "$ADV_DIR" "$(phase_review "$idx")")"

  if [[ -f "$sf" ]]; then
    # Gate ids (upper-cased, de-duplicated) found on lines matching "CLAIMED.*awaiting".
    now="$(grep -iE 'CLAIMED.*awaiting' "$sf" 2>/dev/null | grep -oiE 'M[0-9]+(\.[0-9]+)?|[A-Z][0-9]+' | tr '[:lower:]' '[:upper:]' | sort -u || true)"
    if [[ -n "$_wd_baselined" ]]; then
      # Ids present now but absent from the previous snapshot.
      added="$(comm -13 <(printf '%s\n' "$_wd_awaiting" | sort -u) <(printf '%s\n' "$now" | sort -u) | grep -vE '^$' || true)"
      if [[ -n "$added" ]]; then
        added="${added//$'\n'/ }"   # one id per line -> single space-separated line
        log "handoff: gate(s) newly awaiting verification: $added -> pinging Adversary"
        ping_session "$ADV_SESSION" "watchdog ping: the Builder CLAIMED gate(s) [$added] in $(phase_status "$idx") and is awaiting your verification. Pull and verify now." || true
      fi
    fi
    _wd_awaiting="$now"; _wd_baselined=1
  fi

  if [[ -f "$rf" ]]; then
    cur="$(md5sum "$rf" 2>/dev/null | awk '{print $1}' || true)"
    if [[ -n "$cur" && "$cur" != "$_wd_last_review" ]]; then
      # The first hash seen is only recorded; later changes trigger the ping.
      if [[ -n "$_wd_last_review" ]]; then
        log "handoff: $(phase_review "$idx") changed -> pinging Builder"
        ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary updated $(phase_review "$idx") (a verdict or finding). Pull and act now — if it PASSes your gate, proceed; if it's a finding, address it." || true
      fi
      _wd_last_review="$cur"
    fi
  fi

  # INBOX side-channel (§6.1). The sender writes the receiver's inbox in their OWN clone, so we
  # detect from the sender side. Edge-trigger on content hash so a fresh message (sender re-wrote
  # before receiver consumed) re-pings. Receiver deletes after processing => hash empty => next
  # write re-triggers.
  local adv_inbox builder_inbox h
  adv_inbox="$(resolve_state "$BUILDER_DIR" "ADVERSARY-INBOX.md")"
  if [[ -f "$adv_inbox" ]]; then
    h="$(md5sum "$adv_inbox" 2>/dev/null | awk '{print $1}' || true)"
    if [[ -n "$h" && "$h" != "$_wd_adv_inbox_seen" ]]; then
      log "handoff: ADVERSARY-INBOX.md new/changed -> pinging Adversary"
      ping_session "$ADV_SESSION" "watchdog ping: the Builder wrote machine-docs/ADVERSARY-INBOX.md — pull, read the message, act on it, then delete the file (commit + push) to mark it consumed." || true
      _wd_adv_inbox_seen="$h"
    fi
  else
    _wd_adv_inbox_seen=""   # consumed; ready for the next write
  fi

  builder_inbox="$(resolve_state "$ADV_DIR" "BUILDER-INBOX.md")"
  if [[ -f "$builder_inbox" ]]; then
    h="$(md5sum "$builder_inbox" 2>/dev/null | awk '{print $1}' || true)"
    if [[ -n "$h" && "$h" != "$_wd_builder_inbox_seen" ]]; then
      log "handoff: BUILDER-INBOX.md new/changed -> pinging Builder"
      ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary wrote machine-docs/BUILDER-INBOX.md — pull, read the message, act on it, then delete the file (commit + push) to mark it consumed." || true
      _wd_builder_inbox_seen="$h"
    fi
  else
    _wd_builder_inbox_seen=""
  fi
}

# watchdog_loop — the supervision loop; runs until the phase sequence completes.
# Every SIGNAL_INTERVAL seconds: handoff_check. Every WATCH_INTERVAL seconds (and on the
# very first pass) the heavy check: if the current phase is DONE, either advance to the
# next phase (stop loops, persist the new index, reset handoff state, start loops) or —
# after the last phase — stop the loops, write $LOG_DIR/SEQUENCE-COMPLETE and exit 0;
# otherwise heal both loop sessions and the orchestrator.
watchdog_loop() {
  local idx pid status next
  idx="$(cur_idx)"; pid="$(phase_id "$idx")"
  log "watchdog up (phase=$pid [$((idx+1))/${#PHASES[@]}], seq='$(all_ids)', signal=${SIGNAL_INTERVAL}s, heavy=${WATCH_INTERVAL}s)"
  local elapsed="$WATCH_INTERVAL"   # start "due" so the first pass runs the heavy check
  while true; do
    handoff_check
    if (( elapsed >= WATCH_INTERVAL )); then
      elapsed=0
      idx="$(cur_idx)"; pid="$(phase_id "$idx")"; status="$(phase_status "$idx")"
      if phase_done "$status"; then
        next=$((idx + 1))
        if (( next < ${#PHASES[@]} )); then
          log "PHASE $pid DONE (## DONE in $status) — auto-transitioning to $(phase_id "$next")."
          stop_loops
          echo "$next" > "$PHASE_IDX_FILE"
          handoff_reset
          start_loops
        else
          # Range is derived from the configured sequence rather than hard-coded.
          log "PHASE SEQUENCE COMPLETE (last phase $pid DONE). Stopping loops — entire build ($(phase_id 0)→$pid) finished."
          stop_loops
          printf 'cc-ci phase sequence complete %(%F %T)T. Phases: %s. Loops stopped; entire build finished.\n' -1 "$(all_ids)" > "$LOG_DIR/SEQUENCE-COMPLETE"
          log "watchdog exiting."
          exit 0
        fi
      else
        heal_session builder "$BUILDER_SESSION" "$BUILDER_DIR"
        heal_session adversary "$ADV_SESSION" "$ADV_DIR"
        heal_orchestrator
      fi
    fi
    sleep "$SIGNAL_INTERVAL"
    elapsed=$(( elapsed + SIGNAL_INTERVAL ))
  done
}

# start_watchdog — run "$SELF watchdog" in a detached tmux session (cwd PLAN_DIR) with its
# stdout+stderr appended to $LOG_DIR/watchdog.log; no-op if that session already exists.
start_watchdog() {
  if session_alive "$WATCHDOG_SESSION"; then
    log "watchdog already running"
    return 0
  fi
  log "starting watchdog"
  tmux new-session -d -s "$WATCHDOG_SESSION" -c "$PLAN_DIR" \
    "exec >>'$LOG_DIR/watchdog.log' 2>&1; '$SELF' watchdog"
}

# cmd_status — print the current phase, the state of the three tmux sessions, whether the
# phase is DONE, and the SEQUENCE-COMPLETE marker text when that file exists.
# Returns 0 whether or not the marker exists (a trailing `[[ … ]] && echo` would return 1
# in the normal no-marker case and make `./launch.sh status` exit non-zero under `set -e`).
cmd_status() {
  local idx pid s
  idx="$(cur_idx)"; pid="$(phase_id "$idx")"
  echo "  phase: $pid  [$((idx+1))/${#PHASES[@]}]   plan=$(phase_plan "$idx")   status=$(phase_status "$idx")"
  for s in "$BUILDER_SESSION" "$ADV_SESSION" "$WATCHDOG_SESSION"; do
    if session_alive "$s"; then echo "  $s: RUNNING"; else echo "  $s: stopped"; fi
  done
  if phase_done "$(phase_status "$idx")"; then
    echo "  phase $pid: ## DONE"
  else
    echo "  phase $pid: in progress"
  fi
  if [[ -f "$LOG_DIR/SEQUENCE-COMPLETE" ]]; then
    echo "  >>> $(cat "$LOG_DIR/SEQUENCE-COMPLETE")"
  fi
  return 0
}

# ----- command dispatch ---------------------------------------------------
case "${1:-}" in
  start)
    preflight
    # Fresh sequence: stop any running loops, reset to phase 0 (unless RESUME_PHASE=1 keeps the idx).
    stop_loops
    if [[ "${RESUME_PHASE:-}" != "1" ]]; then
      echo 0 > "$PHASE_IDX_FILE"
    fi
    rm -f "$LOG_DIR/SEQUENCE-COMPLETE"
    start_loops
    start_watchdog
    log "started at phase $(phase_id "$(cur_idx)"). status: ./launch.sh status | attach: tmux attach -t $BUILDER_SESSION"
    ;;
  watchdog)
    preflight
    watchdog_loop
    ;;
  status)
    cmd_status
    ;;
  logs)
    case "${2:-}" in
      builder)   tail -f "$LOG_DIR/$BUILDER_SESSION.log" ;;
      adversary) tail -f "$LOG_DIR/$ADV_SESSION.log" ;;
      watchdog)  tail -f "$LOG_DIR/watchdog.log" ;;
      *) die "usage: $0 logs builder|adversary|watchdog" ;;
    esac
    ;;
  stop)
    stop_loops
    if session_alive "$WATCHDOG_SESSION"; then
      log "killing $WATCHDOG_SESSION"
      tmux kill-session -t "$WATCHDOG_SESSION" || true
    fi
    log "stopped."
    ;;
  *)
    # NOTE(review): the original usage here-doc is truncated in this copy of the file
    # (only `cat <` survives). The text below is rebuilt from the Usage section of the
    # file header, and the exit status is an assumption — confirm both against version control.
    cat <<EOF
usage: $0 <command>
  start                            start the sequence at phase 0 + watchdog (RESUME_PHASE=1 keeps the current phase)
  watchdog                         run only the supervision loop in the foreground
  status                           show phase + session + DONE state
  logs builder|adversary|watchdog  tail a session/log
  stop                             stop both loops + watchdog
EOF
    exit 1
    ;;
esac