The weekly upgrade run now executes inside a dedicated, remote-control agent (cc-ci-upgrader) — viewable/steerable at claude.ai/code like the Builder — rather than buried in headless cron output. - launch-upgrader.sh: spins up the cc-ci-upgrader tmux session under --remote-control with a kickoff that runs /upgrade-all (DEFAULT mode) to completion. On finish the agent STOPS and stays idle (does NOT self-terminate) so the run + summary stay reviewable in the web UI. `start` = use-or-create: leaves an in-flight (busy) run alone, else clears a finished/idle/wedged session and runs fresh; `fresh` always restarts. UPGRADER_ARGS passes flags (e.g. --dry-run); never --with-tests. - launch.sh: orchestrator_alive() now also skips the cc-ci-upgrader remote-control name, so the upgrader job isn't mistaken for the orchestrator. - upgrade-all skill: documents it runs as the cc-ci-upgrader agent; the weekly cron invokes `launch-upgrader.sh start` (not /upgrade-all inline). - Phase 5: V8a verifies the agent lifecycle (launch → run to completion → stay idle/viewable → next start clears it); V9 stops the verification session. - cron memory: weekly task = launch-upgrader.sh start at 0 3 * * 6 UTC. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
444 lines
24 KiB
Bash
Executable File
444 lines
24 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
|
|
# launch.sh — start and supervise the two cc-ci autonomous loops + a phase-aware watchdog.
|
|
#
|
|
# Model (see plan.md §6 / §6.1): two INDEPENDENT Claude Code sessions —
|
|
# • Builder (tmux session: cc-ci-builder) working clone /srv/cc-ci/cc-ci
|
|
# • Adversary (tmux session: cc-ci-adv) working clone /srv/cc-ci/cc-ci-adv
|
|
# coordinating only through the git repo on git.autonomic.zone.
|
|
#
|
|
# PHASES: the watchdog runs an ordered sequence of sub-phases (default: 1c → 1b → 1d → 1e → 2w → 2pc → 2 → 2b → 3 → 4 → 5;
|
|
# 2w = warm-canonical/--quick, interjected; Phase 2 pauses for it then resumes).
|
|
# Each phase has its own plan + phase-namespaced loop-state files (STATUS-<id>.md etc.). When a phase's
|
|
# STATUS-<id>.md shows "## DONE", the watchdog AUTO-TRANSITIONS to the next phase; after the LAST
|
|
# phase (5, verify-upgrade-flow, in the default sequence) it STOPS the loops and exits (end of the whole build).
|
|
#
|
|
# Four jobs: ITERATION (each agent's /loop), RESILIENCE (restart a dead loop), HANDOFF SIGNALLING
|
|
# (ping the waiting loop the moment its counterpart hands off), PHASE SEQUENCING (this file).
|
|
#
|
|
# Usage:
|
|
# ./launch.sh start # start the sequence at phase 0 + watchdog (stops/relaunches loops)
|
|
# ./launch.sh watchdog # run only the supervision loop in the foreground
|
|
# ./launch.sh status # show phase + session + DONE state
|
|
# ./launch.sh logs builder|adversary|watchdog # tail a session/log
|
|
# ./launch.sh stop # stop both loops + watchdog
|
|
|
|
set -euo pipefail

# Absolute path to this script, so the watchdog re-invokes it correctly regardless of cwd.
SELF="$(readlink -f "${BASH_SOURCE[0]}")"

# ----- config -------------------------------------------------------------
# Every setting below can be overridden from the environment; the value after ':-' is the default.
PLAN_DIR="${PLAN_DIR:-/srv/cc-ci/cc-ci-plan}"
CLAUDE_BIN="${CLAUDE_BIN:-claude}"
CLAUDE_FLAGS="${CLAUDE_FLAGS:---dangerously-skip-permissions}"
# REMOTE_CONTROL=1 → interactive --remote-control sessions (viewable at claude.ai/code), required
# for /loop. The box must be logged into the claude.ai account. =0 for plain interactive.
REMOTE_CONTROL="${REMOTE_CONTROL:-1}"

BUILDER_DIR="${BUILDER_DIR:-/srv/cc-ci/cc-ci}" # Builder's repo clone
ADV_DIR="${ADV_DIR:-/srv/cc-ci/cc-ci-adv}" # Adversary's repo clone
LOG_DIR="${LOG_DIR:-/srv/cc-ci/.cc-ci-logs}"

WATCH_INTERVAL="${WATCH_INTERVAL:-300}" # seconds between HEAVY checks (phase DONE / restart dead loops)
SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}" # seconds between HANDOFF checks (ping the waiting loop)
STALL_IDLE="${STALL_IDLE:-300}" # NO-marker case: seconds a loop may sit idle (turn ended
# without declaring a wait) before the watchdog reboots it
STALL_GRACE="${STALL_GRACE:-180}" # marker case: seconds PAST a loop's WAITING-UNTIL before
# reboot. The real ScheduleWakeup fires AT the stated time;
# grace covers wake+start latency + marker/scheduler skew so
# the watchdog never RACES (pre-empts) a healthy self-wake.

BUILDER_SESSION="cc-ci-builder"
ADV_SESSION="cc-ci-adv"
WATCHDOG_SESSION="cc-ci-watchdog"
# Orchestrator (supervisory session) — the watchdog keeps it alive too, via launch-orchestrator.sh.
ORCH_SESSION="${ORCH_SESSION:-cc-ci-orchestrator}"
ORCH_LAUNCHER="${ORCH_LAUNCHER:-$PLAN_DIR/launch-orchestrator.sh}"
# Watchdog supervision of the orchestrator can be disabled (=0) if you run the orchestrator yourself
# and don't want it auto-(re)launched.
WATCH_ORCHESTRATOR="${WATCH_ORCHESTRATOR:-1}"

# Ordered phase sequence: each entry "id|planfile|statusbasename". The watchdog runs them in order,
# auto-transitions on the phase's "## DONE" (in BUILDER_DIR/<statusbasename>), and STOPS after the
# last one (manual gate). Override PHASES_SPEC (semicolon-separated) to change the sequence.
PHASES_SPEC="${PHASES_SPEC:-1c|plan-phase1c-full-reproducibility.md|STATUS-1c.md;1b|plan-phase1b-review-lint.md|STATUS-1b.md;1d|plan-phase1d-generic-test-suite.md|STATUS-1d.md;1e|plan-phase1e-harness-corrections.md|STATUS-1e.md;2w|plan-phase2w-warm-canonical-quick.md|STATUS-2w.md;2pc|plan-phase2pc-image-cache.md|STATUS-2pc.md;2|plan-phase2-recipe-tests.md|STATUS-2.md;2b|plan-phase2b-test-performance.md|STATUS-2b.md;3|plan-phase3-results-ux.md|STATUS-3.md;4|plan-phase4-final-review-polish-cleanup.md|STATUS-4.md;5|plan-phase5-verify-upgrade-flow.md|STATUS-5.md}"
# Split the spec on ';' into the PHASES array (one "id|plan|status" entry per element).
IFS=';' read -r -a PHASES <<< "$PHASES_SPEC"
# File holding the zero-based index of the current phase (read by cur_idx, written on transition).
PHASE_IDX_FILE="${PHASE_IDX_FILE:-$LOG_DIR/.phase-idx}"
|
|
# --------------------------------------------------------------------------
|
|
|
|
# Print one timestamped line ("[launch HH:MM:SS] <message>") to stdout; all arguments are
# joined with spaces into the message.
log() {
  local msg="$*"
  printf '[launch %(%H:%M:%S)T] %s\n' -1 "$msg"
}
|
|
# Report a fatal error and terminate the script with exit status 1.
# The message goes to stderr (via log) so it never mixes into captured stdout.
die() {
  log "ERROR: $*" >&2
  exit 1
}
|
|
# Require that command $1 is resolvable; otherwise abort through die.
need() {
  local tool="$1"
  if command -v "$tool" >/dev/null 2>&1; then
    return 0
  fi
  die "missing dependency: $tool"
}
|
|
|
|
# ----- phase helpers ------------------------------------------------------
|
|
# Echo the current zero-based phase index read from PHASE_IDX_FILE.
# A missing, unreadable, empty or non-numeric file yields 0. The value is normalised to plain
# base-10 so a zero-padded index such as "08" cannot later be rejected as invalid octal when
# callers use it in arithmetic ($((idx+1))).
cur_idx() {
  local raw
  raw="$(cat "$PHASE_IDX_FILE" 2>/dev/null || echo 0)"
  [[ "$raw" =~ ^[0-9]+$ ]] || raw=0
  printf '%d\n' "$(( 10#$raw ))"
}
|
|
# Echo the phase id: field 1 of the '|'-separated PHASES entry at index $1.
phase_id() {
  local entry="${PHASES[$1]}"
  cut -d'|' -f1 <<<"$entry"
}
|
|
# Echo the plan file name: field 2 of the '|'-separated PHASES entry at index $1.
phase_plan() {
  local entry="${PHASES[$1]}"
  cut -d'|' -f2 <<<"$entry"
}
|
|
# Echo the status file basename: field 3 of the '|'-separated PHASES entry at index $1.
phase_status() {
  local entry="${PHASES[$1]}"
  cut -d'|' -f3 <<<"$entry"
}
|
|
# Echo the review file basename ("REVIEW-<id>.md") for the phase at index $1.
phase_review() {
  local id
  id="$(phase_id "$1")"
  printf 'REVIEW-%s.md\n' "$id"
}
|
|
# Loop-state files may sit at the repo root OR under machine-docs/ (the 1b RL6 move). Prefer
|
|
# machine-docs/ if present, else root — so the watchdog survives the move whenever it happens.
|
|
# Echo the path of loop-state file $2 inside clone $1: the machine-docs/ copy when that file
# exists, otherwise the path at the clone root (whether or not it exists).
resolve_state() {
  local dir="$1" base="$2"
  local preferred="$dir/machine-docs/$base"
  if [[ ! -f "$preferred" ]]; then
    printf '%s\n' "$dir/$base"
    return
  fi
  printf '%s\n' "$preferred"
}
|
|
# Succeed iff the Builder clone's status file ($1 = status basename, read locally) contains a
# line starting with "##", whitespace, then "DONE". A missing file counts as not done.
phase_done() {
  local status_path
  status_path="$(resolve_state "$BUILDER_DIR" "$1")"
  grep -qE '^##[[:space:]]+DONE' "$status_path" 2>/dev/null
}
|
|
# Print every phase id in sequence order, each followed by a single space (no newline).
all_ids() {
  local entry
  for entry in "${PHASES[@]}"; do
    printf '%s ' "${entry%%|*}"
  done
}
|
|
|
|
# Verify everything the launcher needs before starting anything:
#   - tmux and the claude CLI (CLAUDE_BIN) are resolvable,
#   - every phase's plan file exists under PLAN_DIR,
#   - both role prompts exist under PLAN_DIR/prompts,
# then create LOG_DIR. Any failed check aborts the script through die.
preflight() {
  need tmux
  command -v "$CLAUDE_BIN" >/dev/null 2>&1 || die "claude CLI not found (set CLAUDE_BIN)"
  local p plan
  for p in "${PHASES[@]}"; do
    # Field 2 of "id|planfile|statusbasename" is the plan file name.
    plan="$(echo "$p" | cut -d'|' -f2)"
    [[ -f "$PLAN_DIR/$plan" ]] || die "missing phase plan $PLAN_DIR/$plan"
  done
  [[ -f "$PLAN_DIR/prompts/builder.md" ]] || die "missing $PLAN_DIR/prompts/builder.md"
  [[ -f "$PLAN_DIR/prompts/adversary.md" ]] || die "missing $PLAN_DIR/prompts/adversary.md"
  mkdir -p "$LOG_DIR"
}
|
|
|
|
# Succeed iff tmux reports a session for target $1 (tmux's own error output is discarded).
session_alive() {
  local target="$1"
  tmux has-session -t "$target" 2>/dev/null
}
|
|
|
|
# Build the per-session kickoff (phase preamble + base role prompt) and launch claude interactively.
|
|
# role ∈ {builder, adversary}. Passed as a POSITIONAL arg via inner $(cat ...) — never stdin
|
|
# (piping forces print mode and breaks /loop + remote-control).
|
|
start_agent() {
  # Args: $1 = role (builder|adversary; selects PLAN_DIR/prompts/<role>.md),
  #       $2 = tmux session name, $3 = working directory (created if absent).
  # No-op when the session already exists. Otherwise writes the kickoff text (phase preamble +
  # role prompt) to LOG_DIR/.kickoff-<session>.txt, starts claude detached in tmux with that
  # text as its positional argument, and pipes the pane output to LOG_DIR/<session>.log.
  local role="$1" session="$2" workdir="$3"
  if session_alive "$session"; then
    log "$session already running — leaving it"
    return 0
  fi
  mkdir -p "$workdir"

  local idx pid plan status kf
  idx="$(cur_idx)"
  pid="$(phase_id "$idx")"
  plan="$(phase_plan "$idx")"
  status="$(phase_status "$idx")"
  kf="$LOG_DIR/.kickoff-$session.txt"
  # The preamble points at ${PLAN_DIR} (not a hard-coded path) so an overridden PLAN_DIR sends
  # the agents to the same plan files that preflight validated.
  {
    cat <<PREAMBLE
*** cc-ci SUB-PHASE ${pid} ***
SINGLE SOURCE OF TRUTH for THIS phase: ${PLAN_DIR}/${plan} — read it in full now; it defines this phase's mission and Definition of Done.
The general loop protocol still applies and lives in ${PLAN_DIR}/plan.md (§6.1 coordination, §7 pacing, §9 guardrails) — read those sections too.
Track loop state in PHASE-NAMESPACED files in your repo clone: ${status}, BACKLOG-${pid}.md, REVIEW-${pid}.md, JOURNAL-${pid}.md. DECISIONS.md is shared (append).
"Done" for this phase = the Builder writes "## DONE" to ${status} ONLY after every Definition-of-Done item is Adversary-verified with a fresh PASS in REVIEW-${pid}.md (handshake per §6.1).
The repo's Phase-1 STATUS.md / BACKLOG.md / REVIEW.md are HISTORY from the completed Phase 1 — do NOT use them as your state; use the phase-namespaced files above.
Wherever the standing rules below say "plan.md"/"STATUS.md"/"BACKLOG.md"/"REVIEW.md", substitute the phase plan and these phase-namespaced files.

=== standing role & rules ===
PREAMBLE
    cat "$PLAN_DIR/prompts/$role.md"
  } > "$kf"

  log "starting $session (phase=$pid, plan=$plan, cwd=$workdir, rc=$REMOTE_CONTROL)"
  local rc=""
  if [[ "$REMOTE_CONTROL" == "1" ]]; then
    rc="--remote-control '$session'"
  fi
  # The command is one string that tmux hands to a shell; the escaped \$(cat ...) is expanded
  # there, so the kickoff text reaches claude as a single positional argument (never stdin).
  tmux new-session -d -s "$session" -c "$workdir" \
    "$CLAUDE_BIN $rc $CLAUDE_FLAGS \"\$(cat '$kf')\""
  tmux pipe-pane -o -t "$session" "cat >> '$LOG_DIR/$session.log'"
}
|
|
|
|
# Start both loop agents (Builder first, then Adversary), each in its own tmux session and
# repo clone. start_agent leaves an already-running session untouched.
start_loops() {
  start_agent builder "$BUILDER_SESSION" "$BUILDER_DIR"
  start_agent adversary "$ADV_SESSION" "$ADV_DIR"
}
|
|
|
|
# Kill the Builder and Adversary tmux sessions if they exist. A failed kill is ignored.
# The watchdog session is not touched here.
stop_loops() {
  local s
  for s in "$BUILDER_SESSION" "$ADV_SESSION"; do
    if session_alive "$s"; then
      log "killing $s"
      tmux kill-session -t "$s" || true
    fi
  done
}
|
|
|
|
# Wake a loop by typing a one-line message into its tmux session (queues if mid-turn).
|
|
ping_session() {
  # Args: $1 = tmux session name, $2 = one-line message to type into it.
  # Best-effort: always returns 0. The session can disappear between the liveness check and
  # send-keys; returning that failure would abort the caller (and the watchdog) under `set -e`.
  local s="$1" msg="$2"
  session_alive "$s" || return 0
  # -l sends the text literally; Enter is sent separately after a short pause.
  if tmux send-keys -t "$s" -l -- "$msg" 2>/dev/null; then
    sleep 0.3
    tmux send-keys -t "$s" Enter 2>/dev/null || true
  fi
  return 0
}
|
|
|
|
# A loop can stall ALIVE on a usage/spend-limit notice: the claude process stays up (so the
|
|
# dead-session restart never fires) but makes no progress, and the /loop self-pacing is dead because
|
|
# the limit interrupted the turn that would have scheduled the next tick. Detect that signature
|
|
# (limit text present + no active-turn marker) and re-nudge it each heavy tick — once the limit resets
|
|
# the next nudge lands and the loop resumes. Gated on the limit text so we NEVER nudge a loop that is
|
|
# just legitimately idle-waiting on a handoff.
|
|
# Extended regex, matched case-insensitively (grep -iE) against the tail of a session's pane
# by heal_session to recognise a usage/spend-limit notice.
LIMIT_RE='spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)'
|
|
# FATAL = an unrecoverable session-state API error that recurs on EVERY turn (so the session stays
|
|
# alive but wedged — a nudge can't fix it; only a fresh session can). The confirmed case: the
|
|
# "thinking/redacted_thinking blocks ... cannot be modified" 400 that has hit the Adversary
|
|
# repeatedly (interrupted-mid-thinking corrupts the replayed history). Kill + restart fresh; the loop
|
|
# re-orients from the repo. Matched conservatively so it never fires on transient/working states.
|
|
# Extended regex, matched case-insensitively (grep -iE) against the tail of a session's pane.
# NOTE(review): 'blocks cannot be modified' is already covered by the broader
# 'cannot be modified' alternative, so any pane text containing that phrase triggers a restart.
FATAL_RE='redacted_thinking|blocks cannot be modified|cannot be modified'
|
|
|
|
# Heal one loop session: dead -> restart; wedged on a FATAL error -> kill + restart fresh; stalled on
|
|
# a usage limit -> nudge. No-op while actively working ("esc to interrupt" on screen).
|
|
heal_session() {
  # Args: $1 = role (builder|adversary), $2 = tmux session name, $3 = working directory.
  # Always returns 0 unless start_agent itself fails.
  local role="$1" s="$2" dir="$3" pane
  if ! session_alive "$s"; then
    log "$role ($s) gone — restarting (phase $(phase_id "$(cur_idx)"))"
    start_agent "$role" "$s" "$dir"
    return 0
  fi

  # Only the last 25 pane lines are inspected.
  pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -25 || true)"

  # The pane text is matched in-shell / via here-strings rather than `printf | grep -q`:
  # under `set -o pipefail` an early grep exit can SIGPIPE the writer and make a successful
  # match look like a failure.
  if [[ "$pane" == *'esc to interrupt'* ]]; then
    return 0 # actively working — leave alone
  fi

  if grep -qiE -- "$FATAL_RE" <<<"$pane"; then
    log "FATAL session-state error on $role ($s) — kill + restart fresh (re-orients from repo)"
    tmux kill-session -t "$s" 2>/dev/null || true
    start_agent "$role" "$s" "$dir"
    return 0
  fi

  if grep -qiE -- "$LIMIT_RE" <<<"$pane"; then
    log "limit-stall detected on $role ($s) — re-nudging to resume"
    # The nudge is best-effort; a failed ping must not abort the watchdog.
    ping_session "$s" "watchdog: the usage/spend limit appears lifted — RESUME your loop now. Pull latest, re-read your phase STATUS/REVIEW files, and continue from where you stopped; re-arm your loop pacing." || true
  fi
  return 0
}
|
|
|
|
# --- Idle-wedge detection (complements heal_session's dead/FATAL/limit cases) ----------------------
|
|
# A loop can sit ALIVE but wedged — e.g. garbled output at the context limit — showing none of the
|
|
# heal_session signals (not dead, no FATAL string, no limit notice). The loops therefore DECLARE every
|
|
# wait with a final-line marker `WAITING-UNTIL: <ISO-8601 UTC>` and cap each wait at 10 min (plan §7).
|
|
# A healthy idle loop ALWAYS has a current marker as its last message; a wedge does not (or has one
|
|
# whose time has already passed). So: reboot a loop that is idle (no "esc to interrupt") and EITHER
# has no WAITING-UNTIL marker and has been idle >= STALL_IDLE seconds, OR is more than STALL_GRACE
# seconds past the time its marker named.
|
|
# Runs every signal tick (30 s) for fine resolution; rebooting is safe — the loop re-orients from
|
|
# git + its phase STATUS/REVIEW files.
|
|
# Associative array keyed by tmux session name; read and written by stall_check_one.
declare -A _wd_idle_since # session -> epoch first seen idle this stretch (0/unset = working)
|
|
|
|
_parse_waiting_until() { # arg1 = pane text; echoes epoch seconds of the last marker, or nothing
  # Finds the LAST "WAITING-UNTIL: <timestamp>" in the text and converts the timestamp with
  # `date -u -d` (GNU date). Prints nothing when there is no marker or the timestamp does not
  # parse. Always returns 0.
  local marker ts
  # `|| true`: with pipefail a no-match grep fails the pipeline; that must not abort a caller
  # running under `set -e` outside a command substitution.
  marker="$(grep -oE 'WAITING-UNTIL:[[:space:]]*[0-9][0-9T:Z+-]+' <<<"$1" | tail -n 1 || true)"
  [[ -n "$marker" ]] || return 0
  ts="${marker#WAITING-UNTIL:}"
  ts="${ts//[[:space:]]/}"
  date -u -d "$ts" +%s 2>/dev/null || true
}
|
|
|
|
# Reboot one loop session if it looks wedged.
# Args: $1 = role, $2 = tmux session name, $3 = working directory.
# Uses the global associative array _wd_idle_since (session -> epoch first seen idle).
#   - dead session        : clear its idle stamp and return (heal_session restarts it)
#   - "esc to interrupt"   : actively working, clear its idle stamp
#   - idle WITH a marker   : reboot only once now > marker time + STALL_GRACE
#   - idle WITHOUT a marker: reboot once idle for >= STALL_IDLE seconds
stall_check_one() {
  # wake_at (not `until`): `until` is a shell reserved word and reads ambiguously as a name.
  local role="$1" s="$2" dir="$3" pane now wake_at idle since reason
  if ! session_alive "$s"; then
    _wd_idle_since["$s"]=0 # dead => heal_session handles it
    return 0
  fi
  printf -v now '%(%s)T' -1
  pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -40 || true)"
  # In-shell match instead of `printf | grep -q`, which can misreport under pipefail.
  if [[ "$pane" == *'esc to interrupt'* ]]; then
    _wd_idle_since["$s"]=0 # actively working — not idle
    return 0
  fi

  # First idle observation of this stretch starts the idle clock.
  since="${_wd_idle_since[$s]:-0}"
  if [[ "$since" == 0 ]]; then
    since="$now"
    _wd_idle_since["$s"]="$now"
  fi
  idle=$(( now - since ))

  wake_at="$(_parse_waiting_until "$pane")"
  if [[ -n "$wake_at" ]]; then
    # Declared wait: the loop's own ScheduleWakeup fires AT 'wake_at'. Reboot ONLY once we are
    # STALL_GRACE seconds PAST it — i.e. the self-wake genuinely failed. Never reboot before/at
    # 'wake_at' (that races and pre-empts the healthy wake — the original false-reboot bug).
    (( now > wake_at + STALL_GRACE )) || return 0
    reason="past its WAITING-UNTIL by $(( now - wake_at ))s — self-wake did not fire"
  else
    # No declared wait: a turn ended without scheduling/declaring. Treat as a wedge once idle a while.
    (( idle >= STALL_IDLE )) || return 0
    reason="idle ${idle}s with no WAITING-UNTIL marker"
  fi

  log "stall: $role ($s) $reason — kill + reboot (re-orients from repo)"
  tmux kill-session -t "$s" 2>/dev/null || true
  start_agent "$role" "$s" "$dir"
  _wd_idle_since["$s"]=0
}
|
|
|
|
# Run the idle-wedge check for both loop sessions (Builder, then Adversary).
stall_check() {
  stall_check_one builder "$BUILDER_SESSION" "$BUILDER_DIR"
  stall_check_one adversary "$ADV_SESSION" "$ADV_DIR"
}
|
|
|
|
# Is an orchestrator process alive ANYWHERE? Conflict-safety: we must NEVER launch a second
|
|
# orchestrator that resumes the same conversation while one is already running (that double-resume is
|
|
# the likely cause of the "thinking blocks cannot be modified" crashes). The orchestrator may be
|
|
# running as a managed tmux session (cc-ci-orchestrator) OR as a plain terminal session the operator
|
|
# started by hand (no flags). So: alive iff any `claude` process exists that is NOT one of the
# known non-orchestrator sessions (the two loops and the cc-ci-upgrader job, identified by their
# --remote-control name), or the managed tmux session exists.
|
|
orchestrator_alive() {
  # Returns 0 when an orchestrator (or operator-run claude) appears to be running, 1 otherwise.
  # Reads process command lines from /proc/<pid>/cmdline.
  local pid args
  # Matches the --remote-control session NAME of the loops and the one-shot upgrader job
  # (optionally single-quoted), not a stray path mention — none of these is the orchestrator.
  local skip_re="--remote-control +'?cc-ci-(builder|adv|upgrader)'?"
  # Word-splitting of the pgrep output is intentional: one PID per word.
  for pid in $(pgrep -x claude 2>/dev/null); do
    # A process that exited between pgrep and this read (or has an empty command line) tells us
    # nothing; counting it as an orchestrator would wrongly suppress a needed restart.
    [[ -r "/proc/$pid/cmdline" ]] || continue
    args="$(tr '\0' ' ' 2>/dev/null < "/proc/$pid/cmdline" || true)"
    [[ -n "${args//[[:space:]]/}" ]] || continue
    [[ "$args" =~ $skip_re ]] && continue
    return 0 # a non-loop claude process => orchestrator (or operator) is alive
  done
  tmux has-session -t "$ORCH_SESSION" 2>/dev/null && return 0
  return 1
}
|
|
|
|
# Keep the orchestrator alive: restart it (via launch-orchestrator.sh, which resumes its session) ONLY
|
|
# when none is running; if it's the managed tmux session and wedged on a FATAL error, kill+restart.
|
|
heal_orchestrator() {
  # No-op unless WATCH_ORCHESTRATOR=1 and ORCH_LAUNCHER is executable. Launcher failures are
  # ignored (best-effort); always returns 0.
  [[ "$WATCH_ORCHESTRATOR" == "1" ]] || return 0
  [[ -x "$ORCH_LAUNCHER" ]] || return 0

  if ! orchestrator_alive; then
    log "orchestrator not running anywhere — restarting via $ORCH_LAUNCHER"
    "$ORCH_LAUNCHER" start >/dev/null 2>&1 || true
    return 0
  fi

  # Something is running. Only the managed tmux session has a pane to inspect for a FATAL wedge.
  tmux has-session -t "$ORCH_SESSION" 2>/dev/null || return 0
  local pane
  pane="$(tmux capture-pane -pt "$ORCH_SESSION" 2>/dev/null | tail -25 || true)"
  # In-shell / here-string matching avoids the `printf | grep -q` + pipefail false negative.
  if [[ "$pane" == *'esc to interrupt'* ]]; then
    return 0 # working — leave alone
  fi
  if grep -qiE -- "$FATAL_RE" <<<"$pane"; then
    log "FATAL session-state error on orchestrator ($ORCH_SESSION) — kill + restart fresh"
    tmux kill-session -t "$ORCH_SESSION" 2>/dev/null || true
    "$ORCH_LAUNCHER" start >/dev/null 2>&1 || true
  fi
  return 0
}
|
|
|
|
# Detect handoffs against the PUSHED origin/main — i.e. exactly what the RECEIVER will pull — NOT the
|
|
# writer's local working tree. (Reading the working tree fired on a claim/verdict the writer hadn't
|
|
# pushed yet; the receiver then pulled a stale remote, saw "no formal gate", and a clarifying
|
|
# inbox round-trip ensued. Mirroring origin/main eliminates that race.) origin/main is the shared
|
|
# branch, so all four files are read from one clone's origin/main after a single best-effort fetch.
|
|
# Best-effort quiet `git fetch origin` in clone $1; a failed fetch is ignored (always returns 0).
_wd_fetch_origin() {
  local clone="$1"
  if ! git -C "$clone" fetch -q origin 2>/dev/null; then
    return 0
  fi
}
|
|
# Print file $2 as it exists on origin/main of clone $1, trying machine-docs/$2 first and then
# the repo root. Prints nothing when neither exists; always returns 0.
_wd_show_pushed() {
  local clone="$1" base="$2"
  if ! git -C "$clone" show "origin/main:machine-docs/$base" 2>/dev/null; then
    git -C "$clone" show "origin/main:$base" 2>/dev/null || true
  fi
}
|
|
|
|
# Edge-trigger state for handoff_check: the last origin/main SHA it observed, and the md5 of the
# last pushed ADVERSARY-INBOX.md / BUILDER-INBOX.md content it pinged for (empty = none seen).
_wd_last_sha=""; _wd_adv_inbox_seen=""; _wd_builder_inbox_seen=""
|
|
# Forget all handoff edge-trigger state so the next handoff_check re-baselines silently.
# Call on phase transition.
handoff_reset() {
  _wd_last_sha=""
  _wd_adv_inbox_seen=""
  _wd_builder_inbox_seen=""
}
|
|
# Signal handoffs off the loops' CONVENTIONAL COMMIT PREFIXES on origin/main — NOT by parsing
|
|
# free-form markdown prose (brittle). The loops consistently prefix every gate claim `claim(...)`
|
|
# and every verdict/finding `review(...)`. So: a new `claim(` commit pushed => ping the Adversary;
|
|
# a new `review(` commit => ping the Builder. Edge-triggered on the origin/main SHA (append-only —
|
|
# the loops never force-push), so it can't double-fire or mis-route. INBOX files are detected
|
|
# separately (which file changed routes the ping). All reads are of the PUSHED state (what the
|
|
# receiver pulls).
|
|
handoff_check() {
  # One signalling pass over the PUSHED state (origin/main of BUILDER_DIR):
  #   1. commit subjects since the last seen SHA: a subject starting "claim" pings the
  #      Adversary, one starting "review" pings the Builder (case-insensitive);
  #   2. ADVERSARY-INBOX.md / BUILDER-INBOX.md: ping the receiver when the pushed content is
  #      new or changed (tracked by md5); re-arm when the file is absent or empty.
  # State lives in the globals _wd_last_sha, _wd_adv_inbox_seen, _wd_builder_inbox_seen.
  # Pings are best-effort (`|| true`) so a failed ping cannot abort the watchdog under `set -e`.
  local head subjects adv_inbox builder_inbox h
  _wd_fetch_origin "$BUILDER_DIR"
  head="$(git -C "$BUILDER_DIR" rev-parse origin/main 2>/dev/null || true)"
  if [[ -n "$head" ]]; then
    if [[ -z "$_wd_last_sha" ]]; then
      _wd_last_sha="$head" # baseline silently on first observation / restart
    elif [[ "$head" != "$_wd_last_sha" ]]; then
      subjects="$(git -C "$BUILDER_DIR" log --format='%s' "${_wd_last_sha}..origin/main" 2>/dev/null || true)"
      # Here-strings instead of `printf | grep -q`: with pipefail an early grep exit can
      # SIGPIPE the writer and turn a match into a miss.
      if grep -qiE '^claim' <<<"$subjects"; then
        log "handoff: new claim(...) commit on origin/main -> pinging Adversary"
        ping_session "$ADV_SESSION" "watchdog ping: the Builder pushed a gate CLAIM (claim(...) commit). Pull and verify the claimed gate now." || true
      fi
      if grep -qiE '^review' <<<"$subjects"; then
        log "handoff: new review(...) commit on origin/main -> pinging Builder"
        ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary pushed a verdict/finding (review(...) commit). Pull REVIEW and act — proceed if it PASSes your gate, address it if it's a finding." || true
      fi
      _wd_last_sha="$head"
    fi
  fi

  adv_inbox="$(_wd_show_pushed "$BUILDER_DIR" "ADVERSARY-INBOX.md")"
  builder_inbox="$(_wd_show_pushed "$BUILDER_DIR" "BUILDER-INBOX.md")"

  # INBOX side-channel (§6.1), detected on the pushed state. Receiver deletes after consuming =>
  # absent on origin/main => re-arm so the next write re-pings.
  if [[ -n "$adv_inbox" ]]; then
    h="$(printf '%s' "$adv_inbox" | md5sum | awk '{print $1}')"
    if [[ "$h" != "$_wd_adv_inbox_seen" ]]; then
      log "handoff: ADVERSARY-INBOX.md new/changed (pushed) -> pinging Adversary"
      ping_session "$ADV_SESSION" "watchdog ping: the Builder pushed machine-docs/ADVERSARY-INBOX.md — pull, read it, act, then delete the file (commit + push) to mark it consumed." || true
      _wd_adv_inbox_seen="$h"
    fi
  else
    _wd_adv_inbox_seen=""
  fi

  if [[ -n "$builder_inbox" ]]; then
    h="$(printf '%s' "$builder_inbox" | md5sum | awk '{print $1}')"
    if [[ "$h" != "$_wd_builder_inbox_seen" ]]; then
      log "handoff: BUILDER-INBOX.md new/changed (pushed) -> pinging Builder"
      ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary pushed machine-docs/BUILDER-INBOX.md — pull, read it, act, then delete the file (commit + push) to mark it consumed." || true
      _wd_builder_inbox_seen="$h"
    fi
  else
    _wd_builder_inbox_seen=""
  fi
  return 0
}
|
|
|
|
# Foreground supervision loop. Runs until the last phase is DONE, then exits the process (status 0).
# Every SIGNAL_INTERVAL seconds : handoff_check + stall_check.
# Every WATCH_INTERVAL seconds  : (heavy tick) if the current phase's status file shows DONE,
#   advance to the next phase (stop loops, persist the new index, reset handoff/idle state,
#   start loops) or, after the last phase, stop the loops, write LOG_DIR/SEQUENCE-COMPLETE and
#   exit; otherwise heal both loop sessions and the orchestrator.
watchdog_loop() {
  local idx pid status next
  idx="$(cur_idx)"
  pid="$(phase_id "$idx")"
  log "watchdog up (phase=$pid [$((idx+1))/${#PHASES[@]}], seq='$(all_ids)', signal=${SIGNAL_INTERVAL}s, heavy=${WATCH_INTERVAL}s)"
  # Start "due" so the first iteration performs a heavy tick immediately.
  local elapsed="$WATCH_INTERVAL"
  while true; do
    handoff_check
    stall_check
    if (( elapsed >= WATCH_INTERVAL )); then
      elapsed=0
      idx="$(cur_idx)"
      pid="$(phase_id "$idx")"
      status="$(phase_status "$idx")"
      if phase_done "$status"; then
        next=$((idx + 1))
        if (( next < ${#PHASES[@]} )); then
          log "PHASE $pid DONE (## DONE in $status) — auto-transitioning to $(phase_id "$next")."
          stop_loops
          echo "$next" > "$PHASE_IDX_FILE"
          handoff_reset
          # Clear the idle stamps of the sessions just killed: otherwise stall_check_one would
          # measure the NEW session's idle time from the OLD session's stamp and could reboot
          # it before it has produced any output.
          _wd_idle_since["$BUILDER_SESSION"]=0
          _wd_idle_since["$ADV_SESSION"]=0
          start_loops
        else
          # Name the actual first→last phase ids instead of a hard-coded range.
          log "PHASE SEQUENCE COMPLETE (last phase $pid DONE). Stopping loops — entire build ($(phase_id 0)→$pid) finished."
          stop_loops
          printf 'cc-ci phase sequence complete %(%F %T)T. Phases: %s. Loops stopped; entire build finished.\n' -1 "$(all_ids)" > "$LOG_DIR/SEQUENCE-COMPLETE"
          log "watchdog exiting."
          exit 0
        fi
      else
        heal_session builder "$BUILDER_SESSION" "$BUILDER_DIR"
        heal_session adversary "$ADV_SESSION" "$ADV_DIR"
        heal_orchestrator
      fi
    fi
    sleep "$SIGNAL_INTERVAL"
    elapsed=$(( elapsed + SIGNAL_INTERVAL ))
  done
}
|
|
|
|
# Start the watchdog in its own detached tmux session (cwd = PLAN_DIR) unless it already runs.
# The session re-invokes this script (SELF) with the `watchdog` subcommand and appends all of
# its stdout/stderr to LOG_DIR/watchdog.log.
start_watchdog() {
  if session_alive "$WATCHDOG_SESSION"; then
    log "watchdog already running"
    return 0
  fi
  log "starting watchdog"
  tmux new-session -d -s "$WATCHDOG_SESSION" -c "$PLAN_DIR" \
    "exec >>'$LOG_DIR/watchdog.log' 2>&1; '$SELF' watchdog"
}
|
|
|
|
# Print a status summary: current phase (id, position, plan, status file), RUNNING/stopped for
# the Builder, Adversary and watchdog sessions, whether the current phase is DONE, and the
# SEQUENCE-COMPLETE note when that file exists.
# Always returns 0: the original ended in `[[ -f … ]] && echo …`, which returned 1 whenever the
# file was absent and so made `launch.sh status` exit non-zero under `set -e`.
cmd_status() {
  local idx pid s
  idx="$(cur_idx)"
  pid="$(phase_id "$idx")"
  echo " phase: $pid [$((idx+1))/${#PHASES[@]}] plan=$(phase_plan "$idx") status=$(phase_status "$idx")"
  for s in "$BUILDER_SESSION" "$ADV_SESSION" "$WATCHDOG_SESSION"; do
    if session_alive "$s"; then
      echo " $s: RUNNING"
    else
      echo " $s: stopped"
    fi
  done
  if phase_done "$(phase_status "$idx")"; then
    echo " phase $pid: ## DONE"
  else
    echo " phase $pid: in progress"
  fi
  if [[ -f "$LOG_DIR/SEQUENCE-COMPLETE" ]]; then
    echo " >>> $(cat "$LOG_DIR/SEQUENCE-COMPLETE")"
  fi
  return 0
}
|
|
|
|
# ----- subcommand dispatch --------------------------------------------------
# $1 selects the action; anything else (including no argument) prints the usage text.
case "${1:-}" in
  start)
    preflight
    # Fresh sequence: stop any running loops, reset to phase 0 (unless RESUME_PHASE=1 keeps the idx).
    stop_loops
    if [[ "${RESUME_PHASE:-}" != "1" ]]; then echo 0 > "$PHASE_IDX_FILE"; fi
    rm -f "$LOG_DIR/SEQUENCE-COMPLETE"
    start_loops
    start_watchdog
    log "started at phase $(phase_id "$(cur_idx)"). status: ./launch.sh status | attach: tmux attach -t $BUILDER_SESSION"
    ;;
  watchdog)
    preflight
    watchdog_loop
    ;;
  status)
    cmd_status
    ;;
  logs)
    # $2 picks which log to follow; tail -f runs until interrupted.
    case "${2:-}" in
      builder) tail -f "$LOG_DIR/$BUILDER_SESSION.log" ;;
      adversary) tail -f "$LOG_DIR/$ADV_SESSION.log" ;;
      watchdog) tail -f "$LOG_DIR/watchdog.log" ;;
      *) die "usage: $0 logs builder|adversary|watchdog" ;;
    esac
    ;;
  stop)
    stop_loops
    if session_alive "$WATCHDOG_SESSION"; then
      log "killing $WATCHDOG_SESSION"
      tmux kill-session -t "$WATCHDOG_SESSION" || true
    fi
    log "stopped."
    ;;
  *)
    cat <<EOF
cc-ci loop launcher (phase-aware)

$0 start start the phase sequence at phase 0 + watchdog (stops any running loops first)
$0 status show phase + session + DONE state
$0 logs builder|adversary|watchdog tail a log
$0 stop stop both loops + watchdog
$0 watchdog run supervision loop in foreground

Phase sequence (auto-transition on per-phase ## DONE; STOP after the last = manual gate):
$(all_ids)
Env: CLAUDE_BIN=$CLAUDE_BIN REMOTE_CONTROL=$REMOTE_CONTROL WATCH_INTERVAL=${WATCH_INTERVAL}s SIGNAL_INTERVAL=${SIGNAL_INTERVAL}s
PHASES_SPEC='$PHASES_SPEC'
RESUME_PHASE=1 to keep the current phase index instead of resetting to 0.
EOF
    ;;
esac
|