Files
cc-ci-orchestrator/cc-ci-plan/launch-orchestrator.sh
autonomic-bot 36a6c9872a orchestrator: reboot-resilience + session auto-resume + full session plan/tooling
Reboot survival for the Pi orchestrator host:
- systemd unit cc-ci-plan/systemd/cc-ci-loops.service (installed + enabled): on boot
  records the reboot, starts loops+watchdog (RESUME_PHASE=1), and resumes the
  orchestrator session.
- reboot-log.sh: boot_id-gated reboot record -> REBOOTS.md (manual restarts don't count).
- launch-orchestrator.sh: injects an AGENTS.md startup nudge so an auto-resumed
  orchestrator announces itself (PushNotification) + reports reboots.
- AGENTS.md: on-startup notify routine documented.

Plans/tooling accumulated this session:
- plan-phase1d (generic suite), 1e (harness corrections), phase4 (final review),
  sso-dep-testing, orchestrator-migration (parked), test-e2e-testme-acceptance.
- launch.sh: 1d/1e/2/2b/3/4 phase sequence, machine-docs-aware state resolution,
  limit-stall re-nudge, INBOX side-channel detection.
- plan.md §6.1/§7: artifact-layer isolation, INBOX, 5-min long-run polling, DEFERRED.
- prompts: isolation discipline + INBOX + pacing.
- .gitignore: harden (.sops/, cc-ci-secrets/, .claude/, *.tmp.*).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-28 20:28:10 +01:00

118 lines
6.1 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# launch-orchestrator.sh — start/resume the cc-ci ORCHESTRATOR session in tmux under remote-control.
#
# The orchestrator (see /srv/cc-ci/AGENTS.md) is the long-lived SUPERVISORY session: it watches the
# Builder/Adversary loops, reads their logs/STATUS, edits the plan/prompts, restarts stuck loops, and
# owns the VM-level fallback. It is SEPARATE from the loops that launch.sh manages — this script only
# brings the orchestrator back (e.g. after a reboot, which kills the tmux server and every session in
# it). The conversation itself survives on disk across exits/reboots; remote-control only stays
# connected while the process is alive, so recovery = relaunch the process and re-attach by --resume.
#
# Naming: tmux session AND remote-control name are both "cc-ci-orchestrator", matching the loop
# sessions cc-ci-builder / cc-ci-adv / cc-ci-watchdog.
#
# Usage:
# ./launch-orchestrator.sh start # resume the persistent orchestrator session (DEFAULT)
# ./launch-orchestrator.sh fresh # start a NEW orchestrator session (no --resume)
# ./launch-orchestrator.sh status # show tmux + remote-control state
# ./launch-orchestrator.sh attach # tmux attach to the session (Ctrl-b d to detach)
# ./launch-orchestrator.sh stop # kill the tmux session (conversation persists on disk)
#
# The persistent session id is read from $ID_FILE (seeded on first run with DEFAULT_ID). A Claude
# session keeps the SAME id across --resume, so this stays valid across reboots. To point the script
# at a different session, edit that file or export ORCH_SESSION_ID.
# Strict mode: abort on unhandled errors, on unset variables, and on a failure anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# ----- config -------------------------------------------------------------
# Every setting below can be overridden from the environment before invoking the script.

# One name is used for both the tmux session and the remote-control session.
SESSION=${ORCH_SESSION:-cc-ci-orchestrator}
# Directory claude is started in (the orchestrator's claude project dir).
WORKDIR=${ORCH_DIR:-/srv/cc-ci}
# Executable to launch, and extra flags appended to its command line.
: "${CLAUDE_BIN:=claude}"
: "${CLAUDE_FLAGS:=--dangerously-skip-permissions}"
# REMOTE_CONTROL=1 launches with --remote-control so the session can be viewed/steered at
# claude.ai/code (the box must be logged into the claude.ai account). Use 0 for a plain local
# interactive session.
: "${REMOTE_CONTROL:=1}"
# Where the pane log and the session-id file live.
: "${LOG_DIR:=/srv/cc-ci/.cc-ci-logs}"
ID_FILE=${ORCH_ID_FILE:-$LOG_DIR/.orchestrator-session-id}
# Fallback session id: the orchestrator session as of 2026-05-28.
DEFAULT_ID=34a80a99-b37e-4809-b8da-ccc9fafe785e
# First-turn nudge handed to the (re)launched session so that an AUTO-launched orchestrator (e.g.
# started by cc-ci-loops.service ExecStartPost after a reboot) runs its AGENTS.md startup routine —
# announce itself + report reboots — rather than resuming silently and waiting for input.
# Export ORCH_STARTUP_PROMPT as an empty string to disable the nudge (note the "-" operator: only
# an UNSET variable falls back to the default text). Safest to keep the text free of single
# quotes, because start() embeds it in a quoted shell command string.
STARTUP_PROMPT="${ORCH_STARTUP_PROMPT-STARTUP (auto-launch): you are the cc-ci orchestrator, just (re)launched, likely after a reboot. Do your AGENTS.md On-startup routine NOW: read cc-ci-plan/REBOOTS.md and run cc-ci-plan/launch.sh status, then send a proactive PushNotification that you are online with the current phase and reboot count, and confirm cc-ci-loops.service brought the loops + watchdog back (relaunch with RESUME_PHASE=1 cc-ci-plan/launch.sh start if not). Then resume supervising.}"
# --------------------------------------------------------------------------
# log MESSAGE... — write one line to stdout of the form "[orchestrator HH:MM:SS] MESSAGE".
# All arguments are joined into a single message ("$*").
log() {
  local stamp
  # printf's %(fmt)T is bash's builtin strftime; the argument -1 means "current time".
  printf -v stamp '%(%H:%M:%S)T' -1
  printf '[orchestrator %s] %s\n' "$stamp" "$*"
}
# die MESSAGE... — report a fatal error and terminate the script with exit status 1.
# The message is formatted by log() (so it carries the usual "[orchestrator HH:MM:SS]" prefix)
# but is sent to stderr, keeping diagnostics out of stdout.
die() {
  log "ERROR: $*" >&2
  exit 1
}
# session_alive — succeed (status 0) iff a tmux session named exactly $SESSION exists.
# The leading "=" asks tmux for an exact-name match; without it tmux falls back to prefix and
# fnmatch matching, so an unrelated session whose name merely starts with $SESSION would be
# mistaken for the orchestrator. tmux's own error output (no server, no session) is discarded.
session_alive() {
  tmux has-session -t "=$SESSION" 2>/dev/null
}
# preflight — validate the environment before anything is launched, and prepare state files.
# Dies (via die) when tmux or the claude executable cannot be found, or when $WORKDIR is not a
# directory. Otherwise it creates $LOG_DIR and the directory that holds $ID_FILE, and seeds
# $ID_FILE with $DEFAULT_ID when the file is missing or empty.
preflight() {
  command -v tmux >/dev/null 2>&1 || die "missing dependency: tmux"
  command -v "$CLAUDE_BIN" >/dev/null 2>&1 || die "claude CLI not found (set CLAUDE_BIN)"
  [[ -d "$WORKDIR" ]] || die "workdir not found: $WORKDIR"
  # ORCH_ID_FILE may point outside LOG_DIR, so make sure BOTH directories exist before writing.
  mkdir -p -- "$LOG_DIR" "$(dirname -- "$ID_FILE")"
  # -s (exists and non-empty): an empty id file is as useless as a missing one, so re-seed it.
  if [[ ! -s "$ID_FILE" ]]; then
    printf '%s\n' "$DEFAULT_ID" > "$ID_FILE"
  fi
}
# resume_id — print the session id that "start" should pass to --resume.
# Precedence: $ORCH_SESSION_ID (if non-empty), then the first line of $ID_FILE with whitespace
# removed, then $DEFAULT_ID. A missing, unreadable, empty or whitespace-only id file therefore
# falls back to $DEFAULT_ID instead of yielding an empty id.
resume_id() {
  local id="${ORCH_SESSION_ID:-}"
  if [[ -z "$id" && -r "$ID_FILE" ]]; then
    # read returns non-zero at EOF without a trailing newline but still fills $id.
    IFS= read -r id < "$ID_FILE" || true
    id="${id//[[:space:]]/}"
  fi
  printf '%s\n' "${id:-$DEFAULT_ID}"
}
# _orch_shquote STRING — print STRING as ONE POSIX-shell word: wrapped in single quotes, with
# every embedded single quote rewritten as '\'' . Used to build the command string that tmux
# hands to a shell, so arbitrary text (including quotes) survives intact.
_orch_shquote() {
  local sq="'" esc="'\\''"
  printf "'%s'" "${1//$sq/$esc}"
}

# start [MODE] — launch claude in a detached tmux session named $SESSION, cwd $WORKDIR.
#   MODE  "resume" (default): pass --resume <id> using the id printed by resume_id.
#         anything else (e.g. "fresh"): start a new conversation, no --resume.
# Runs preflight first. If the session already exists it is left untouched and start returns 0.
# With REMOTE_CONTROL=1 the command gets --remote-control <SESSION>. $CLAUDE_FLAGS is appended
# verbatim, and a non-empty $STARTUP_PROMPT is appended as the final positional argument.
# Pane output is appended to $LOG_DIR/$SESSION.log via tmux pipe-pane; failure to set that up
# is fatal (die).
start() {
  local mode="${1:-resume}"
  preflight
  if session_alive; then
    log "$SESSION already running — leaving it (use '$0 stop' first to relaunch)"
    return 0
  fi

  # Build the command as a shell string (tmux runs a single-argument command through a shell).
  # Every value we control is quoted with _orch_shquote so quotes/spaces cannot break it apart.
  local id="" cmd
  cmd="$(_orch_shquote "$CLAUDE_BIN")"
  if [[ "$mode" == "resume" ]]; then
    id="$(resume_id)"
    if [[ -n "$id" ]]; then
      cmd+=" --resume $(_orch_shquote "$id")"
    fi
    log "starting $SESSION (resume=$id, cwd=$WORKDIR, rc=$REMOTE_CONTROL)"
  else
    log "starting $SESSION FRESH (no resume, cwd=$WORKDIR, rc=$REMOTE_CONTROL)"
  fi
  if [[ "$REMOTE_CONTROL" == "1" ]]; then
    cmd+=" --remote-control $(_orch_shquote "$SESSION")"
  fi
  # CLAUDE_FLAGS is intentionally NOT quoted: it is a flag list the shell must split into words.
  if [[ -n "$CLAUDE_FLAGS" ]]; then
    cmd+=" $CLAUDE_FLAGS"
  fi
  # Startup nudge as a POSITIONAL prompt (not stdin — stdin would force print mode and break
  # remote-control). On --resume this appends as the session's next turn, triggering the AGENTS.md
  # startup routine (announce + report reboots). Empty STARTUP_PROMPT => clean resume, no nudge.
  if [[ -n "$STARTUP_PROMPT" ]]; then
    cmd+=" $(_orch_shquote "$STARTUP_PROMPT")"
  fi

  tmux new-session -d -s "$SESSION" -c "$WORKDIR" "$cmd"
  # pipe-pane fails if the session is already gone (e.g. claude exited at once); say so clearly
  # instead of letting set -e end the script with only tmux's terse error.
  tmux pipe-pane -o -t "$SESSION" "cat >> $(_orch_shquote "$LOG_DIR/$SESSION.log")" \
    || die "could not attach pane logging for $SESSION (did '$CLAUDE_BIN' exit immediately?)"
  log "started. status: $0 status | attach: tmux attach -t $SESSION"
}
# usage — print the command summary plus the effective settings (and the environment variable
# that overrides each one) to stdout.
usage() {
  cat <<EOF
cc-ci orchestrator launcher
$0 start resume the persistent orchestrator session in tmux + remote-control (default)
$0 fresh start a NEW orchestrator session (no --resume)
$0 status show tmux + remote-control state and the resume id
$0 attach tmux attach to the session
$0 stop kill the tmux session (conversation persists on disk)
Env: ORCH_SESSION=$SESSION ORCH_DIR=$WORKDIR REMOTE_CONTROL=$REMOTE_CONTROL CLAUDE_BIN=$CLAUDE_BIN
EOF
}

# Command dispatch. With no argument the default action is "start" (resume).
case "${1:-start}" in
  start)
    start resume
    ;;
  fresh)
    start fresh
    ;;
  stop)
    if session_alive; then
      log "killing $SESSION"
      # Best effort: the session may vanish between the check and the kill.
      tmux kill-session -t "$SESSION" || true
    else
      log "$SESSION not running"
    fi
    ;;
  status)
    if session_alive; then
      log "$SESSION: RUNNING"
      # [r] keeps grep from matching its own command line; no match is not an error.
      ps -eo pid,etime,args | grep "[r]emote-control $SESSION" || true
    else
      log "$SESSION: stopped"
    fi
    # Report the id "start" would really use (resume_id honours ORCH_SESSION_ID).
    if [[ -n "${ORCH_SESSION_ID:-}" ]]; then
      log "resume id: $(resume_id) (from ORCH_SESSION_ID; file: $ID_FILE)"
    else
      log "resume id: $(resume_id) (file: $ID_FILE)"
    fi
    ;;
  attach)
    session_alive || die "$SESSION not running (start it with: $0 start)"
    exec tmux attach -t "$SESSION"
    ;;
  help | -h | --help)
    usage
    ;;
  *)
    # A mistyped command must not look like success to callers such as a systemd unit.
    printf 'unknown command: %s\n' "$1" >&2
    usage >&2
    exit 2
    ;;
esac