#!/usr/bin/env python3
"""
cc-ci weekly-run SUPERVISOR — hourly glm-5.2 orchestrator wake-up.

Fired hourly by a systemd timer. It is a CHEAP deterministic GATE first: if this week's
/upgrade-all run is already complete, or is actively progressing, it does NOTHING and spends ZERO
model tokens. Only when the run has STALLED or died before completing — e.g. the host disk-full
crash on 2026-07-03 that the log-idle/429 watchdog does NOT cover — does it launch a short-lived
glm-5.2 opencode agent that DIAGNOSES the blockage (disk, wedged deploy, dead session, a stuck
recipe) and DRIVES the run to completion (resume the upgrader, ensure the summary + public report
land). One-shot per fire; the next hour re-checks and no-ops if healthy.

This is the intelligent complement to launch-upgrader.py's watchdog: the watchdog only handles
opencode-go usage-limit (429) stalls (wait-out + `--continue`); the supervisor handles everything
else that can wedge a weekly run, using a real model instead of a fixed heuristic.

Usage:
  launch-supervisor.py [check]   default — the timer entrypoint (gate; may spawn the agent)
  launch-supervisor.py force     skip the gate; always launch the supervisor agent
  launch-supervisor.py status    show what the gate currently sees
  launch-supervisor.py stop      kill the supervisor agent session
"""
import os
import re
import sys
import time
import subprocess
import importlib.util
from datetime import datetime
from pathlib import Path

# ── reuse launch-upgrader's server/session/completion helpers (single source of truth) ──────────
_HERE = os.path.dirname(os.path.realpath(__file__))
# Must be set BEFORE launch-upgrader.py is executed below, so the value is already in the
# environment when that module runs.
os.environ.setdefault("UPGRADER_SESSION", "cc-ci-upgrader")  # the run we supervise
# The file name contains a hyphen, so it cannot be imported with a normal `import` statement;
# load it by path instead.
_spec = importlib.util.spec_from_file_location("launch_upgrader", os.path.join(_HERE, "launch-upgrader.py"))
if _spec is None or _spec.loader is None:
    raise ImportError(f"cannot load launch-upgrader.py from {_HERE}")
lu = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(lu)

# ── config ──────────────────────────────────────────────────────────────────────────────────────
SUP_SESSION = os.environ.get("SUPERVISOR_SESSION", "cc-ci-supervisor")
WORKDIR = os.environ.get("UPGRADER_DIR", "/srv/cc-ci")
LOG_DIR = os.environ.get("LOG_DIR", "/srv/cc-ci/.cc-ci-logs")
MODEL = os.environ.get("SUPERVISOR_MODEL", "opencode-go/glm-5.2")
OPENCODE_BIN = lu.OPENCODE_BIN
OPENCODE_SERVER = lu.OPENCODE_SERVER
OPENCODE_SHARE = os.environ.get("OPENCODE_SHARE", "1") == "1"
# Don't auto-resurrect a run whose session is older than this — a genuinely abandoned run should not
# be dragged back to life days later; the operator will look. Covers the Thu-night → weekend window.
WINDOW_HOURS = float(os.environ.get("SUPERVISOR_WINDOW_HOURS", "96"))  # hours

# Pane markers that mean the supervisor agent is still working (spinner frames / tool activity).
_BUSY_RE = re.compile(r"esc to interrupt|⠋|⠙|⠹|⠸|⠼|⠴|⠦|⠧|⠇|⠏|Running tool")


def log(m):
    """Print *m* to stdout with a ``[supervisor HH:MM:SS]`` prefix, flushing immediately."""
    print(f"[supervisor {datetime.now():%H:%M:%S}] {m}", flush=True)


def _sh(c):
    """Run the argv list *c* without a shell and return the ``CompletedProcess``.

    stdout/stderr are captured as text. Output is decoded as UTF-8 with undecodable bytes
    replaced, so a garbled tmux pane (or a non-UTF-8 locale under systemd) cannot raise
    ``UnicodeDecodeError`` and abort the hourly check.
    """
    return subprocess.run(c, capture_output=True, text=True, encoding="utf-8", errors="replace")


# ── gate helpers ────────────────────────────────────────────────────────────────────────────────
def _session_created_ms(sid):
    """Return the ``time.created`` value of session *sid* from the server's ``/session`` list.

    Returns ``None`` when the server returns nothing, the session is not listed, or the entry has
    no creation time. The caller treats the value as epoch milliseconds.
    """
    rows = lu._server_get("/session") or []
    if isinstance(rows, dict):
        # Some responses wrap the list in {"data": [...]}; tolerate a null/missing "data".
        rows = rows.get("data") or []
    elif not isinstance(rows, list):
        rows = []
    for s in rows:
        if isinstance(s, dict) and s.get("id") == sid:
            return (s.get("time") or {}).get("created")
    return None


def _sup_alive():
    """Return True when the supervisor tmux session exists."""
    return _sh(["tmux", "has-session", "-t", SUP_SESSION]).returncode == 0


def _sup_busy():
    """Return True when the supervisor's tmux pane shows a spinner / tool-activity marker.

    Returns False when the pane cannot be captured (e.g. the session does not exist).
    """
    r = _sh(["tmux", "capture-pane", "-pt", SUP_SESSION])
    if r.returncode != 0:
        return False
    return bool(_BUSY_RE.search(r.stdout))


def _sup_kill():
    """Kill the supervisor tmux session; the result is ignored (best-effort)."""
    _sh(["tmux", "kill-session", "-t", SUP_SESSION])


# ── the supervisor agent ─────────────────────────────────────────────────────────────────────────
def build_kickoff(sid, reason):
    """Return the kickoff prompt handed to the supervisor agent.

    *sid* is the upgrader opencode session id the agent should inspect/resume and *reason* is the
    gate's explanation of why the run is considered stalled; both are interpolated verbatim.
    """
    return f"""\
*** cc-ci WEEKLY-RUN SUPERVISOR — one-shot, glm-5.2 ***

You are the hourly SUPERVISOR for the weekly cc-ci /upgrade-all run. A gate has determined the run is
INCOMPLETE and not currently progressing ({reason}). Your job: get this week's run to a clean DONE —
published report + summary — then STOP. You are NOT a perpetual loop.

Your cwd is {WORKDIR}; reach the CI server with `ssh cc-ci`; creds in {WORKDIR}/.testenv; skills in
{WORKDIR}/.claude/skills/. The stalled upgrader opencode session is {sid} (title "cc-ci-upgrader").

DO THIS, in order — stop as soon as the run is healthy again:

1. ENVIRONMENT FIRST. Check the CI server disk: `ssh cc-ci 'df -h / | tail -1'`. If root is > 85%
   used, reclaim STALE images (unused AND older than a week, so this week's are kept):
   `ssh cc-ci 'docker image prune -af --filter until=168h 2>&1 | tail -1; df -h / | tail -1'`.
   Also glance for other infra wedges (a hung deploy, proxy VIP exhaustion — see upgrade-all §0).

2. ASSESS the run. Read the upgrader session's recent output (opencode server {OPENCODE_SERVER},
   `GET /session/{sid}/message`) and the open recipe PRs to see which enrolled recipes already have
   a PR this week and which remain. Do NOT redo any recipe that already has a PR.

3. DRIVE TO COMPLETION. Prefer to RESUME the existing run (context preserved) once the environment
   is healthy: `python3 {WORKDIR}/cc-ci-plan/launch-upgrader.py resume`. Then CONFIRM it actually
   restarted and is progressing (a fresh `opencode run … -s {sid} --continue` proc + the session
   advancing). If the session is truly gone/unresumable, drive the remaining recipes yourself the
   /upgrade-all way (per-recipe /recipe-upgrade DEFAULT-mode subagents, !testme verify), then make
   sure the weekly summary is written to {WORKDIR}/.cc-ci-logs/upgrades/ and launch the public
   report: `python3 {WORKDIR}/cc-ci-plan/launch-report.py fresh`.

4. If on inspection the run is actually FINE (progressing) or already COMPLETE, do NOTHING.

5. Print `SUPERVISOR DONE` and go idle. Do NOT loop.

GUARDRAILS: NEVER merge a PR. NEVER weaken a test. DEFAULT mode only. Single-writer on the shared
Swarm — don't pile concurrent deploys past DRONE_RUNNER_CAPACITY. Handing back to the resumed run is
preferred over doing the recipe work yourself — avoid two writers at once.
"""


def spawn_supervisor(sid, reason):
    """Launch the one-shot supervisor agent in a detached tmux session.

    Any existing supervisor session is killed first. The kickoff prompt is written to a file under
    LOG_DIR and passed to ``opencode run`` via ``$(cat …)``; the pane output is appended to
    ``LOG_DIR/<SUP_SESSION>.log``. A failed ``tmux new-session`` is logged instead of being
    reported as a successful launch. Returns None.
    """
    import shlex  # local import: only this function builds shell command strings

    log_dir = Path(LOG_DIR)
    log_dir.mkdir(parents=True, exist_ok=True)
    if _sup_alive():
        _sup_kill()
        time.sleep(1)  # give tmux a moment to release the session name

    kf = log_dir / f".kickoff-{SUP_SESSION}.txt"
    # The prompt contains non-ASCII characters; never depend on the locale's default encoding.
    kf.write_text(build_kickoff(sid, reason), encoding="utf-8")

    q = shlex.quote
    share = "--share " if OPENCODE_SHARE else ""
    # OPENCODE_BIN is deliberately left unquoted, as before, in case it is a multi-word command.
    # Everything else is shell-quoted so paths or names with spaces/quotes cannot break the line.
    cmd = (
        f"set -a; . {q(WORKDIR + '/.testenv')}; set +a; "
        f"{OPENCODE_BIN} run --model {q(MODEL)} {share}"
        f"--attach {q(OPENCODE_SERVER)} --title {q(SUP_SESSION)} --dir {q(WORKDIR)} "
        f"\"$(cat {q(str(kf))})\""
    )
    started = _sh(["tmux", "new-session", "-d", "-s", SUP_SESSION, "-c", WORKDIR, cmd])
    if started.returncode != 0:
        detail = started.stderr.strip() or f"rc={started.returncode}"
        log(f"FAILED to launch supervisor (tmux '{SUP_SESSION}'): {detail}")
        return
    pane_log = q(f"{LOG_DIR}/{SUP_SESSION}.log")
    _sh(["tmux", "pipe-pane", "-o", "-t", SUP_SESSION, f"cat >> {pane_log}"])
    log(f"launched glm-5.2 supervisor (tmux '{SUP_SESSION}', model={MODEL}) — {reason}")


# ── gate ─────────────────────────────────────────────────────────────────────────────────────────
def _gate():
    """Return (should_spawn, sid, reason). Cheap — no model tokens.

    ``should_spawn`` is True only when an upgrader session exists, the run is not complete, the
    session is within WINDOW_HOURS (or its age is unknown), no run process / recent log activity is
    seen, the per-run watchdog is gone, and no supervisor agent is already busy.
    """
    sid = lu._session_id()
    if not sid:
        return False, None, "no cc-ci-upgrader session exists — nothing to supervise"
    if lu._completed():
        return False, sid, "weekly run COMPLETE (DONE marker present) — nothing to do"

    created = _session_created_ms(sid)
    # A missing/zero/non-numeric creation time means "age unknown": the window check is skipped.
    age_known = isinstance(created, (int, float)) and bool(created)
    age_h = (time.time() * 1000 - created) / 3.6e6 if age_known else 0.0
    if age_known and age_h > WINDOW_HOURS:
        return False, sid, f"incomplete run is {age_h:.0f}h old (> {WINDOW_HOURS:.0f}h window) — not auto-resurrecting"

    # "Progressing" for an opencode run is NOT session_busy() (its pane regex is claude-tuned and
    # misreads a headless `opencode run` as idle). Trust the run PROCESS + the session log's mtime:
    # a live `opencode run … -s --attach` proc, or a log touched within the stall window.
    pids = lu._run_pids(sid)
    idle = lu._log_idle_min()
    if pids or (idle is not None and idle < lu.STALL_MIN):
        via = f"{len(pids)} live run proc(s)" if pids else f"log idle {idle:.0f}m < {lu.STALL_MIN:.0f}m"
        return False, sid, f"upgrader run progressing ({via}) — leaving it"

    # The per-run watchdog owns PROMPT recovery (resume on proc-death/stall) and is the single writer
    # while it lives. Defer to it — it gives up (exits its tmux) only after MAX_RESUMES fail, i.e. the
    # run is stuck in a way a bare resume can't fix (e.g. disk-full). THEN the supervisor takes over.
    if lu._watchdog_alive():
        return False, sid, "per-run watchdog alive — it owns recovery; supervisor stays back"
    if _sup_alive() and _sup_busy():
        return False, sid, "a supervisor agent is already working — skip"

    idle_s = f"{idle:.0f}m" if idle is not None else "unknown"
    # Report "unknown" rather than a misleading "0h" when the creation time was not available.
    age_s = f"{age_h:.0f}h" if age_known else "unknown"
    return True, sid, f"run INCOMPLETE + not progressing (log idle {idle_s}, age {age_s})"


def check(force=False):
    """Timer entrypoint: run the gate and spawn the supervisor agent if it says so.

    With ``force=True`` the gate is skipped and the agent is always launched. If no upgrader
    session can be found in that case, this is logged and stated in the reason given to the agent.
    """
    if force:
        sid = lu._session_id()
        reason = "forced"
        if not sid:
            reason = "forced — no cc-ci-upgrader session found"
            log("no cc-ci-upgrader session found — launching the supervisor anyway (forced)")
        spawn_supervisor(sid, reason)
        return
    should, sid, reason = _gate()
    log(reason)
    if should:
        spawn_supervisor(sid, reason)


# ── main ─────────────────────────────────────────────────────────────────────────────────────────
def main():
    """Dispatch on the first CLI argument: check (default), force, stop or status.

    Any other argument prints the module docstring and exits with status 2.
    """
    cmd = sys.argv[1] if len(sys.argv) > 1 else "check"
    if cmd == "check":
        check()
    elif cmd == "force":
        check(force=True)
    elif cmd == "stop":
        _sup_kill()
        log(f"{SUP_SESSION} stopped")
    elif cmd == "status":
        should, sid, reason = _gate()
        log(f"gate: would {'SPAWN' if should else 'skip'} — {reason}")
        if _sup_alive():
            state = "RUNNING " + ("(busy)" if _sup_busy() else "(idle)")
        else:
            state = "stopped"
        log(f"supervisor session: {state}")
        log(f"model: {MODEL} window: {WINDOW_HOURS:.0f}h")
    else:
        print(__doc__)
        sys.exit(2)


if __name__ == "__main__":
    main()