#!/usr/bin/env python3
"""
cc-ci upgrader launcher — one-shot weekly recipe-upgrade job agent.

The upgrader runs /upgrade-all to completion, then stops and stays idle so the run +
summary remain viewable in the web UI. The next weekly run starts a fresh session
(start clears any idle/finished session).

Usage:
  launch-upgrader.py start    use-or-create: leave an in-flight run alone, else start fresh
  launch-upgrader.py fresh    always kill any existing session and start fresh
  launch-upgrader.py stop     kill the session
  launch-upgrader.py status   show session state
  launch-upgrader.py attach   tmux attach to the session
  launch-upgrader.py resume   continue the opencode session from where it stalled
  launch-upgrader.py babysit [LOG_OFFSET]
                              watch + auto-resume the opencode run across usage-limit stalls

Env:
  LOOP_BACKEND    opencode (default) | claude — also accepts UPGRADER_BACKEND
  LOOP_MODEL      model flag (overrides UPGRADER_MODEL); default tracks backend —
                  opencode→opencode-go/glm-5.2, claude→sonnet
  UPGRADER_MODEL  provider/model for opencode, e.g. opencode-go/glm-5.2 (OpenCode Go
                  subscription) or tinfoil/deepseek-v4-pro; sonnet etc. for claude
  UPGRADER_ARGS   extra args passed to /upgrade-all (e.g. "n8n ghost", "--dry-run")
  claude backend:   CLAUDE_BIN, CLAUDE_FLAGS, REMOTE_CONTROL
  opencode backend: OPENCODE_BIN, OPENCODE_SERVER, OPENCODE_SHARE (1=attach to web server + public --share link),
                    UPGRADER_BABYSIT, UPGRADER_STALL_MIN, UPGRADER_CHECK_SEC
"""

import json
import os
import re
import shutil
import signal
import subprocess
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path

# ── config ────────────────────────────────────────────────────────────────────
SESSION = os.environ.get("UPGRADER_SESSION", "cc-ci-upgrader")
WORKDIR = os.environ.get("UPGRADER_DIR", "/srv/cc-ci")
LOG_DIR = os.environ.get("LOG_DIR", "/srv/cc-ci/.cc-ci-logs")
LOG_FILE = f"{LOG_DIR}/{SESSION}.log"
# Environment file sourced (with auto-export) before the opencode agent is launched.
TESTENV = "/srv/cc-ci/.testenv"

# LOOP_BACKEND / LOOP_MODEL take precedence (unified control from the operator).
# Default backend+model is the OpenCode Go subscription on glm-5.2; override either via env.
# The model default tracks the backend so an explicit `LOOP_BACKEND=claude` (without a model)
# still gets a sensible claude model rather than a glm id.
BACKEND = os.environ.get("LOOP_BACKEND", os.environ.get("UPGRADER_BACKEND", "opencode"))
_DEFAULT_MODEL = "opencode-go/glm-5.2" if BACKEND == "opencode" else "sonnet"
MODEL = os.environ.get("LOOP_MODEL", os.environ.get("UPGRADER_MODEL", _DEFAULT_MODEL))

CLAUDE_BIN = os.environ.get("CLAUDE_BIN", "claude")
CLAUDE_FLAGS = os.environ.get("CLAUDE_FLAGS", "--dangerously-skip-permissions")
REMOTE_CONTROL = os.environ.get("REMOTE_CONTROL", "1") == "1"

OPENCODE_BIN = os.environ.get("OPENCODE_BIN", "/home/loops/.local/bin/opencode")
OPENCODE_SERVER = os.environ.get("OPENCODE_SERVER", "http://127.0.0.1:4096")
# Web visibility for the opencode backend: attach the session to the shared opencode
# web server (viewable at http://oc.commoninternet.net, tailnet-only) AND optionally
# create a public opencode.ai --share link. Default both on so the run is monitorable.
OPENCODE_SHARE = os.environ.get("OPENCODE_SHARE", "1") == "1"

UPGRADER_ARGS = os.environ.get("UPGRADER_ARGS", "")

# ── watchdog config ───────────────────────────────────────────────────────────
STALL_MIN = float(os.environ.get("UPGRADER_STALL_MIN", "15"))   # log-idle minutes ⇒ stalled
CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180"))  # babysitter poll cadence
DONE_MARKER = "UPGRADE RUN COMPLETE"
TAIL_BYTES = 20000  # how much of the log tail is scanned for DONE_MARKER
GO_ENDPOINT = "https://opencode.ai/zen/go/v1/chat/completions"
AUTH_JSON = os.path.expanduser("~/.local/share/opencode/auth.json")


# ── helpers ───────────────────────────────────────────────────────────────────
def log(msg):
    """Print a timestamped launcher message (flushed so cron/tmux logs stay ordered)."""
    ts = datetime.now().strftime("%H:%M:%S")
    print(f"[upgrader {ts}] {msg}", flush=True)


def die(msg):
    """Log an error and exit the process with status 1."""
    log(f"ERROR: {msg}")
    sys.exit(1)


def session_alive():
    """Return True if the tmux session named SESSION exists."""
    probe = subprocess.run(["tmux", "has-session", "-t", SESSION], capture_output=True)
    return probe.returncode == 0


def session_busy():
    """True while a turn is actively in flight (not idle/finished/wedged)."""
    r = subprocess.run(["tmux", "capture-pane", "-pt", SESSION],
                       capture_output=True, text=True)
    pane = r.stdout if r.returncode == 0 else ""
    return bool(re.search(r"esc to interrupt|⠋|⠙|⠹|⠸|⠼|⠴|⠦|⠧|⠇|⠏|Running tool", pane))


def kill_session():
    """Kill the tmux session (best effort — a missing session is not an error)."""
    subprocess.run(["tmux", "kill-session", "-t", SESSION], capture_output=True)


def _require_backend():
    """Exit via die() unless BACKEND is known and its CLI binary is present."""
    if BACKEND == "claude":
        if not shutil.which(CLAUDE_BIN):
            die(f"claude CLI not found — set CLAUDE_BIN (currently: {CLAUDE_BIN})")
    elif BACKEND == "opencode":
        if not Path(OPENCODE_BIN).exists():
            die(f"opencode not found at {OPENCODE_BIN}")
    else:
        die(f"unknown LOOP_BACKEND '{BACKEND}' — use 'claude' or 'opencode'")


def _opencode_cmd(prompt_file, session_id=None):
    """Build the shell command line that runs opencode with the prompt in *prompt_file*.

    With *session_id* the existing opencode session is continued (`-s ID --continue`);
    without it a new session titled SESSION is created. The `--model` flag is emitted
    only when MODEL is non-empty, so fresh starts and resumes agree.
    """
    # NOTE: -m/--model and --attach/--title/--share are flags on the `run` SUBCOMMAND,
    # so they must come AFTER `run` (a global `opencode --model X run` is ignored).
    parts = [OPENCODE_BIN, "run"]
    if session_id:
        parts += ["-s", str(session_id), "--continue"]
    if MODEL:
        parts.append(f"--model '{MODEL}'")
    if OPENCODE_SHARE:
        parts.append("--share")
    parts.append(f"--attach '{OPENCODE_SERVER}'")
    if not session_id:
        parts.append(f"--title '{SESSION}'")
    parts.append(f"\"$(cat '{prompt_file}')\"")
    return f"set -a; . {TESTENV}; set +a; " + " ".join(parts)


def _launch_tmux(cmd):
    """Start *cmd* in a detached tmux session and pipe its pane into LOG_FILE.

    Returns True when the session was created, False when `tmux new-session` failed.
    """
    created = subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", WORKDIR, cmd])
    if created.returncode != 0:
        return False
    subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_FILE}'"])
    return True


def _log_size():
    """Current size of LOG_FILE in bytes (0 if it does not exist yet)."""
    try:
        return os.path.getsize(LOG_FILE)
    except OSError:
        return 0


# ── kickoff prompt ────────────────────────────────────────────────────────────
def build_kickoff():
    """Return the kickoff prompt handed to the agent as its first message."""
    args_note = f" with arguments: {UPGRADER_ARGS}" if UPGRADER_ARGS else ""
    return f"""\
*** cc-ci UPGRADER — weekly recipe-upgrade job ***

You are the cc-ci Upgrader: a ONE-SHOT job agent, NOT a perpetual loop. Run the recipe-upgrade
sequence to completion, then STOP. Your cwd is {WORKDIR}; reach the CI server with `ssh cc-ci`;
creds are in {WORKDIR}/.testenv; skills in {WORKDIR}/.claude/skills/.

DO THIS:
1. Invoke the /upgrade-all skill in DEFAULT mode{args_note} (read
   {WORKDIR}/.claude/skills/upgrade-all/SKILL.md for the full procedure). It surveys every
   enrolled recipe and, for each upgradeable one, runs /recipe-upgrade in DEFAULT mode —
   recipe PR only, verified by posting `!testme` on the PR (results visible in the PR,
   iterate up to 3x). A genuinely stale test gets an explanatory PR COMMENT, never a test edit.
2. Process recipes via per-recipe SUBAGENTS so your own context stays light. If your context
   usage climbs (~80%), run /compact before continuing.
3. Write + push the weekly summary (the PR list is the actionable output for the operator).
4. WHEN THE RUN IS COMPLETE: STOP. Print the final summary (lead with the PR list) and an
   `UPGRADE RUN COMPLETE` line, then go idle. Do NOT loop, do NOT re-run, and do NOT kill your
   own session — leave it up so the operator can review the output in the web UI. Next
   week's run starts a fresh session (the launcher clears this idle one).

GUARDRAILS: NEVER merge any PR. NEVER weaken a test. DEFAULT mode only — do NOT pass
--with-tests (updating cc-ci tests is the operator's per-recipe opt-in). Single-writer:
dedicated branches + separate clones, never push main, never touch the build loops'
/cc-ci /cc-ci-adv clones. The shared Swarm is stateful — go sequentially.
"""


# ── launch ────────────────────────────────────────────────────────────────────
def start(mode="use-or-create"):
    """Launch the upgrader agent in a detached tmux session.

    mode "use-or-create" leaves a session alone while a turn is in flight; any other
    mode (e.g. "fresh") always replaces it. The backend is validated BEFORE an existing
    session is killed, so a misconfigured environment cannot destroy a reviewable run.
    """
    if not shutil.which("tmux"):
        die("tmux not found")
    Path(LOG_DIR).mkdir(parents=True, exist_ok=True)

    alive = session_alive()
    if alive and mode == "use-or-create" and session_busy():
        log(f"{SESSION} already running a job (busy) — leaving it")
        return

    _require_backend()

    if alive:
        log(f"{SESSION} exists but idle/stale (or fresh requested) — killing it first")
        kill_session()
        time.sleep(1)

    kf = Path(LOG_DIR) / f".kickoff-{SESSION}.txt"
    # Explicit UTF-8: the prompt contains non-ASCII characters and cron locales vary.
    kf.write_text(build_kickoff(), encoding="utf-8")

    log(f"starting {SESSION} (backend={BACKEND}, model={MODEL}, args='{UPGRADER_ARGS or ''}')")

    if BACKEND == "claude":
        model_flag = f"--model '{MODEL}'" if MODEL else ""
        rc = f"--remote-control '{SESSION}'" if REMOTE_CONTROL else ""
        cmd = f"{CLAUDE_BIN} {rc} {model_flag} {CLAUDE_FLAGS} \"$(cat '{kf}')\""
    else:
        cmd = _opencode_cmd(kf)
        log(f"  attached to {OPENCODE_SERVER} → http://oc.commoninternet.net (tailnet only)"
            + (" +public --share link (printed in the session)" if OPENCODE_SHARE else ""))

    # The log is appended to across runs; remember where THIS run's output begins so the
    # babysitter does not mistake last week's completion marker for this run's.
    log_offset = _log_size()

    if not _launch_tmux(cmd):
        die(f"tmux new-session failed for {SESSION}")
    log(f"started. attach: tmux attach -t {SESSION} log: {LOG_DIR}/{SESSION}.log")

    # For the opencode backend, spawn a detached babysitter that auto-resumes the run if the
    # opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See babysit().
    if BACKEND == "opencode" and os.environ.get("UPGRADER_BABYSIT", "1") == "1":
        # The child inherits the descriptor; closing our copy afterwards avoids a leak.
        with open(f"{LOG_DIR}/{SESSION}-babysit.log", "a") as sink:
            subprocess.Popen(
                ["python3", os.path.realpath(__file__), "babysit", str(log_offset)],
                stdout=sink, stderr=subprocess.STDOUT, start_new_session=True)
        log("  babysitter spawned — auto-resume on usage-limit stalls")


# ── opencode stall-detect + auto-resume watchdog ──────────────────────────────
# The opencode-go subscription enforces a rolling usage-limit (HTTP 429 + retry-after). When it
# trips mid-run, the `opencode run` agent loop ENDS and does NOT self-resume. This watchdog detects
# the stall (the session log stops growing), waits out the limit, and resumes the SAME session —
# context preserved — via `opencode run -s <id> --continue`.
# Standalone: launch-upgrader.py {resume|babysit}.
def _server_get(path):
    """GET *path* from the opencode server and decode JSON; None on any failure."""
    try:
        with urllib.request.urlopen(OPENCODE_SERVER + path, timeout=15) as r:
            return json.load(r)
    except Exception:
        # Deliberate best-effort: an unreachable server or malformed reply means "unknown".
        return None


def _session_id():
    """Newest top-level opencode session titled like SESSION (the run we manage)."""
    rows = _server_get("/session") or []
    if isinstance(rows, dict):
        rows = rows.get("data", [])
    if not isinstance(rows, list):
        rows = []
    cands = [s for s in rows
             if isinstance(s, dict)
             and s.get("title") == SESSION
             and not (s.get("parentID") or s.get("parentId"))]
    if not cands:
        return None
    newest = max(cands, key=lambda s: (s.get("time") or {}).get("created") or 0)
    return newest.get("id")


def _log_idle_min():
    """Minutes since LOG_FILE was last modified, or None if it cannot be stat'ed."""
    try:
        return (time.time() - os.path.getmtime(LOG_FILE)) / 60.0
    except OSError:
        return None


def _go_key():
    """The opencode-go API key from AUTH_JSON, or None if unavailable."""
    try:
        with open(AUTH_JSON, encoding="utf-8") as fh:
            return (json.load(fh).get("opencode-go") or {}).get("key")
    except (OSError, ValueError, AttributeError):
        return None


def _limit_retry_after():
    """0 if the opencode-go endpoint is available (HTTP 200); else the 429 retry-after seconds.

    Any outcome other than an explicit 429 (no key, network error, other HTTP status)
    is reported as 0, i.e. "not known to be limited".
    """
    key = _go_key()
    if not key:
        return 0
    body = json.dumps({
        "model": (MODEL or "").split("/")[-1] or "glm-5.2",
        "max_tokens": 8,
        "messages": [{"role": "user", "content": "hi"}],
    }).encode()
    req = urllib.request.Request(
        GO_ENDPOINT, data=body, method="POST",
        headers={"Authorization": "Bearer " + key, "content-type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=20) as resp:
            resp.read()
        return 0
    except urllib.error.HTTPError as e:
        if e.code != 429:
            return 0
        try:
            return max(1, int(e.headers.get("retry-after", "300")))
        except (AttributeError, TypeError, ValueError):
            # Missing or non-numeric (e.g. HTTP-date) header: fall back to five minutes.
            return 300
    except Exception:
        return 0


def _is_managed_run(argv, sid=None):
    """True if *argv* (list of bytes) is an `opencode run --attach` process of OUR session.

    Ownership is recognised by an argument equal to SESSION (the `--title` of a fresh
    start) or to *sid* (the `-s` id of a resumed run). Other agents' `opencode run`
    processes on the same host are deliberately not matched.
    """
    if not (b"opencode" in b" ".join(argv) and b"run" in argv and b"--attach" in argv):
        return False
    owned = {SESSION.encode()}
    if sid:
        owned.add(str(sid).encode())
    return any(tok in owned for tok in argv)


def _run_pids(sid=None):
    """PIDs of live `opencode run` procs belonging to this session (via /proc scan).

    Never matches this process. Returns [] where /proc is unavailable.
    """
    me, found = os.getpid(), []
    try:
        entries = os.listdir("/proc")
    except OSError:
        return found
    for entry in entries:
        if not entry.isdigit() or int(entry) == me:
            continue
        try:
            with open(f"/proc/{entry}/cmdline", "rb") as fh:
                argv = fh.read().split(b"\0")
        except OSError:
            continue  # process exited or is not readable
        if _is_managed_run(argv, sid):
            found.append(int(entry))
    return found


def _completed(since=0):
    """True if DONE_MARKER appears in the log tail written at or after byte *since*.

    Only the last TAIL_BYTES of the log are scanned. *since* lets the caller ignore
    output of earlier runs (the log is append-only across runs); if the log is now
    shorter than *since* it was truncated or rotated and the offset is ignored.
    """
    try:
        # Binary mode: arbitrary byte offsets are only well-defined for binary files.
        with open(LOG_FILE, "rb") as fh:
            size = fh.seek(0, os.SEEK_END)
            if since > size:
                since = 0
            fh.seek(max(since, size - TAIL_BYTES, 0))
            return DONE_MARKER.encode() in fh.read()
    except OSError:
        return False


def resume(reason="manual"):
    """Resume the managed opencode session from where it stopped (context preserved).

    Returns True when the session was relaunched, False when there is no session to
    resume or tmux could not start it.
    """
    sid = _session_id()
    if not sid:
        log(f"resume: no top-level '{SESSION}' session on {OPENCODE_SERVER} — cannot resume")
        return False
    log(f"resume ({reason}): continuing session {sid}")

    # Stop whatever is still attached to this session before relaunching it.
    for pid in _run_pids(sid):
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError:
            pass  # already gone, or not ours to signal
    time.sleep(2)
    kill_session()
    time.sleep(1)

    Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
    kf = Path(LOG_DIR) / f".kickoff-{SESSION}-resume.txt"
    kf.write_text(
        "The opencode-go usage limit has reset (or the run stalled). You were mid-way through the weekly "
        "cc-ci /upgrade-all run. CONTINUE from where you left off — do NOT start over. Process the enrolled "
        "recipes not yet done this week, alphabetically; SKIP ones already done (their PRs exist — extend, "
        "never duplicate). Per recipe: run /recipe-upgrade in DEFAULT mode via a subagent, verify with "
        "!testme, open/extend the recipe PR (NEVER merge, NEVER weaken a test), <= DRONE_RUNNER_CAPACITY "
        "concurrent. immich has a tag+digest image abra can't parse — do the upstream-direct cross-check "
        "(recipe-upgrade SKILL §1), don't silently skip it. When all remaining recipes are done: "
        "write+push the weekly summary, then `python3 /srv/cc-ci/cc-ci-plan/launch-report.py fresh`, print "
        "'" + DONE_MARKER + "', and go idle.",
        encoding="utf-8")

    if not _launch_tmux(_opencode_cmd(kf, sid)):
        log(f"resume: tmux new-session failed for {SESSION}")
        return False
    log(f"resume: relaunched {SESSION} (session {sid})")
    return True


def babysit(since=0):
    """Watch the opencode upgrader; on a stall, wait out any usage-limit then resume the session.

    Exits when the run prints UPGRADE RUN COMPLETE. Spawned by an opencode `start`; also
    standalone. *since* is the log byte offset where the current run began; completion
    markers before it are ignored (0 scans the plain log tail).
    """
    log(f"babysit: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
    misses = 0
    while True:
        time.sleep(CHECK_EVERY)
        if _completed(since):
            log("babysit: run completed — exiting")
            return
        idle = _log_idle_min()
        if idle is None or idle <= STALL_MIN:
            misses = 0
            continue
        # Log has been static > STALL_MIN — the run is stalled (limit or wedge), not working.
        retry = _limit_retry_after()
        if retry > 0:
            wait = min(retry + 30, 3600)
            log(f"babysit: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s")
            time.sleep(wait)
            continue
        # Not limited but stalled — confirm it's really wedged (two consecutive misses), then resume.
        misses += 1
        if misses >= 2:
            log(f"babysit: stalled {idle:.0f}min, limit clear — auto-resuming")
            resume("babysit auto-resume")
            misses = 0


# ── main ──────────────────────────────────────────────────────────────────────
def _print_remote_control_procs():
    """Print `ps` rows of processes started with `--remote-control SESSION` (best effort)."""
    try:
        ps = subprocess.run(["ps", "-eo", "pid,etime,args"], capture_output=True, text=True)
    except OSError:
        return
    needle = f"remote-control {SESSION}"
    for row in ps.stdout.splitlines():
        if needle in row:
            print(row, flush=True)


def main():
    """Dispatch on the first CLI argument (default: start)."""
    cmd = sys.argv[1] if len(sys.argv) > 1 else "start"

    if cmd == "start":
        start("use-or-create")
    elif cmd == "fresh":
        start("fresh")
    elif cmd == "stop":
        if session_alive():
            log(f"killing {SESSION}")
            kill_session()
        else:
            log(f"{SESSION} not running")
    elif cmd == "status":
        if session_alive():
            busy = "busy" if session_busy() else "idle/finishing"
            log(f"{SESSION}: RUNNING ({busy})")
            _print_remote_control_procs()
        else:
            log(f"{SESSION}: stopped")
        log(f"backend: {BACKEND} model: {MODEL} args: '{UPGRADER_ARGS or ''}'")
    elif cmd == "attach":
        os.execvp("tmux", ["tmux", "attach", "-t", SESSION])
    elif cmd == "resume":
        resume("manual")
    elif cmd == "babysit":
        # Optional second argument: log offset of the current run (passed by `start`).
        offset = sys.argv[2] if len(sys.argv) > 2 else ""
        babysit(int(offset) if offset.isdigit() else 0)
    else:
        print(f"""cc-ci upgrader launcher — one-shot weekly recipe-upgrade job

  launch-upgrader.py start    use-or-create (leave busy run alone, else start fresh)
  launch-upgrader.py fresh    always kill existing + start fresh
  launch-upgrader.py stop     kill the session
  launch-upgrader.py status   show session state
  launch-upgrader.py attach   tmux attach
  launch-upgrader.py resume   continue the opencode session from where it stalled (-s --continue)
  launch-upgrader.py babysit  watch + auto-resume the opencode run across usage-limit (429) stalls

Backend: {BACKEND} (LOOP_BACKEND or UPGRADER_BACKEND env var)
Model:   {MODEL} (LOOP_MODEL or UPGRADER_MODEL env var)
Args:    {UPGRADER_ARGS or ''} (UPGRADER_ARGS env var, passed to /upgrade-all)
claude:   viewable at claude.ai/code
opencode: viewable at http://oc.commoninternet.net server={OPENCODE_SERVER}
""")


if __name__ == "__main__":
    main()