cc-ci-orchestrator/cc-ci-plan/launch-upgrader.py

#!/usr/bin/env python3
"""
cc-ci upgrader launcher — one-shot weekly recipe-upgrade job agent.

The upgrader runs /upgrade-all to completion, then stops and stays idle so the
run + summary remain viewable in the web UI. The next weekly run starts a fresh
session (start clears any idle/finished session).

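Usage:
  launch-upgrader.py start    use-or-create: leave an in-flight run alone, else start fresh
  launch-upgrader.py fresh    always kill any existing session and start fresh
  launch-upgrader.py stop     kill the session
  launch-upgrader.py status   show session state
  launch-upgrader.py attach   tmux attach to the session
  launch-upgrader.py resume   continue the stalled opencode session (opencode run -s <id> --continue)
  launch-upgrader.py babysit  watch the opencode run and auto-resume it after usage-limit (429) stalls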

Env:
  LOOP_BACKEND     opencode (default) | claude   — also accepts UPGRADER_BACKEND
  LOOP_MODEL       model flag (overrides UPGRADER_MODEL); default tracks backend —
                   opencode→opencode-go/glm-5.2, claude→sonnet
  UPGRADER_MODEL   provider/model for opencode, e.g. opencode-go/glm-5.2 (OpenCode Go
                   subscription) or tinfoil/deepseek-v4-pro; sonnet etc. for claude
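  UPGRADER_ARGS    extra args passed to /upgrade-all (e.g. "n8n ghost", "--dry-run")
  UPGRADER_SESSION tmux session name, also the opencode session title (default cc-ci-upgrader)
  UPGRADER_DIR     working directory the agent is started in (default /srv/cc-ci)
  LOG_DIR          directory for kickoff prompts and logs (default /srv/cc-ci/.cc-ci-logs)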

  claude backend:
    CLAUDE_BIN, CLAUDE_FLAGS, REMOTE_CONTROL
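  opencode backend:
    OPENCODE_BIN, OPENCODE_SERVER (the run is always attached to this web server),
    OPENCODE_SHARE (1 = additionally create a public --share link; default 1),
    UPGRADER_BABYSIT (1 = spawn the auto-resume babysitter on start; default 1),
    UPGRADER_STALL_MIN (log-idle minutes that count as a stall; default 15),
    UPGRADER_CHECK_SEC (babysitter poll interval in seconds; default 180)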
"""

import os, sys, subprocess, re
from datetime import datetime
from pathlib import Path

# ── config ────────────────────────────────────────────────────────────────────

_getenv = os.environ.get

# tmux session name; also used as the opencode session title and in log/kickoff file names.
SESSION = _getenv("UPGRADER_SESSION", "cc-ci-upgrader")
# Directory the tmux session (and therefore the agent) is started in.
WORKDIR = _getenv("UPGRADER_DIR", "/srv/cc-ci")
# Directory holding the kickoff prompt files and the session / babysitter logs.
LOG_DIR = _getenv("LOG_DIR", "/srv/cc-ci/.cc-ci-logs")

# Backend + model selection. LOOP_BACKEND / LOOP_MODEL win over the UPGRADER_* variants
# (unified control from the operator). The default model follows the chosen backend, so
# `LOOP_BACKEND=claude` without an explicit model gets a claude model, not a glm id.
_upgrader_backend = _getenv("UPGRADER_BACKEND", "opencode")
BACKEND = _getenv("LOOP_BACKEND", _upgrader_backend)
_DEFAULT_MODEL = "sonnet" if BACKEND != "opencode" else "opencode-go/glm-5.2"
_upgrader_model = _getenv("UPGRADER_MODEL", _DEFAULT_MODEL)
MODEL = _getenv("LOOP_MODEL", _upgrader_model)

# claude backend: CLI binary, extra flags (inserted verbatim into the command line),
# and whether to pass --remote-control (only the exact value "1" enables it).
CLAUDE_BIN = _getenv("CLAUDE_BIN", "claude")
CLAUDE_FLAGS = _getenv("CLAUDE_FLAGS", "--dangerously-skip-permissions")
REMOTE_CONTROL = _getenv("REMOTE_CONTROL", "1") == "1"

# opencode backend: CLI binary and the shared opencode web server the run attaches to.
OPENCODE_BIN = _getenv("OPENCODE_BIN", "/home/loops/.local/bin/opencode")
OPENCODE_SERVER = _getenv("OPENCODE_SERVER", "http://127.0.0.1:4096")
# Web visibility for the opencode backend: the session is always attached to the shared
# opencode web server (viewable at http://oc.commoninternet.net, tailnet-only).
# OPENCODE_SHARE=1 (the default) additionally requests a public opencode.ai --share link.
OPENCODE_SHARE = _getenv("OPENCODE_SHARE", "1") == "1"

# Extra arguments mentioned to the agent for /upgrade-all; empty means none.
UPGRADER_ARGS = _getenv("UPGRADER_ARGS", "")

# ── helpers ───────────────────────────────────────────────────────────────────

def log(msg):
    """Print *msg* to stdout prefixed with ``[upgrader HH:MM:SS]`` (local time), flushed at once."""
    stamp = f"{datetime.now():%H:%M:%S}"
    print(f"[upgrader {stamp}] {msg}", flush=True)

def die(msg):
    """Report *msg* through log() with an ``ERROR:`` prefix, then exit with status 1."""
    log(f"ERROR: {msg}")
    raise SystemExit(1)

def session_alive():
    """Return True when ``tmux has-session -t SESSION`` exits with status 0."""
    probe = subprocess.run(["tmux", "has-session", "-t", SESSION], capture_output=True)
    return probe.returncode == 0

def session_busy():
    """Return True while the SESSION pane shows signs of a turn in flight.

    Captures the visible tmux pane and looks for an "esc to interrupt" hint, a
    braille spinner glyph, or "Running tool". A failed capture counts as not busy.
    """
    # NOTE(review): these look like interactive-TUI markers — confirm the non-interactive
    # `opencode run` output also shows them, otherwise a live run reads as idle.
    captured = subprocess.run(
        ["tmux", "capture-pane", "-pt", SESSION], capture_output=True, text=True
    )
    if captured.returncode != 0:
        return False
    hit = re.search(r"esc to interrupt|⠋|⠙|⠹|⠸|⠼|⠴|⠦|⠧|⠇|⠏|Running tool", captured.stdout)
    return hit is not None

def kill_session():
    """Ask tmux to kill SESSION; tmux output is captured and its exit status ignored."""
    tmux_kill = ["tmux", "kill-session", "-t", SESSION]
    subprocess.run(tmux_kill, capture_output=True)

# ── kickoff prompt ────────────────────────────────────────────────────────────

def build_kickoff():
    """Return the kickoff prompt handed to the agent.

    WORKDIR is interpolated into the path references; when UPGRADER_ARGS is non-empty
    a " with arguments: ..." clause is appended to step 1.
    """
    if UPGRADER_ARGS:
        args_clause = f" with arguments: {UPGRADER_ARGS}"
    else:
        args_clause = ""
    return f"""\
*** cc-ci UPGRADER — weekly recipe-upgrade job ***
You are the cc-ci Upgrader: a ONE-SHOT job agent, NOT a perpetual loop. Run the
recipe-upgrade sequence to completion, then STOP. Your cwd is {WORKDIR}; reach the CI
server with `ssh cc-ci`; creds are in {WORKDIR}/.testenv; skills in {WORKDIR}/.claude/skills/.

DO THIS:
1. Invoke the /upgrade-all skill in DEFAULT mode{args_clause}
   (read {WORKDIR}/.claude/skills/upgrade-all/SKILL.md for the full procedure). It surveys
   every enrolled recipe and, for each upgradeable one, runs /recipe-upgrade in DEFAULT
   mode — recipe PR only, verified by posting `!testme` on the PR (results visible in the
   PR, iterate up to 3x). A genuinely stale test gets an explanatory PR COMMENT, never a
   test edit.
2. Process recipes via per-recipe SUBAGENTS so your own context stays light. If your
   context usage climbs (~80%), run /compact before continuing.
3. Write + push the weekly summary (the PR list is the actionable output for the operator).
4. WHEN THE RUN IS COMPLETE: STOP. Print the final summary (lead with the PR list) and an
   `UPGRADE RUN COMPLETE` line, then go idle. Do NOT loop, do NOT re-run, and do NOT kill
   your own session — leave it up so the operator can review the output in the web UI.
   Next week's run starts a fresh session (the launcher clears this idle one).

GUARDRAILS: NEVER merge any PR. NEVER weaken a test. DEFAULT mode only — do NOT pass
--with-tests (updating cc-ci tests is the operator's per-recipe opt-in). Single-writer:
dedicated branches + separate clones, never push main, never touch the build loops'
/cc-ci /cc-ci-adv clones. The shared Swarm is stateful — go sequentially.
"""

# ── launch ────────────────────────────────────────────────────────────────────

def start(mode="use-or-create"):
    """Launch the upgrader agent in a detached tmux session named SESSION.

    Args:
        mode: "use-or-create" leaves an existing session alone when session_busy()
            reports a turn in flight; any other value (main passes "fresh") always
            replaces an existing session.

    Steps: validate tmux and the selected backend binary, optionally kill the old
    session, write the kickoff prompt into LOG_DIR, start the backend CLI inside
    tmux with that prompt as its argument, pipe the pane output to
    LOG_DIR/SESSION.log and, for the opencode backend (unless UPGRADER_BABYSIT is
    set to something other than "1"), spawn a detached `babysit` process.

    Terminates the process via die() when tmux or the backend binary is missing,
    the backend is unknown, or tmux fails to create the session.
    """
    import shlex
    import shutil
    import time

    if not shutil.which("tmux"):
        die("tmux not found")

    # Validate the backend BEFORE touching an existing session, so a misconfigured
    # invocation cannot kill a reviewable session and then fail to replace it.
    if BACKEND == "claude":
        if not shutil.which(CLAUDE_BIN):
            die(f"claude CLI not found — set CLAUDE_BIN (currently: {CLAUDE_BIN})")
    elif BACKEND == "opencode":
        if not Path(OPENCODE_BIN).exists():
            die(f"opencode not found at {OPENCODE_BIN}")
    else:
        die(f"unknown LOOP_BACKEND '{BACKEND}' — use 'claude' or 'opencode'")

    Path(LOG_DIR).mkdir(parents=True, exist_ok=True)

    if session_alive():
        if mode == "use-or-create" and session_busy():
            log(f"{SESSION} already running a job (busy) — leaving it")
            return
        log(f"{SESSION} exists but idle/stale (or fresh requested) — killing it first")
        kill_session()
        time.sleep(1)  # give tmux a moment to tear the old session down

    kf = Path(LOG_DIR) / f".kickoff-{SESSION}.txt"
    # The prompt contains non-ASCII punctuation; do not depend on the locale encoding.
    kf.write_text(build_kickoff(), encoding="utf-8")
    # The shell expands this to the prompt text as ONE argument.
    prompt_arg = f"\"$(cat {shlex.quote(str(kf))})\""

    model_flag = f"--model {shlex.quote(MODEL)}" if MODEL else ""
    log(f"starting {SESSION} (backend={BACKEND}, model={MODEL}, args='{UPGRADER_ARGS or '<none>'}')")

    if BACKEND == "claude":
        rc = f"--remote-control {shlex.quote(SESSION)}" if REMOTE_CONTROL else ""
        # CLAUDE_FLAGS is deliberately left unquoted: it may carry several flags.
        cmd = f"{shlex.quote(CLAUDE_BIN)} {rc} {model_flag} {CLAUDE_FLAGS} {prompt_arg}"
    else:  # "opencode" — the only other value that passed validation above
        # NOTE: -m/--model and --attach/--title/--share are flags on the `run` SUBCOMMAND,
        # so they must come AFTER `run` (a global `opencode --model X run` is ignored).
        share_flag = "--share" if OPENCODE_SHARE else ""
        cmd = (
            f"set -a; . /srv/cc-ci/.testenv; set +a; "
            f"{shlex.quote(OPENCODE_BIN)} run {model_flag} {share_flag} "
            f"--attach {shlex.quote(OPENCODE_SERVER)} "
            f"--title {shlex.quote(SESSION)} {prompt_arg}"
        )
        log(f"  attached to {OPENCODE_SERVER} → http://oc.commoninternet.net (tailnet only)"
            + ("  +public --share link (printed in the session)" if OPENCODE_SHARE else ""))

    launched = subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", WORKDIR, cmd])
    if launched.returncode != 0:
        die(f"tmux new-session failed (exit {launched.returncode}) — {SESSION} not started")

    log_path = f"{LOG_DIR}/{SESSION}.log"
    piped = subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION,
                            "cat >> " + shlex.quote(log_path)])
    if piped.returncode != 0:
        log(f"  WARNING: tmux pipe-pane failed (exit {piped.returncode}) — no session log")
    log(f"started. attach: tmux attach -t {SESSION}  log: {LOG_DIR}/{SESSION}.log")

    # For the opencode backend, spawn a detached babysitter that auto-resumes the run if the
    # opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See babysit().
    # NOTE(review): a babysitter left over from an earlier, never-completed run is not
    # stopped here — confirm whether two watchers can end up running side by side.
    if BACKEND == "opencode" and os.environ.get("UPGRADER_BABYSIT", "1") == "1":
        # The child receives its own copy of the descriptor, so the parent's handle
        # can be closed as soon as Popen returns.
        with open(f"{LOG_DIR}/{SESSION}-babysit.log", "a") as sitter_log:
            subprocess.Popen([sys.executable or "python3", os.path.realpath(__file__), "babysit"],
                             stdin=subprocess.DEVNULL, stdout=sitter_log,
                             stderr=subprocess.STDOUT, start_new_session=True)
        log("  babysitter spawned — auto-resume on usage-limit stalls")

# ── opencode stall-detect + auto-resume watchdog ────────────────────────────────
# The opencode-go subscription enforces a rolling usage-limit (HTTP 429 + retry-after). When it
# trips mid-run, the `opencode run` agent loop ENDS and does NOT self-resume. This watchdog detects
# the stall (the session log stops growing), waits out the limit, and resumes the SAME session —
# context preserved — via `opencode run -s <id> --continue`. Standalone: launch-upgrader.py {resume|babysit}.
import json as _json, urllib.request as _ureq, time as _time

# Text the agent is asked to print once the whole run has finished.
DONE_MARKER = "UPGRADE RUN COMPLETE"
# opencode-go chat-completions endpoint, probed to detect an active usage limit.
GO_ENDPOINT = "https://opencode.ai/zen/go/v1/chat/completions"
# opencode credential file; its "opencode-go" entry is read for the API key.
AUTH_JSON = os.path.expanduser("~/.local/share/opencode/auth.json")
# File that receives the managed tmux pane's output.
LOG_FILE = f"{LOG_DIR}/{SESSION}.log"

# Minutes the log may stay unmodified before the run counts as stalled.
_stall_min_raw = os.environ.get("UPGRADER_STALL_MIN", "15")
STALL_MIN = float(_stall_min_raw)
# Seconds between babysitter polls.
_check_sec_raw = os.environ.get("UPGRADER_CHECK_SEC", "180")
CHECK_EVERY = int(_check_sec_raw)

def _server_get(path):
    """GET ``OPENCODE_SERVER + path`` (15 s timeout) and return the parsed JSON body.

    Any failure — connection, HTTP error, invalid JSON — yields None.
    """
    try:
        url = OPENCODE_SERVER + path
        with _ureq.urlopen(url, timeout=15) as response:
            payload = _json.load(response)
    except Exception:
        return None
    return payload

def _session_id():
    """Return the id of the newest top-level opencode session titled SESSION, or None.

    The server's /session listing may be a bare list or an object with a "data"
    list. Sessions carrying a parentID/parentId are skipped; "newest" is decided by
    ``time.created`` (missing values count as 0).
    """
    listing = _server_get("/session") or []
    if not isinstance(listing, list):
        listing = listing.get("data", [])

    def _created(sess):
        return (sess.get("time") or {}).get("created") or 0

    top_level = []
    for sess in listing:
        if sess.get("title") != SESSION:
            continue
        if sess.get("parentID") or sess.get("parentId"):
            continue  # child/sub-agent session
        top_level.append(sess)

    # max() keeps the first of equally-new candidates, like a stable descending sort.
    newest = max(top_level, key=_created, default=None)
    return None if newest is None else newest["id"]

def _log_idle_min():
    """Return the minutes elapsed since LOG_FILE was last modified, or None if unknown."""
    try:
        age_seconds = _time.time() - os.stat(LOG_FILE).st_mtime
    except Exception:
        # Typically: the log does not exist yet.
        return None
    return age_seconds / 60.0

def _go_key():
    """Return the opencode-go API key from AUTH_JSON, or None when it is unavailable.

    The key is read from ``<AUTH_JSON>["opencode-go"]["key"]``. A missing or
    unreadable file, invalid JSON, or an unexpected structure all yield None
    (deliberate best-effort: the caller treats "no key" as "cannot probe").
    """
    try:
        # Context manager so the descriptor is closed even if parsing fails.
        with open(AUTH_JSON, encoding="utf-8") as fh:
            auth = _json.load(fh)
        return (auth.get("opencode-go") or {}).get("key")
    except Exception:
        return None

def _limit_retry_after():
    """Probe the opencode-go endpoint; return seconds to wait if usage-limited, else 0.

    A tiny chat-completion request is sent with the key from _go_key(). Only an
    HTTP 429 counts as "limited": its Retry-After header is returned as an int
    (at least 1; 300 when the header is missing or not an integer). Every other
    outcome — success, any other HTTP status, a network error, or no API key —
    returns 0, i.e. the caller treats the limit as clear.
    """
    key = _go_key()
    if not key:
        return 0
    body = _json.dumps({"model": (MODEL or "").split("/")[-1] or "glm-5.2", "max_tokens": 8,
                        "messages": [{"role": "user", "content": "hi"}]}).encode()
    req = _ureq.Request(GO_ENDPOINT, data=body, method="POST",
                        headers={"Authorization": "Bearer " + key, "content-type": "application/json"})
    try:
        # Drain and close the response explicitly; this runs repeatedly in a daemon loop.
        with _ureq.urlopen(req, timeout=20) as resp:
            resp.read()
        return 0
    except _ureq.HTTPError as err:
        try:
            if err.code != 429:
                return 0
            try:
                return max(1, int(err.headers.get("retry-after", "300")))
            except Exception:
                # Header absent or not a plain integer — fall back to five minutes.
                return 300
        finally:
            # An HTTPError also wraps the open response; release it (best effort).
            try:
                err.close()
            except Exception:
                pass
    except Exception:
        return 0

def _run_pids():
    """Return the PIDs of live ``opencode run ... --attach`` processes.

    Scans /proc/<pid>/cmdline for every numeric entry except this process. A
    process matches when "opencode" occurs anywhere in its command line and both
    "run" and "--attach" are separate arguments. Processes that vanish or cannot
    be read during the scan are skipped.
    """
    # NOTE(review): the match is host-wide and does not look at SESSION — confirm no
    # unrelated `opencode run --attach` jobs share this machine before signalling these.
    me = os.getpid()
    pids = []
    for entry in os.listdir("/proc"):
        if not entry.isdigit() or int(entry) == me:
            continue
        try:
            # Close each cmdline file promptly instead of leaking one handle per process.
            with open(f"/proc/{entry}/cmdline", "rb") as fh:
                argv = fh.read().split(b"\0")
        except OSError:
            continue  # process exited mid-scan or is not readable
        if b"opencode" in b" ".join(argv) and b"run" in argv and b"--attach" in argv:
            pids.append(int(entry))
    return pids

def _completed():
    """Return True when DONE_MARKER occurs in the last 20000 bytes of LOG_FILE.

    The tail is read in binary mode: seeking a text-mode file to a computed byte
    offset is not supported by the io contract and may land inside a multi-byte
    character. Any error (e.g. the log does not exist yet) yields False.
    """
    # NOTE(review): the log is opened for append by the launcher, so a marker left by
    # an earlier run could still sit in this tail — confirm whether that can occur.
    try:
        with open(LOG_FILE, "rb") as fh:
            fh.seek(0, os.SEEK_END)
            fh.seek(max(0, fh.tell() - 20000))
            tail = fh.read()
        return DONE_MARKER.encode("utf-8") in tail
    except Exception:
        return False

def resume(reason="manual"):
    """Resume the managed opencode session from where it stopped (context preserved).

    Looks up the session id on OPENCODE_SERVER, SIGTERMs the processes reported by
    _run_pids(), kills the tmux session, then starts a new tmux session running
    ``opencode run -s <id> --continue`` with a "continue, do not start over" prompt
    and re-attaches the pane log.

    Args:
        reason: free-text tag included in the log line.

    Returns:
        True if the tmux session was relaunched; False if no matching opencode
        session was found or tmux failed to start the new session.
    """
    import shlex
    import signal

    sid = _session_id()
    if not sid:
        log(f"resume: no top-level '{SESSION}' session on {OPENCODE_SERVER} — cannot resume")
        return False
    log(f"resume ({reason}): continuing session {sid}")
    for pid in _run_pids():
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError:
            pass  # already gone, or not ours to signal — best effort
    _time.sleep(2)
    kill_session()
    _time.sleep(1)

    # `resume` can be invoked standalone, before any `start` created the directory.
    Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
    kf = Path(LOG_DIR) / f".kickoff-{SESSION}-resume.txt"
    kf.write_text(
        "The opencode-go usage limit has reset (or the run stalled). You were mid-way through the weekly "
        "cc-ci /upgrade-all run. CONTINUE from where you left off — do NOT start over. Process the enrolled "
        "recipes not yet done this week, alphabetically; SKIP ones already done (their PRs exist — extend, "
        "never duplicate). Per recipe: run /recipe-upgrade in DEFAULT mode via a subagent, verify with "
        "!testme, open/extend the recipe PR (NEVER merge, NEVER weaken a test), <= DRONE_RUNNER_CAPACITY "
        "concurrent. immich has a tag+digest image abra can't parse — do the upstream-direct cross-check "
        "(recipe-upgrade SKILL §1), don't silently skip it. When all remaining recipes are done: "
        "write+push the weekly summary, then `python3 /srv/cc-ci/cc-ci-plan/launch-report.py fresh`, print "
        "'" + DONE_MARKER + "', and go idle.",
        encoding="utf-8")

    # Same flag handling as start(): omit --model entirely when no model is configured,
    # and shell-quote every interpolated value (the session id comes from the server).
    model_flag = f"--model {shlex.quote(MODEL)}" if MODEL else ""
    share = "--share" if OPENCODE_SHARE else ""
    cmd = (f"set -a; . /srv/cc-ci/.testenv; set +a; "
           f"{shlex.quote(OPENCODE_BIN)} run -s {shlex.quote(str(sid))} --continue "
           f"{model_flag} {share} --attach {shlex.quote(OPENCODE_SERVER)} "
           f"\"$(cat {shlex.quote(str(kf))})\"")
    launched = subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", WORKDIR, cmd])
    if launched.returncode != 0:
        log(f"resume: tmux new-session failed (exit {launched.returncode}) — session {sid} NOT relaunched")
        return False
    subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, "cat >> " + shlex.quote(LOG_FILE)])
    log(f"resume: relaunched {SESSION} (session {sid})")
    return True

def babysit():
    """Watch the opencode upgrader and resume it after a stall.

    Polls every CHECK_EVERY seconds. Returns once _completed() reports the done
    marker. When the log has been idle longer than STALL_MIN minutes it first asks
    _limit_retry_after() whether a usage limit is active and, if so, sleeps it out
    (retry-after + 30 s, capped at one hour). Otherwise two consecutive stalled
    polls trigger resume(). Spawned by an opencode `start`; also usable standalone.
    """
    log(f"babysit: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
    strikes = 0  # consecutive polls that found the run stalled while not usage-limited
    while True:
        _time.sleep(CHECK_EVERY)

        if _completed():
            log("babysit: run completed — exiting")
            return

        idle = _log_idle_min()
        making_progress = idle is None or idle <= STALL_MIN
        if making_progress:
            strikes = 0
            continue

        # The log has been static for more than STALL_MIN: limit or wedge, not work.
        retry = _limit_retry_after()
        if retry > 0:
            wait = min(retry + 30, 3600)
            log(f"babysit: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s")
            _time.sleep(wait)
            continue

        # Stalled without a limit: require a second consecutive miss before acting.
        strikes += 1
        if strikes < 2:
            continue
        log(f"babysit: stalled {idle:.0f}min, limit clear — auto-resuming")
        resume("babysit auto-resume")
        strikes = 0

# ── main ──────────────────────────────────────────────────────────────────────

def main():
    """Dispatch the sub-command given in ``sys.argv[1]`` (default: "start").

    Known commands: start, fresh, stop, status, attach, resume, babysit. Any other
    value prints the usage text. `resume` exits with status 1 when nothing could
    be resumed, so callers can detect the failure.
    """
    import shlex

    cmd = sys.argv[1] if len(sys.argv) > 1 else "start"

    if cmd == "start":
        start("use-or-create")
    elif cmd == "fresh":
        start("fresh")
    elif cmd == "stop":
        if session_alive():
            log(f"killing {SESSION}")
            kill_session()
        else:
            log(f"{SESSION} not running")
    elif cmd == "status":
        if session_alive():
            busy = "busy" if session_busy() else "idle/finishing"
            log(f"{SESSION}: RUNNING ({busy})")
            # The [r] bracket keeps grep from matching its own command line; the
            # pattern is shell-quoted so an unusual SESSION value cannot break out.
            pattern = shlex.quote(f"[r]emote-control {SESSION}")
            subprocess.run(
                f"ps -eo pid,etime,args | grep {pattern} || true",
                shell=True)
        else:
            log(f"{SESSION}: stopped")
        log(f"backend: {BACKEND}  model: {MODEL}  args: '{UPGRADER_ARGS or '<none>'}'")
    elif cmd == "attach":
        # Replace this process with tmux so the terminal is handed over directly.
        os.execvp("tmux", ["tmux", "attach", "-t", SESSION])
    elif cmd == "resume":
        # Surface a failed resume through the exit status instead of discarding it.
        if not resume("manual"):
            sys.exit(1)
    elif cmd == "babysit":
        babysit()
    else:
        print(f"""cc-ci upgrader launcher — one-shot weekly recipe-upgrade job

  launch-upgrader.py start    use-or-create (leave busy run alone, else start fresh)
  launch-upgrader.py fresh    always kill existing + start fresh
  launch-upgrader.py stop     kill the session
  launch-upgrader.py status   show session state
  launch-upgrader.py attach   tmux attach
  launch-upgrader.py resume   continue the opencode session from where it stalled (-s <id> --continue)
  launch-upgrader.py babysit  watch + auto-resume the opencode run across usage-limit (429) stalls

Backend: {BACKEND}  (LOOP_BACKEND or UPGRADER_BACKEND env var)
Model:   {MODEL}  (LOOP_MODEL or UPGRADER_MODEL env var)
Args:    {UPGRADER_ARGS or '<none>'}  (UPGRADER_ARGS env var, passed to /upgrade-all)

claude:   viewable at claude.ai/code
opencode: viewable at http://oc.commoninternet.net  server={OPENCODE_SERVER}
""")


if __name__ == "__main__":
    main()