diff --git a/cc-ci-plan/launch-upgrader.py b/cc-ci-plan/launch-upgrader.py index 090eea0..5c637a1 100644 --- a/cc-ci-plan/launch-upgrader.py +++ b/cc-ci-plan/launch-upgrader.py @@ -162,23 +162,23 @@ def start(mode="use-or-create"): subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_DIR}/{SESSION}.log'"]) log(f"started. attach: tmux attach -t {SESSION} log: {LOG_DIR}/{SESSION}.log") - # For the opencode backend, spawn a detached babysitter that auto-resumes the run if the - # opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See babysit(). - if BACKEND == "opencode" and os.environ.get("UPGRADER_BABYSIT", "1") == "1": - subprocess.Popen(["python3", os.path.realpath(__file__), "babysit"], - stdout=open(f"{LOG_DIR}/{SESSION}-babysit.log", "a"), + # For the opencode backend, spawn a detached watchdog that auto-resumes the run if the + # opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See watchdog(). + if BACKEND == "opencode" and os.environ.get("UPGRADER_WATCHDOG", "1") == "1": + subprocess.Popen(["python3", os.path.realpath(__file__), "watchdog"], + stdout=open(f"{LOG_DIR}/{SESSION}-watchdog.log", "a"), stderr=subprocess.STDOUT, start_new_session=True) - log(" babysitter spawned — auto-resume on usage-limit stalls") + log(" watchdog spawned — auto-resume on usage-limit stalls") # ── opencode stall-detect + auto-resume watchdog ──────────────────────────────── # The opencode-go subscription enforces a rolling usage-limit (HTTP 429 + retry-after). When it # trips mid-run, the `opencode run` agent loop ENDS and does NOT self-resume. This watchdog detects # the stall (the session log stops growing), waits out the limit, and resumes the SAME session — -# context preserved — via `opencode run -s --continue`. Standalone: launch-upgrader.py {resume|babysit}. +# context preserved — via `opencode run -s --continue`. Standalone: launch-upgrader.py {resume|watchdog}. import json as _json, urllib.request as _ureq, time as _time STALL_MIN = float(os.environ.get("UPGRADER_STALL_MIN", "15")) # log-idle minutes ⇒ stalled -CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180")) # babysitter poll cadence +CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180")) # watchdog poll cadence DONE_MARKER = "UPGRADE RUN COMPLETE" GO_ENDPOINT = "https://opencode.ai/zen/go/v1/chat/completions" AUTH_JSON = os.path.expanduser("~/.local/share/opencode/auth.json") @@ -280,15 +280,15 @@ def resume(reason="manual"): subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_FILE}'"]) log(f"resume: relaunched {SESSION} (session {sid})"); return True -def babysit(): +def watchdog(): """Watch the opencode upgrader; on a stall, wait out any usage-limit then resume the session. Exits when the run prints UPGRADE RUN COMPLETE. Spawned by an opencode `start`; also standalone.""" - log(f"babysit: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)") + log(f"watchdog: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)") misses = 0 while True: _time.sleep(CHECK_EVERY) if _completed(): - log("babysit: run completed — exiting"); return + log("watchdog: run completed — exiting"); return idle = _log_idle_min() if idle is None or idle <= STALL_MIN: misses = 0; continue @@ -296,13 +296,13 @@ def babysit(): retry = _limit_retry_after() if retry > 0: wait = min(retry + 30, 3600) - log(f"babysit: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s") + log(f"watchdog: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s") _time.sleep(wait); continue # Not limited but stalled — confirm it's really wedged (two consecutive misses), then resume. misses += 1 if misses >= 2: - log(f"babysit: stalled {idle:.0f}min, limit clear — auto-resuming") - resume("babysit auto-resume"); misses = 0 + log(f"watchdog: stalled {idle:.0f}min, limit clear — auto-resuming") + resume("watchdog auto-resume"); misses = 0 # ── main ────────────────────────────────────────────────────────────────────── @@ -333,8 +333,8 @@ def main(): os.execvp("tmux", ["tmux", "attach", "-t", SESSION]) elif cmd == "resume": resume("manual") - elif cmd == "babysit": - babysit() + elif cmd == "watchdog": + watchdog() else: print(f"""cc-ci upgrader launcher — one-shot weekly recipe-upgrade job @@ -344,7 +344,7 @@ def main(): launch-upgrader.py status show session state launch-upgrader.py attach tmux attach launch-upgrader.py resume continue the opencode session from where it stalled (-s --continue) - launch-upgrader.py babysit watch + auto-resume the opencode run across usage-limit (429) stalls + launch-upgrader.py watchdog watch + auto-resume the opencode run across usage-limit (429) stalls Backend: {BACKEND} (LOOP_BACKEND or UPGRADER_BACKEND env var) Model: {MODEL} (LOOP_MODEL or UPGRADER_MODEL env var)