launch-upgrader: rename babysit -> watchdog (match agents.py convention)

Subcommand, function, env var (UPGRADER_BABYSIT -> UPGRADER_WATCHDOG), and log file
(<session>-babysit.log -> <session>-watchdog.log) renamed; behavior otherwise unchanged.
Only the opencode backend's 'start' auto-spawns the watchdog.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
autonomic-bot
2026-06-23 01:33:07 +00:00
parent 816985160d
commit 6f9cbc1a56

View File

@ -162,23 +162,23 @@ def start(mode="use-or-create"):
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION,
f"cat >> '{LOG_DIR}/{SESSION}.log'"])
log(f"started. attach: tmux attach -t {SESSION} log: {LOG_DIR}/{SESSION}.log")
# For the opencode backend, spawn a detached babysitter that auto-resumes the run if the
# opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See babysit().
if BACKEND == "opencode" and os.environ.get("UPGRADER_BABYSIT", "1") == "1":
subprocess.Popen(["python3", os.path.realpath(__file__), "babysit"],
stdout=open(f"{LOG_DIR}/{SESSION}-babysit.log", "a"),
# For the opencode backend, spawn a detached watchdog that auto-resumes the run if the
# opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See watchdog().
if BACKEND == "opencode" and os.environ.get("UPGRADER_WATCHDOG", "1") == "1":
subprocess.Popen(["python3", os.path.realpath(__file__), "watchdog"],
stdout=open(f"{LOG_DIR}/{SESSION}-watchdog.log", "a"),
stderr=subprocess.STDOUT, start_new_session=True)
log(" babysitter spawned — auto-resume on usage-limit stalls")
log(" watchdog spawned — auto-resume on usage-limit stalls")
# ── opencode stall-detect + auto-resume watchdog ────────────────────────────────
# The opencode-go subscription enforces a rolling usage-limit (HTTP 429 + retry-after). When it
# trips mid-run, the `opencode run` agent loop ENDS and does NOT self-resume. This watchdog detects
# the stall (the session log stops growing), waits out the limit, and resumes the SAME session —
# context preserved — via `opencode run -s <id> --continue`. Standalone: launch-upgrader.py {resume|babysit}.
# context preserved — via `opencode run -s <id> --continue`. Standalone: launch-upgrader.py {resume|watchdog}.
import json as _json, urllib.request as _ureq, time as _time
STALL_MIN = float(os.environ.get("UPGRADER_STALL_MIN", "15")) # log-idle minutes ⇒ stalled
CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180")) # babysitter poll cadence
CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180")) # watchdog poll cadence
DONE_MARKER = "UPGRADE RUN COMPLETE"
GO_ENDPOINT = "https://opencode.ai/zen/go/v1/chat/completions"
AUTH_JSON = os.path.expanduser("~/.local/share/opencode/auth.json")
@ -280,15 +280,15 @@ def resume(reason="manual"):
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_FILE}'"])
log(f"resume: relaunched {SESSION} (session {sid})"); return True
def babysit():
def watchdog():
"""Watch the opencode upgrader; on a stall, wait out any usage-limit then resume the session.
Exits when the run prints UPGRADE RUN COMPLETE. Spawned by an opencode `start`; also standalone."""
log(f"babysit: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
log(f"watchdog: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
misses = 0
while True:
_time.sleep(CHECK_EVERY)
if _completed():
log("babysit: run completed — exiting"); return
log("watchdog: run completed — exiting"); return
idle = _log_idle_min()
if idle is None or idle <= STALL_MIN:
misses = 0; continue
@ -296,13 +296,13 @@ def babysit():
retry = _limit_retry_after()
if retry > 0:
wait = min(retry + 30, 3600)
log(f"babysit: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s")
log(f"watchdog: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s")
_time.sleep(wait); continue
# Not limited but stalled — confirm it's really wedged (two consecutive misses), then resume.
misses += 1
if misses >= 2:
log(f"babysit: stalled {idle:.0f}min, limit clear — auto-resuming")
resume("babysit auto-resume"); misses = 0
log(f"watchdog: stalled {idle:.0f}min, limit clear — auto-resuming")
resume("watchdog auto-resume"); misses = 0
# ── main ──────────────────────────────────────────────────────────────────────
@ -333,8 +333,8 @@ def main():
os.execvp("tmux", ["tmux", "attach", "-t", SESSION])
elif cmd == "resume":
resume("manual")
elif cmd == "babysit":
babysit()
elif cmd == "watchdog":
watchdog()
else:
print(f"""cc-ci upgrader launcher — one-shot weekly recipe-upgrade job
@ -344,7 +344,7 @@ def main():
launch-upgrader.py status show session state
launch-upgrader.py attach tmux attach
launch-upgrader.py resume continue the opencode session from where it stalled (-s <id> --continue)
launch-upgrader.py babysit watch + auto-resume the opencode run across usage-limit (429) stalls
launch-upgrader.py watchdog watch + auto-resume the opencode run across usage-limit (429) stalls
Backend: {BACKEND} (LOOP_BACKEND or UPGRADER_BACKEND env var)
Model: {MODEL} (LOOP_MODEL or UPGRADER_MODEL env var)