launch-upgrader: rename babysit -> watchdog (match agents.py convention)
The subcommand, function, env var (UPGRADER_WATCHDOG), and log file are renamed; behavior is unchanged. Only the opencode upgrader's 'start' auto-spawns the watchdog.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -162,23 +162,23 @@ def start(mode="use-or-create"):
|
||||
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION,
|
||||
f"cat >> '{LOG_DIR}/{SESSION}.log'"])
|
||||
log(f"started. attach: tmux attach -t {SESSION} log: {LOG_DIR}/{SESSION}.log")
|
||||
# For the opencode backend, spawn a detached babysitter that auto-resumes the run if the
|
||||
# opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See babysit().
|
||||
if BACKEND == "opencode" and os.environ.get("UPGRADER_BABYSIT", "1") == "1":
|
||||
subprocess.Popen(["python3", os.path.realpath(__file__), "babysit"],
|
||||
stdout=open(f"{LOG_DIR}/{SESSION}-babysit.log", "a"),
|
||||
# For the opencode backend, spawn a detached watchdog that auto-resumes the run if the
|
||||
# opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See watchdog().
|
||||
if BACKEND == "opencode" and os.environ.get("UPGRADER_WATCHDOG", "1") == "1":
|
||||
subprocess.Popen(["python3", os.path.realpath(__file__), "watchdog"],
|
||||
stdout=open(f"{LOG_DIR}/{SESSION}-watchdog.log", "a"),
|
||||
stderr=subprocess.STDOUT, start_new_session=True)
|
||||
log(" babysitter spawned — auto-resume on usage-limit stalls")
|
||||
log(" watchdog spawned — auto-resume on usage-limit stalls")
|
||||
|
||||
# ── opencode stall-detect + auto-resume watchdog ────────────────────────────────
|
||||
# The opencode-go subscription enforces a rolling usage-limit (HTTP 429 + retry-after). When it
|
||||
# trips mid-run, the `opencode run` agent loop ENDS and does NOT self-resume. This watchdog detects
|
||||
# the stall (the session log stops growing), waits out the limit, and resumes the SAME session —
|
||||
# context preserved — via `opencode run -s <id> --continue`. Standalone: launch-upgrader.py {resume|babysit}.
|
||||
# context preserved — via `opencode run -s <id> --continue`. Standalone: launch-upgrader.py {resume|watchdog}.
|
||||
import json as _json, urllib.request as _ureq, time as _time
|
||||
|
||||
STALL_MIN = float(os.environ.get("UPGRADER_STALL_MIN", "15")) # log-idle minutes ⇒ stalled
|
||||
CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180")) # babysitter poll cadence
|
||||
CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180")) # watchdog poll cadence
|
||||
DONE_MARKER = "UPGRADE RUN COMPLETE"
|
||||
GO_ENDPOINT = "https://opencode.ai/zen/go/v1/chat/completions"
|
||||
AUTH_JSON = os.path.expanduser("~/.local/share/opencode/auth.json")
|
||||
@@ -280,15 +280,15 @@ def resume(reason="manual"):
|
||||
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_FILE}'"])
|
||||
log(f"resume: relaunched {SESSION} (session {sid})"); return True
|
||||
|
||||
def babysit():
|
||||
def watchdog():
|
||||
"""Watch the opencode upgrader; on a stall, wait out any usage-limit then resume the session.
|
||||
Exits when the run prints UPGRADE RUN COMPLETE. Spawned by an opencode `start`; also standalone."""
|
||||
log(f"babysit: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
|
||||
log(f"watchdog: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
|
||||
misses = 0
|
||||
while True:
|
||||
_time.sleep(CHECK_EVERY)
|
||||
if _completed():
|
||||
log("babysit: run completed — exiting"); return
|
||||
log("watchdog: run completed — exiting"); return
|
||||
idle = _log_idle_min()
|
||||
if idle is None or idle <= STALL_MIN:
|
||||
misses = 0; continue
|
||||
@@ -296,13 +296,13 @@ def babysit():
|
||||
retry = _limit_retry_after()
|
||||
if retry > 0:
|
||||
wait = min(retry + 30, 3600)
|
||||
log(f"babysit: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s")
|
||||
log(f"watchdog: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s")
|
||||
_time.sleep(wait); continue
|
||||
# Not limited but stalled — confirm it's really wedged (two consecutive misses), then resume.
|
||||
misses += 1
|
||||
if misses >= 2:
|
||||
log(f"babysit: stalled {idle:.0f}min, limit clear — auto-resuming")
|
||||
resume("babysit auto-resume"); misses = 0
|
||||
log(f"watchdog: stalled {idle:.0f}min, limit clear — auto-resuming")
|
||||
resume("watchdog auto-resume"); misses = 0
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -333,8 +333,8 @@ def main():
|
||||
os.execvp("tmux", ["tmux", "attach", "-t", SESSION])
|
||||
elif cmd == "resume":
|
||||
resume("manual")
|
||||
elif cmd == "babysit":
|
||||
babysit()
|
||||
elif cmd == "watchdog":
|
||||
watchdog()
|
||||
else:
|
||||
print(f"""cc-ci upgrader launcher — one-shot weekly recipe-upgrade job
|
||||
|
||||
@@ -344,7 +344,7 @@ def main():
|
||||
launch-upgrader.py status show session state
|
||||
launch-upgrader.py attach tmux attach
|
||||
launch-upgrader.py resume continue the opencode session from where it stalled (-s <id> --continue)
|
||||
launch-upgrader.py babysit watch + auto-resume the opencode run across usage-limit (429) stalls
|
||||
launch-upgrader.py watchdog watch + auto-resume the opencode run across usage-limit (429) stalls
|
||||
|
||||
Backend: {BACKEND} (LOOP_BACKEND or UPGRADER_BACKEND env var)
|
||||
Model: {MODEL} (LOOP_MODEL or UPGRADER_MODEL env var)
|
||||
|
||||
Reference in New Issue
Block a user