launch-upgrader: add stall-detect + auto-resume watchdog (opencode-go limit)
The opencode-go subscription's rolling usage limit (HTTP 429) ends the 'opencode run' agent loop mid-run; it does NOT self-resume. Add:
- resume: continue the SAME session (context preserved) via 'opencode run -s <id> --continue' — finds the session from the web server, kills the idle proc safely (via /proc scan, never a self-matching 'pkill -f'), and relaunches it in the tmux session.
- babysit: poll the session log; on a stall (>15 min idle), wait out any 429 retry-after, then auto-resume. Spawned automatically by an opencode 'start'.
So a usage-limit pause now self-heals instead of needing a manual nudge.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@ -162,6 +162,147 @@ def start(mode="use-or-create"):
|
|||||||
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION,
|
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION,
|
||||||
f"cat >> '{LOG_DIR}/{SESSION}.log'"])
|
f"cat >> '{LOG_DIR}/{SESSION}.log'"])
|
||||||
log(f"started. attach: tmux attach -t {SESSION} log: {LOG_DIR}/{SESSION}.log")
|
log(f"started. attach: tmux attach -t {SESSION} log: {LOG_DIR}/{SESSION}.log")
|
||||||
|
# For the opencode backend, spawn a detached babysitter that auto-resumes the run if the
|
||||||
|
# opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See babysit().
|
||||||
|
if BACKEND == "opencode" and os.environ.get("UPGRADER_BABYSIT", "1") == "1":
|
||||||
|
subprocess.Popen(["python3", os.path.realpath(__file__), "babysit"],
|
||||||
|
stdout=open(f"{LOG_DIR}/{SESSION}-babysit.log", "a"),
|
||||||
|
stderr=subprocess.STDOUT, start_new_session=True)
|
||||||
|
log(" babysitter spawned — auto-resume on usage-limit stalls")
|
||||||
|
|
||||||
|
# ── opencode stall-detect + auto-resume watchdog ────────────────────────────────
|
||||||
|
# The opencode-go subscription enforces a rolling usage-limit (HTTP 429 + retry-after). When it
|
||||||
|
# trips mid-run, the `opencode run` agent loop ENDS and does NOT self-resume. This watchdog detects
|
||||||
|
# the stall (the session log stops growing), waits out the limit, and resumes the SAME session —
|
||||||
|
# context preserved — via `opencode run -s <id> --continue`. Standalone: launch-upgrader.py {resume|babysit}.
|
||||||
|
import json as _json, urllib.request as _ureq, time as _time
|
||||||
|
|
||||||
|
# Tunables and fixed locations for the stall-detect / auto-resume watchdog.

# Minutes the session log may go unmodified before babysit() treats the run as stalled.
# Overridable via the UPGRADER_STALL_MIN environment variable.
STALL_MIN = float(os.environ.get("UPGRADER_STALL_MIN", "15"))  # log-idle minutes ⇒ stalled
# Seconds babysit() sleeps between checks. Overridable via UPGRADER_CHECK_SEC.
CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180"))  # babysitter poll cadence
# Text _completed() looks for in the tail of the session log to decide the run has finished.
DONE_MARKER = "UPGRADE RUN COMPLETE"
# Endpoint _limit_retry_after() POSTs a tiny request to, to learn whether the usage limit is active.
GO_ENDPOINT = "https://opencode.ai/zen/go/v1/chat/completions"
# JSON file _go_key() reads the "opencode-go" key from.
AUTH_JSON = os.path.expanduser("~/.local/share/opencode/auth.json")
# Session log that tmux pipe-pane appends to; its mtime is the watchdog's liveness signal.
LOG_FILE = f"{LOG_DIR}/{SESSION}.log"
|
||||||
|
|
||||||
|
def _server_get(path):
    """Fetch ``OPENCODE_SERVER + path`` and return its body decoded as JSON.

    Best-effort helper: any failure at all (bad URL, connection error, timeout,
    HTTP error status, undecodable JSON) yields None instead of raising.
    """
    try:
        with _ureq.urlopen(OPENCODE_SERVER + path, timeout=15) as response:
            payload = _json.load(response)
    except Exception:
        return None
    return payload
|
||||||
|
|
||||||
|
def _session_id():
    """Return the id of the newest top-level opencode session titled like SESSION.

    The server's ``/session`` listing is accepted either as a bare list or as a
    dict wrapping the list under ``"data"``. Only rows whose title equals SESSION
    and that have no parent (``parentID`` / ``parentId``) are considered; the one
    with the largest ``time.created`` wins (first such row on a tie).

    Returns:
        The session id, or None when the server is unreachable, the payload is
        not in a recognised shape, or no row matches. Malformed rows are ignored
        rather than raising, so a bad response cannot take the watchdog down.
    """
    payload = _server_get("/session") or []
    if isinstance(payload, dict):
        payload = payload.get("data") or []
    if not isinstance(payload, list):
        return None

    def _created(session):
        # Missing or non-numeric timestamps sort as oldest.
        stamp = session.get("time")
        created = stamp.get("created") if isinstance(stamp, dict) else None
        return created if isinstance(created, (int, float)) else 0

    candidates = [
        row for row in payload
        if isinstance(row, dict)
        and row.get("title") == SESSION
        and not (row.get("parentID") or row.get("parentId"))
    ]
    if not candidates:
        return None
    # max() returns the first maximal element, matching a stable descending sort.
    return max(candidates, key=_created).get("id")
|
||||||
|
|
||||||
|
def _log_idle_min():
    """Minutes elapsed since LOG_FILE was last modified.

    Returns None when the modification time cannot be read (e.g. the log does
    not exist yet).
    """
    try:
        now = _time.time()
        last_write = os.path.getmtime(LOG_FILE)
    except Exception:
        return None
    return (now - last_write) / 60.0
|
||||||
|
|
||||||
|
def _go_key():
    """Return the opencode-go API key stored in AUTH_JSON.

    Reads the ``"key"`` field of the ``"opencode-go"`` entry.

    Returns:
        The key, or None when the file is missing/unreadable, is not valid JSON,
        or does not have the expected shape.
    """
    try:
        # Context manager so the handle is closed promptly; this is called on
        # every stall check of a long-lived watchdog.
        with open(AUTH_JSON, encoding="utf-8") as fh:
            auth = _json.load(fh)
        return (auth.get("opencode-go") or {}).get("key")
    except (OSError, ValueError, AttributeError):
        # OSError: unreadable file; ValueError: bad JSON / bad encoding;
        # AttributeError: JSON was not the expected nested-dict shape.
        return None
|
||||||
|
|
||||||
|
def _limit_retry_after():
    """Probe the opencode-go endpoint and report how long the usage limit lasts.

    Sends one tiny chat-completion request to GO_ENDPOINT using the key from
    _go_key().

    Returns:
        Seconds to wait (>= 1) when the endpoint answers HTTP 429: the integer
        ``retry-after`` header, or 300 when that header is absent or not an
        integer. Returns 0 in every other case: request succeeded, no key is
        available, a non-429 HTTP error, or any network failure. A return of 0
        therefore means "not known to be limited", not "known to be available".
    """
    key = _go_key()
    if not key:
        return 0

    model = (MODEL or "").split("/")[-1] or "glm-5.2"
    payload = _json.dumps({"model": model, "max_tokens": 8,
                           "messages": [{"role": "user", "content": "hi"}]}).encode()
    request = _ureq.Request(
        GO_ENDPOINT, data=payload, method="POST",
        headers={"Authorization": "Bearer " + key, "content-type": "application/json"})
    try:
        # Close the response explicitly; this runs repeatedly in a long-lived process.
        with _ureq.urlopen(request, timeout=20) as response:
            response.read()
        return 0
    except _ureq.HTTPError as err:
        # An HTTPError also wraps an open response body; release it when done.
        try:
            if err.code != 429:
                return 0
            try:
                return max(1, int(err.headers.get("retry-after", "300")))
            except (TypeError, ValueError, AttributeError):
                return 300
        finally:
            err.close()
    except Exception:
        # Best-effort probe: timeouts, DNS or connection failures count as "not limited".
        return 0
|
||||||
|
|
||||||
|
def _run_pids():
    """Return the PIDs of live ``opencode run ... --attach`` processes.

    Scans ``/proc/<pid>/cmdline`` directly. A process matches when "opencode"
    appears anywhere in its command line and both ``run`` and ``--attach`` are
    separate arguments. The calling process's own PID is always excluded.

    Returns:
        A list of integer PIDs; empty when nothing matches or ``/proc`` cannot
        be listed.
    """
    own_pid = os.getpid()
    try:
        entries = os.listdir("/proc")
    except OSError:
        # No procfs available: nothing can be discovered.
        return []

    pids = []
    for entry in entries:
        if not entry.isdigit() or int(entry) == own_pid:
            continue
        try:
            # Close each handle promptly; the scan touches every process on the host.
            with open(f"/proc/{entry}/cmdline", "rb") as fh:
                argv = fh.read().split(b"\0")
        except OSError:
            # Process exited mid-scan, or its cmdline is not readable by us.
            continue
        if b"opencode" in b" ".join(argv) and b"run" in argv and b"--attach" in argv:
            pids.append(int(entry))
    return pids
|
||||||
|
|
||||||
|
def _completed():
    """Report whether DONE_MARKER appears in the last 20000 bytes of LOG_FILE.

    The file is read in binary mode: seeking a text-mode file to a computed
    offset is undefined behaviour, and a byte search is unaffected by a
    multi-byte character being cut at the start of the tail.

    Returns:
        True when the marker is found in the tail; False when it is not, or when
        the log cannot be read.
    """
    try:
        with open(LOG_FILE, "rb") as fh:
            size = fh.seek(0, os.SEEK_END)
            fh.seek(max(0, size - 20000))
            tail = fh.read()
    except OSError:
        return False
    return DONE_MARKER.encode("utf-8") in tail
|
||||||
|
|
||||||
|
def resume(reason="manual"):
    """Resume the managed opencode session from where it stopped (context preserved).

    Looks up the session id on the opencode server, terminates any lingering
    ``opencode run`` processes and the old tmux session, writes a "continue"
    prompt to a kickoff file, and relaunches ``opencode run -s <id> --continue``
    in a fresh detached tmux session whose pane output is appended to LOG_FILE.

    Args:
        reason: Free-text label included in the log line (e.g. "manual").

    Returns:
        True when the tmux session was relaunched; False when no matching
        session exists on the server or tmux failed to start the new session.
    """
    import shlex
    import signal

    sid = _session_id()
    if not sid:
        log(f"resume: no top-level '{SESSION}' session on {OPENCODE_SERVER} — cannot resume")
        return False
    log(f"resume ({reason}): continuing session {sid}")

    for pid in _run_pids():
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError:
            pass  # already exited, or not ours to signal — nothing left to do
    _time.sleep(2)
    kill_session()
    _time.sleep(1)

    # NOTE(review): this prompt embeds DONE_MARKER. If opencode echoes the prompt
    # into the pane (and so into the piped log), _completed() could match it
    # before the run has really finished — confirm.
    kf = Path(LOG_DIR) / f".kickoff-{SESSION}-resume.txt"
    kf.write_text(
        "The opencode-go usage limit has reset (or the run stalled). You were mid-way through the weekly "
        "cc-ci /upgrade-all run. CONTINUE from where you left off — do NOT start over. Process the enrolled "
        "recipes not yet done this week, alphabetically; SKIP ones already done (their PRs exist — extend, "
        "never duplicate). Per recipe: run /recipe-upgrade in DEFAULT mode via a subagent, verify with "
        "!testme, open/extend the recipe PR (NEVER merge, NEVER weaken a test), <= DRONE_RUNNER_CAPACITY "
        "concurrent. immich has a tag+digest image abra can't parse — do the upstream-direct cross-check "
        "(recipe-upgrade SKILL §1), don't silently skip it. When all remaining recipes are done: "
        "write+push the weekly summary, then `python3 /srv/cc-ci/cc-ci-plan/launch-report.py fresh`, print "
        "'" + DONE_MARKER + "', and go idle.",
        # The prompt contains non-ASCII characters; do not depend on the locale.
        encoding="utf-8")

    share = "--share" if OPENCODE_SHARE else ""
    # Everything interpolated into the shell command is quoted; the session id in
    # particular comes from a server response. OPENCODE_BIN is left as configured.
    cmd = (f"set -a; . /srv/cc-ci/.testenv; set +a; "
           f"{OPENCODE_BIN} run -s {shlex.quote(str(sid))} --continue "
           f"--model {shlex.quote(str(MODEL))} {share} "
           f"--attach {shlex.quote(str(OPENCODE_SERVER))} "
           f"\"$(cat {shlex.quote(str(kf))})\"")

    launched = subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", WORKDIR, cmd])
    if launched.returncode != 0:
        log(f"resume: tmux new-session failed (exit {launched.returncode}) — {SESSION} not relaunched")
        return False
    subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION,
                    f"cat >> {shlex.quote(str(LOG_FILE))}"])
    log(f"resume: relaunched {SESSION} (session {sid})")
    return True
|
||||||
|
|
||||||
|
def babysit():
    """Watch the opencode upgrader; on a stall, wait out any usage-limit then resume the session.

    Every CHECK_EVERY seconds: exit if the log tail shows the run completed;
    otherwise, if the log has been idle for more than STALL_MIN minutes, probe
    the usage limit. While limited, sleep for the advertised retry-after plus
    30s (capped at one hour). Once the limit is clear, two consecutive stalled
    checks trigger resume().

    Spawned by an opencode `start`; also runnable standalone. Returns only when
    the run prints UPGRADE RUN COMPLETE.
    """
    # NOTE(review): nothing visible here distinguishes a deliberate stop from a
    # stall — confirm whether stopping the session should also end this watcher.
    log(f"babysit: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
    strikes = 0
    while True:
        _time.sleep(CHECK_EVERY)

        if _completed():
            log("babysit: run completed — exiting")
            return

        idle_min = _log_idle_min()
        still_active = idle_min is None or idle_min <= STALL_MIN
        if still_active:
            strikes = 0
            continue

        # The log has not moved for longer than STALL_MIN: limited or wedged.
        retry_after = _limit_retry_after()
        if retry_after > 0:
            pause = min(retry_after + 30, 3600)
            log(f"babysit: stalled {idle_min:.0f}min + usage-limited (retry-after {retry_after}s) — waiting {pause}s")
            _time.sleep(pause)
            continue

        # Limit is clear but the run is idle; require a second consecutive
        # observation before intervening.
        strikes += 1
        if strikes < 2:
            continue
        log(f"babysit: stalled {idle_min:.0f}min, limit clear — auto-resuming")
        resume("babysit auto-resume")
        strikes = 0
|
||||||
|
|
||||||
# ── main ──────────────────────────────────────────────────────────────────────
|
# ── main ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@ -190,6 +331,10 @@ def main():
|
|||||||
log(f"backend: {BACKEND} model: {MODEL} args: '{UPGRADER_ARGS or '<none>'}'")
|
log(f"backend: {BACKEND} model: {MODEL} args: '{UPGRADER_ARGS or '<none>'}'")
|
||||||
elif cmd == "attach":
|
elif cmd == "attach":
|
||||||
os.execvp("tmux", ["tmux", "attach", "-t", SESSION])
|
os.execvp("tmux", ["tmux", "attach", "-t", SESSION])
|
||||||
|
elif cmd == "resume":
|
||||||
|
resume("manual")
|
||||||
|
elif cmd == "babysit":
|
||||||
|
babysit()
|
||||||
else:
|
else:
|
||||||
print(f"""cc-ci upgrader launcher — one-shot weekly recipe-upgrade job
|
print(f"""cc-ci upgrader launcher — one-shot weekly recipe-upgrade job
|
||||||
|
|
||||||
@ -198,6 +343,8 @@ def main():
|
|||||||
launch-upgrader.py stop kill the session
|
launch-upgrader.py stop kill the session
|
||||||
launch-upgrader.py status show session state
|
launch-upgrader.py status show session state
|
||||||
launch-upgrader.py attach tmux attach
|
launch-upgrader.py attach tmux attach
|
||||||
|
launch-upgrader.py resume continue the opencode session from where it stalled (-s <id> --continue)
|
||||||
|
launch-upgrader.py babysit watch + auto-resume the opencode run across usage-limit (429) stalls
|
||||||
|
|
||||||
Backend: {BACKEND} (LOOP_BACKEND or UPGRADER_BACKEND env var)
|
Backend: {BACKEND} (LOOP_BACKEND or UPGRADER_BACKEND env var)
|
||||||
Model: {MODEL} (LOOP_MODEL or UPGRADER_MODEL env var)
|
Model: {MODEL} (LOOP_MODEL or UPGRADER_MODEL env var)
|
||||||
|
|||||||
Reference in New Issue
Block a user