watchdog: cover all parts of the weekly run + survive the systemd oneshot

Two gaps for the scheduled Thursday glm-5.2 run:
1. Survival: the watchdog was a Popen child of the Type=oneshot service, so systemd's
   cgroup cleanup killed it as soon as the service exited. Spawn it under the persistent
   tmux server instead (_spawn_watchdog), like the run sessions, so it survives the oneshot.
2. Coverage: the report runs on glm-5.2 and shares the same opencode-go budget the upgrade
   run drains, so it can stall on a 429 with no recovery. launch-report.py now spawns the
   SAME watchdog pointed at the cc-ci-report session (made generic via the UPGRADER_SESSION,
   UPGRADER_MODEL, UPGRADER_DONE_MARKER and UPGRADER_RESUME_FILE env vars), with a
   report-specific resume prompt.

Also: _run_pids() is now scoped to the managed session (title or -s <sid>) so the
report watchdog can't kill the idle upgrader process and vice-versa; resume() adds
--dir and honors a custom resume prompt file.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
autonomic-bot
2026-06-23 02:42:50 +00:00
parent 5a6c62e36c
commit f94be45f9c
2 changed files with 83 additions and 25 deletions

View File

@ -87,8 +87,35 @@ def start(mode, date):
log(f"starting {SESSION} (backend={BACKEND}, model={MODEL}, date={date or 'today'})")
subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", cwd, cmd])
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_DIR}/{SESSION}.log'"])
if BACKEND == "opencode" and OPENCODE_SHARE:
log(f" attached to {OPENCODE_SERVER} → http://oc.commoninternet.net +public --share link")
if BACKEND == "opencode":
if OPENCODE_SHARE:
log(f" attached to {OPENCODE_SERVER} → http://oc.commoninternet.net +public --share link")
# Watchdog for the report too: it runs on glm-5.2 sharing the same opencode-go budget the
# upgrade run just drained, so a 429 stall is likely. Reuse launch-upgrader.py's watchdog,
# pointed at THIS (cc-ci-report) session with a report-specific marker + resume prompt. It
# runs under the tmux server (survives the systemd oneshot like the run sessions).
if os.environ.get("REPORT_WATCHDOG", "1") == "1":
import shlex
LU = "/srv/cc-ci/cc-ci-plan/launch-upgrader.py"
rf = Path(LOG_DIR) / f".kickoff-{SESSION}-resume.txt"
rf.write_text(
"The opencode-go usage limit has reset (or the report stalled). CONTINUE generating and "
"publishing this week's public Recipe Report per /recipe-report: survey → write the spec "
"JSON → render with recipe-report.py → publish. Do NOT hand-write HTML (render() owns all "
"formatting). When the page is live, print 'RECIPE REPORT COMPLETE' and go idle.")
wenv = {"HOME": os.environ.get("HOME") or "/home/loops",
"UPGRADER_SESSION": SESSION, "UPGRADER_DIR": cwd, "LOG_DIR": LOG_DIR,
"UPGRADER_BACKEND": "opencode", "UPGRADER_MODEL": MODEL,
"OPENCODE_BIN": OPENCODE_BIN, "OPENCODE_SERVER": OPENCODE_SERVER,
"OPENCODE_SHARE": "1" if OPENCODE_SHARE else "0",
"UPGRADER_DONE_MARKER": "RECIPE REPORT COMPLETE",
"UPGRADER_RESUME_FILE": str(rf)}
envstr = " ".join(f"{k}={shlex.quote(str(v))}" for k, v in wenv.items())
wlog = f"{LOG_DIR}/{SESSION}-watchdog.log"
wcmd = f"env {envstr} python3 {shlex.quote(LU)} watchdog >> {shlex.quote(wlog)} 2>&1"
subprocess.run(["tmux", "kill-session", "-t", f"{SESSION}-watchdog"], capture_output=True)
subprocess.run(["tmux", "new-session", "-d", "-s", f"{SESSION}-watchdog", "-c", cwd, wcmd])
log(f" report watchdog spawned in tmux '{SESSION}-watchdog' — auto-resume on usage-limit stalls")
log(f"started. attach: tmux attach -t {SESSION}")

View File

@ -162,13 +162,10 @@ def start(mode="use-or-create"):
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION,
f"cat >> '{LOG_DIR}/{SESSION}.log'"])
log(f"started. attach: tmux attach -t {SESSION} log: {LOG_DIR}/{SESSION}.log")
# For the opencode backend, spawn a detached watchdog that auto-resumes the run if the
# opencode-go usage-limit (429) stalls it mid-run (it does NOT self-resume). See watchdog().
# For the opencode backend, spawn a watchdog that auto-resumes the run if the opencode-go
# usage-limit (429) stalls it mid-run (it does NOT self-resume). See watchdog().
if BACKEND == "opencode" and os.environ.get("UPGRADER_WATCHDOG", "1") == "1":
subprocess.Popen(["python3", os.path.realpath(__file__), "watchdog"],
stdout=open(f"{LOG_DIR}/{SESSION}-watchdog.log", "a"),
stderr=subprocess.STDOUT, start_new_session=True)
log(" watchdog spawned — auto-resume on usage-limit stalls")
_spawn_watchdog()
# ── opencode stall-detect + auto-resume watchdog ────────────────────────────────
# The opencode-go subscription enforces a rolling usage-limit (HTTP 429 + retry-after). When it
@ -179,7 +176,10 @@ import json as _json, urllib.request as _ureq, time as _time
STALL_MIN = float(os.environ.get("UPGRADER_STALL_MIN", "15")) # log-idle minutes ⇒ stalled
CHECK_EVERY = int(os.environ.get("UPGRADER_CHECK_SEC", "180")) # watchdog poll cadence
DONE_MARKER = "UPGRADE RUN COMPLETE"
# Generic so the SAME watchdog also covers the report job (launch-report.py points it at the
# cc-ci-report session with its own marker + resume prompt via these env vars).
DONE_MARKER = os.environ.get("UPGRADER_DONE_MARKER", "UPGRADE RUN COMPLETE")
RESUME_FILE = os.environ.get("UPGRADER_RESUME_FILE") # optional path to a custom resume prompt
GO_ENDPOINT = "https://opencode.ai/zen/go/v1/chat/completions"
AUTH_JSON = os.path.expanduser("~/.local/share/opencode/auth.json")
LOG_FILE = f"{LOG_DIR}/{SESSION}.log"
@ -230,8 +230,8 @@ def _limit_retry_after():
except Exception:
return 0
def _run_pids():
"""PIDs of live `opencode run` procs (via /proc scan — never matches this process)."""
def _run_pids(sid=None):
"""PIDs of live `opencode run` procs for THIS session (via /proc scan — never matches self)."""
me, out = os.getpid(), []
for p in os.listdir("/proc"):
if not p.isdigit() or int(p) == me:
@ -240,7 +240,13 @@ def _run_pids():
cl = open(f"/proc/{p}/cmdline", "rb").read().split(b"\0")
except Exception:
continue
if b"opencode" in (b" ".join(cl)) and b"run" in cl and b"--attach" in cl:
joined = b" ".join(cl)
if not (b"opencode" in joined and b"run" in cl and b"--attach" in cl):
continue
# Scope to THIS managed session only: a fresh run carries `--title <SESSION>`, a resumed
# run carries `-s <sid>`. Without this, the report watchdog would kill the idle upgrader
# run (and vice-versa) since both are `opencode run … --attach`.
if SESSION.encode() in joined or (sid and sid.encode() in joined):
out.append(int(p))
return out
@ -275,31 +281,56 @@ def resume(reason="manual"):
if not sid:
log(f"resume: no top-level '{SESSION}' session on {OPENCODE_SERVER} — cannot resume"); return False
log(f"resume ({reason}): continuing session {sid}")
for pid in _run_pids():
for pid in _run_pids(sid):
try: os.kill(pid, signal.SIGTERM)
except Exception: pass
_time.sleep(2); kill_session(); _time.sleep(1)
kf = Path(LOG_DIR) / f".kickoff-{SESSION}-resume.txt"
kf.write_text(
"The opencode-go usage limit has reset (or the run stalled). You were mid-way through the weekly "
"cc-ci /upgrade-all run. CONTINUE from where you left off — do NOT start over. Process the enrolled "
"recipes not yet done this week, alphabetically; SKIP ones already done (their PRs exist — extend, "
"never duplicate). Per recipe: run /recipe-upgrade in DEFAULT mode via a subagent, verify with "
"!testme, open/extend the recipe PR (NEVER merge, NEVER weaken a test), <= DRONE_RUNNER_CAPACITY "
"concurrent. immich has a tag+digest image abra can't parse — do the upstream-direct cross-check "
"(recipe-upgrade SKILL §1), don't silently skip it. When all remaining recipes are done: "
"write+push the weekly summary, then `python3 /srv/cc-ci/cc-ci-plan/launch-report.py fresh`, print "
"'" + DONE_MARKER + "', and go idle.")
if RESUME_FILE and os.path.exists(RESUME_FILE):
kf.write_text(open(RESUME_FILE).read()) # caller-supplied (e.g. the report job)
else:
kf.write_text(
"The opencode-go usage limit has reset (or the run stalled). You were mid-way through the weekly "
"cc-ci /upgrade-all run. CONTINUE from where you left off — do NOT start over. Process the enrolled "
"recipes not yet done this week, alphabetically; SKIP ones already done (their PRs exist — extend, "
"never duplicate). Per recipe: run /recipe-upgrade in DEFAULT mode via a subagent, verify with "
"!testme, open/extend the recipe PR (NEVER merge, NEVER weaken a test), <= DRONE_RUNNER_CAPACITY "
"concurrent. immich has a tag+digest image abra can't parse — do the upstream-direct cross-check "
"(recipe-upgrade SKILL §1), don't silently skip it. When all remaining recipes are done: "
"write+push the weekly summary, then `python3 /srv/cc-ci/cc-ci-plan/launch-report.py fresh`, print "
"'" + DONE_MARKER + "', and go idle.")
share = "--share" if OPENCODE_SHARE else ""
cmd = (f"set -a; . /srv/cc-ci/.testenv; set +a; {OPENCODE_BIN} run -s {sid} --continue "
f"--model '{MODEL}' {share} --attach '{OPENCODE_SERVER}' \"$(cat '{kf}')\"")
f"--model '{MODEL}' {share} --attach '{OPENCODE_SERVER}' --dir '{WORKDIR}' \"$(cat '{kf}')\"")
subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", WORKDIR, cmd])
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_FILE}'"])
log(f"resume: relaunched {SESSION} (session {sid})"); return True
def _spawn_watchdog():
    """Start the watchdog inside the persistent tmux server (NOT a Popen child).

    A systemd-timer `start` is a Type=oneshot whose cgroup is reaped on exit, which would kill a
    Popen child; a tmux session lives under the long-running tmux server and survives. Env is
    passed explicitly (as an `env VAR=… python3 … watchdog` command line) so the watchdog gets
    THIS run's config regardless of the tmux server's ambient environment.

    Any existing `<SESSION>-watchdog` tmux session is killed first so only one watchdog runs per
    managed session. Returns None; the outcome of the tmux launch is reported via log() only.
    """
    import shlex

    wsess = f"{SESSION}-watchdog"
    wlog = f"{LOG_DIR}/{SESSION}-watchdog.log"

    # Fixed part of the watchdog's environment: identifies the session it manages and how to
    # reach the opencode server.
    env = {
        "HOME": os.environ.get("HOME") or os.path.expanduser("~"),
        "UPGRADER_SESSION": SESSION,
        "UPGRADER_DIR": WORKDIR,
        "LOG_DIR": LOG_DIR,
        "UPGRADER_BACKEND": "opencode",
        "UPGRADER_MODEL": MODEL,
        "OPENCODE_BIN": OPENCODE_BIN,
        "OPENCODE_SERVER": OPENCODE_SERVER,
        "OPENCODE_SHARE": "1" if OPENCODE_SHARE else "0",
    }
    # Optional tuning knobs: forwarded only when set (and non-empty) in our own environment.
    for k in ("UPGRADER_RESUME_FILE", "UPGRADER_DONE_MARKER", "UPGRADER_STALL_MIN", "UPGRADER_CHECK_SEC"):
        val = os.environ.get(k)
        if val:
            env[k] = val

    # tmux hands this string to a shell, so every value and path is shell-quoted.
    envstr = " ".join(f"{k}={shlex.quote(str(v))}" for k, v in env.items())
    cmd = f"env {envstr} python3 {shlex.quote(os.path.realpath(__file__))} watchdog >> {shlex.quote(wlog)} 2>&1"

    # Best-effort: replace a stale watchdog session if one exists (a failure here just means
    # there was nothing to kill).
    subprocess.run(["tmux", "kill-session", "-t", wsess], capture_output=True)
    proc = subprocess.run(["tmux", "new-session", "-d", "-s", wsess, "-c", WORKDIR, cmd],
                          capture_output=True, text=True)
    if proc.returncode != 0:
        # Do not claim success when tmux refused to create the session: the run would then be
        # unprotected against usage-limit stalls without anyone noticing.
        log(f" watchdog NOT spawned: tmux new-session '{wsess}' failed "
            f"(rc={proc.returncode}): {(proc.stderr or '').strip()}")
        return
    log(f" watchdog spawned in tmux '{wsess}' — auto-resume on usage-limit stalls (survives the oneshot)")
def watchdog():
"""Watch the opencode upgrader; on a stall, wait out any usage-limit then resume the session.
Exits when the run prints UPGRADE RUN COMPLETE. Spawned by an opencode `start`; also standalone."""
Exits when the model prints DONE_MARKER. Spawned by an opencode `start`; also standalone."""
log(f"watchdog: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
misses = 0
while True: