weekly-run: watchdog resumes on proc-death; supervisor defers to watchdog
Live-testing the resume path surfaced two gaps: (1) an `opencode run` proc
EXITS when the model ends its turn, so a long /upgrade-all run's process dies
repeatedly before the whole run completes — and the log mtime freezes on death,
so the watchdog's log-idle>15min signal is both too slow and unreliable. (2) A
resumed run had no watchdog, so nothing resumed it again the next time its proc died.
- watchdog(): detect PROC-DEATH (no live `opencode run` proc for the session +
not completed) and resume promptly, in addition to log-idle. Guarded by
MAX_RESUMES (default 20, overridable via UPGRADER_MAX_RESUMES; the count resets once
the run is healthy again) so a no-progress loop (e.g. disk-full) eventually hands
off to the supervisor/operator instead of spinning forever.
- resume(): auto-spawn a watchdog if none is alive (skips when the watchdog itself
called resume — it lives in {SESSION}-watchdog — so no duplicate).
- launch-supervisor.py gate: defer while the per-run watchdog is alive (it is the
single writer for prompt-recovery). The supervisor only takes over once the
watchdog gives up (MAX_RESUMES) — i.e. a wedge a bare resume can't fix. Removes
the supervisor/watchdog double-resume race.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01WxbpH3DquKzoSTSwGvGuET
This commit is contained in:
@@ -127,6 +127,11 @@ def _gate():
|
|||||||
if pids or (idle is not None and idle < lu.STALL_MIN):
|
if pids or (idle is not None and idle < lu.STALL_MIN):
|
||||||
via = f"{len(pids)} live run proc(s)" if pids else f"log idle {idle:.0f}m < {lu.STALL_MIN:.0f}m"
|
via = f"{len(pids)} live run proc(s)" if pids else f"log idle {idle:.0f}m < {lu.STALL_MIN:.0f}m"
|
||||||
return False, sid, f"upgrader run progressing ({via}) — leaving it"
|
return False, sid, f"upgrader run progressing ({via}) — leaving it"
|
||||||
|
# The per-run watchdog owns PROMPT recovery (resume on proc-death/stall) and is the single writer
|
||||||
|
# while it lives. Defer to it — it gives up (exits its tmux) only after MAX_RESUMES fail, i.e. the
|
||||||
|
# run is stuck in a way a bare resume can't fix (e.g. disk-full). THEN the supervisor takes over.
|
||||||
|
if lu._watchdog_alive():
|
||||||
|
return False, sid, "per-run watchdog alive — it owns recovery; supervisor stays back"
|
||||||
if _sup_alive() and _sup_busy():
|
if _sup_alive() and _sup_busy():
|
||||||
return False, sid, "a supervisor agent is already working — skip"
|
return False, sid, "a supervisor agent is already working — skip"
|
||||||
idle_s = f"{idle:.0f}m" if idle is not None else "unknown"
|
idle_s = f"{idle:.0f}m" if idle is not None else "unknown"
|
||||||
|
|||||||
@@ -92,6 +92,10 @@ def session_busy():
|
|||||||
def kill_session():
|
def kill_session():
|
||||||
subprocess.run(["tmux", "kill-session", "-t", SESSION], capture_output=True)
|
subprocess.run(["tmux", "kill-session", "-t", SESSION], capture_output=True)
|
||||||
|
|
||||||
|
def _watchdog_alive():
    """Return True when a tmux session named `{SESSION}-watchdog` currently exists.

    Probes tmux with `has-session`; an exit status of 0 is treated as "alive". The probe's
    output is captured so nothing is printed.
    """
    probe = subprocess.run(
        ["tmux", "has-session", "-t", f"{SESSION}-watchdog"],
        capture_output=True,
    )
    return probe.returncode == 0
|
||||||
|
|
||||||
def prereclaim_cc_ci():
|
def prereclaim_cc_ci():
|
||||||
"""Weekly-run step 0: prune STALE (unused AND older than PRERECLAIM_UNTIL) docker images on the
|
"""Weekly-run step 0: prune STALE (unused AND older than PRERECLAIM_UNTIL) docker images on the
|
||||||
cc-ci server so the run has disk headroom. Keeps recent images (reused this week); only clears
|
cc-ci server so the run has disk headroom. Keeps recent images (reused this week); only clears
|
||||||
@@ -339,7 +343,13 @@ def resume(reason="manual"):
|
|||||||
f"--model '{MODEL}' {share} --attach '{OPENCODE_SERVER}' --dir '{WORKDIR}' \"$(cat '{kf}')\"")
|
f"--model '{MODEL}' {share} --attach '{OPENCODE_SERVER}' --dir '{WORKDIR}' \"$(cat '{kf}')\"")
|
||||||
subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", WORKDIR, cmd])
|
subprocess.run(["tmux", "new-session", "-d", "-s", SESSION, "-c", WORKDIR, cmd])
|
||||||
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_FILE}'"])
|
subprocess.run(["tmux", "pipe-pane", "-o", "-t", SESSION, f"cat >> '{LOG_FILE}'"])
|
||||||
log(f"resume: relaunched {SESSION} (session {sid})"); return True
|
log(f"resume: relaunched {SESSION} (session {sid})")
|
||||||
|
# Every resume must be self-healing: ensure a watchdog is watching this run. Skip if one is
|
||||||
|
# already alive — notably when the watchdog ITSELF called resume (it lives in {SESSION}-watchdog),
|
||||||
|
# so this never spawns a duplicate watchdog-of-a-watchdog.
|
||||||
|
if os.environ.get("UPGRADER_WATCHDOG", "1") == "1" and not _watchdog_alive():
|
||||||
|
_spawn_watchdog()
|
||||||
|
return True
|
||||||
|
|
||||||
def _spawn_watchdog():
|
def _spawn_watchdog():
|
||||||
"""Start the watchdog inside the persistent tmux server (NOT a Popen child). A systemd-timer
|
"""Start the watchdog inside the persistent tmux server (NOT a Popen child). A systemd-timer
|
||||||
@@ -364,28 +374,47 @@ def _spawn_watchdog():
|
|||||||
log(f" watchdog spawned in tmux '{wsess}' — auto-resume on usage-limit stalls (survives the oneshot)")
|
log(f" watchdog spawned in tmux '{wsess}' — auto-resume on usage-limit stalls (survives the oneshot)")
|
||||||
|
|
||||||
def watchdog():
|
def watchdog():
|
||||||
"""Watch the opencode upgrader; on a stall, wait out any usage-limit then resume the session.
|
"""Watch the opencode upgrader and keep it alive to completion. Two stall modes:
|
||||||
Exits when the model prints DONE_MARKER. Spawned by an opencode `start`; also standalone."""
|
(a) PROC-DEATH — `opencode run` exits when the model ENDS ITS TURN (or crashes). For a long
|
||||||
log(f"watchdog: watching {SESSION} (stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
|
autonomous /upgrade-all this happens repeatedly before the whole run is done; the log mtime
|
||||||
misses = 0
|
also freezes, so log-idle alone would take 15min to notice a run that died in 5. We detect
|
||||||
|
it directly: no live `opencode run` proc for the session + not completed ⇒ resume promptly.
|
||||||
|
(b) LOG-IDLE — a proc is alive but wedged (no output > STALL_MIN); resume after confirming.
|
||||||
|
Either way, wait out an opencode-go usage-limit (429) first rather than hammering. Exits when the
|
||||||
|
model prints DONE_MARKER, or after MAX_RESUMES consecutive resumes fail to get a live proc going
|
||||||
|
(truly broken — hand back to the hourly supervisor / operator). Spawned by start()/resume()."""
|
||||||
|
MAX_RESUMES = int(os.environ.get("UPGRADER_MAX_RESUMES", "20"))
|
||||||
|
log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
|
||||||
|
misses = 0; resumes = 0
|
||||||
while True:
|
while True:
|
||||||
_time.sleep(CHECK_EVERY)
|
_time.sleep(CHECK_EVERY)
|
||||||
if _completed():
|
if _completed():
|
||||||
log("watchdog: run completed — exiting"); return
|
log("watchdog: run completed — exiting"); return
|
||||||
|
sid = _session_id()
|
||||||
|
pids = _run_pids(sid) if sid else []
|
||||||
idle = _log_idle_min()
|
idle = _log_idle_min()
|
||||||
if idle is None or idle <= STALL_MIN:
|
dead = not pids
|
||||||
misses = 0; continue
|
stalled = idle is not None and idle > STALL_MIN
|
||||||
# Log has been static > STALL_MIN — the run is stalled (limit or wedge), not working.
|
if not dead and not stalled:
|
||||||
|
misses = 0; resumes = 0; continue # alive + producing output — healthy
|
||||||
|
# Something's wrong (dead or wedged). Wait out a usage-limit before touching it.
|
||||||
retry = _limit_retry_after()
|
retry = _limit_retry_after()
|
||||||
if retry > 0:
|
if retry > 0:
|
||||||
wait = min(retry + 30, 3600)
|
wait = min(retry + 30, 3600)
|
||||||
log(f"watchdog: stalled {idle:.0f}min + usage-limited (retry-after {retry}s) — waiting {wait}s")
|
log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited "
|
||||||
|
f"(retry-after {retry}s) — waiting {wait}s")
|
||||||
_time.sleep(wait); continue
|
_time.sleep(wait); continue
|
||||||
# Not limited but stalled — confirm it's really wedged (two consecutive misses), then resume.
|
if not dead:
|
||||||
misses += 1
|
# Alive but log-static — confirm it's really wedged (two consecutive misses) before acting.
|
||||||
if misses >= 2:
|
misses += 1
|
||||||
log(f"watchdog: stalled {idle:.0f}min, limit clear — auto-resuming")
|
if misses < 2:
|
||||||
resume("watchdog auto-resume"); misses = 0
|
continue
|
||||||
|
if resumes >= MAX_RESUMES:
|
||||||
|
log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)")
|
||||||
|
return
|
||||||
|
why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear"
|
||||||
|
log(f"watchdog: {why} — auto-resuming (#{resumes + 1})")
|
||||||
|
resume("watchdog auto-resume"); resumes += 1; misses = 0
|
||||||
|
|
||||||
# ── main ──────────────────────────────────────────────────────────────────────
|
# ── main ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user