weekly-run: tree-aware idle signal + crash-proof watchdog
More fixes from live-running the finish of the 2026-07-03 run: - _session_idle_min(): measure staleness across the whole session TREE (top-level run + all descendant subagents), via the opencode server time.updated, NOT the tmux log mtime (which freezes when a headless run doesn't stream to the pane). A per-recipe subagent deploy runs 20-40min during which the PARENT session's updated time is stale — reading the parent alone looked 'idle' and would false- resume, killing the productive run. Renamed from _log_idle_min (kept as alias). - watchdog(): wrap each poll in try/except so a transient server blip/race can never kill the watchdog (a dead watchdog silently abandons the run). - watchdog + supervisor now read the tree-aware session idle instead of log mtime. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01WxbpH3DquKzoSTSwGvGuET
This commit is contained in:
@ -123,7 +123,7 @@ def _gate():
|
||||
# misreads a headless `opencode run` as idle). Trust the run PROCESS + the session log's mtime:
|
||||
# a live `opencode run … -s <sid> --attach` proc, or a log touched within the stall window.
|
||||
pids = lu._run_pids(sid)
|
||||
idle = lu._log_idle_min()
|
||||
idle = lu._session_idle_min()
|
||||
if pids or (idle is not None and idle < lu.STALL_MIN):
|
||||
via = f"{len(pids)} live run proc(s)" if pids else f"log idle {idle:.0f}m < {lu.STALL_MIN:.0f}m"
|
||||
return False, sid, f"upgrader run progressing ({via}) — leaving it"
|
||||
|
||||
@ -238,12 +238,47 @@ def _session_id():
|
||||
cands.sort(key=lambda s: (s.get("time") or {}).get("created") or 0, reverse=True)
|
||||
return cands[0]["id"] if cands else None
|
||||
|
||||
def _log_idle_min():
|
||||
def _session_idle_min():
|
||||
"""Minutes since the managed run last ADVANCED — measured across the whole session TREE (the
|
||||
top-level cc-ci-upgrader session AND every descendant subagent). Uses the opencode SERVER's
|
||||
time.updated (authoritative — bumps on every new message/part) rather than the tmux log mtime,
|
||||
which FREEZES when a headless `opencode run --continue` doesn't stream to the pane (that froze
|
||||
signal made the watchdog resume-storm). Crucially it must include CHILDREN: while a per-recipe
|
||||
subagent runs (a deploy can take 20-40min) the PARENT's updated time is stale, so reading the
|
||||
parent alone would look 'idle' and trigger a false resume that kills the productive run. Falls
|
||||
back to the log mtime only when the server is unreachable."""
|
||||
sid = _session_id()
|
||||
if not sid:
|
||||
try:
|
||||
return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0
|
||||
except Exception:
|
||||
return None
|
||||
rows = _server_get("/session") or []
|
||||
rows = rows if isinstance(rows, list) else rows.get("data", [])
|
||||
def _parent(s): return s.get("parentID") or s.get("parentId")
|
||||
# Transitive closure of sid over parent links → the managed session + all its descendants.
|
||||
tree = {sid}
|
||||
for _ in range(8): # bounded fixpoint (subagents nest only a few deep)
|
||||
grew = False
|
||||
for s in rows:
|
||||
if s.get("id") not in tree and _parent(s) in tree:
|
||||
tree.add(s.get("id")); grew = True
|
||||
if not grew:
|
||||
break
|
||||
newest = 0
|
||||
for s in rows:
|
||||
if s.get("id") in tree:
|
||||
newest = max(newest, (s.get("time") or {}).get("updated") or 0)
|
||||
if newest:
|
||||
return max(0.0, (_time.time() * 1000 - newest) / 60000.0)
|
||||
try:
|
||||
return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# Back-compat alias (older callers / the report watchdog import this name).
|
||||
_log_idle_min = _session_idle_min
|
||||
|
||||
def _go_key():
|
||||
try:
|
||||
return (_json.load(open(AUTH_JSON)).get("opencode-go") or {}).get("key")
|
||||
@ -384,37 +419,42 @@ def watchdog():
|
||||
model prints DONE_MARKER, or after MAX_RESUMES consecutive resumes fail to get a live proc going
|
||||
(truly broken — hand back to the hourly supervisor / operator). Spawned by start()/resume()."""
|
||||
MAX_RESUMES = int(os.environ.get("UPGRADER_MAX_RESUMES", "20"))
|
||||
log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
|
||||
log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min session-idle, poll {CHECK_EVERY}s)")
|
||||
misses = 0; resumes = 0
|
||||
while True:
|
||||
_time.sleep(CHECK_EVERY)
|
||||
if _completed():
|
||||
log("watchdog: run completed — exiting"); return
|
||||
sid = _session_id()
|
||||
pids = _run_pids(sid) if sid else []
|
||||
idle = _log_idle_min()
|
||||
dead = not pids
|
||||
stalled = idle is not None and idle > STALL_MIN
|
||||
if not dead and not stalled:
|
||||
misses = 0; resumes = 0; continue # alive + producing output — healthy
|
||||
# Something's wrong (dead or wedged). Wait out a usage-limit before touching it.
|
||||
retry = _limit_retry_after()
|
||||
if retry > 0:
|
||||
wait = min(retry + 30, 3600)
|
||||
log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited "
|
||||
f"(retry-after {retry}s) — waiting {wait}s")
|
||||
_time.sleep(wait); continue
|
||||
if not dead:
|
||||
# Alive but log-static — confirm it's really wedged (two consecutive misses) before acting.
|
||||
misses += 1
|
||||
if misses < 2:
|
||||
continue
|
||||
if resumes >= MAX_RESUMES:
|
||||
log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)")
|
||||
return
|
||||
why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear"
|
||||
log(f"watchdog: {why} — auto-resuming (#{resumes + 1})")
|
||||
resume("watchdog auto-resume"); resumes += 1; misses = 0
|
||||
# A transient error (server blip, race) must NEVER kill the watchdog — that would silently
|
||||
# abandon the run. Log and keep polling instead of letting the exception exit the loop.
|
||||
try:
|
||||
if _completed():
|
||||
log("watchdog: run completed — exiting"); return
|
||||
sid = _session_id()
|
||||
pids = _run_pids(sid) if sid else []
|
||||
idle = _session_idle_min()
|
||||
dead = not pids
|
||||
stalled = idle is not None and idle > STALL_MIN
|
||||
if not dead and not stalled:
|
||||
misses = 0; resumes = 0; continue # alive + session advancing — healthy
|
||||
# Something's wrong (dead or wedged). Wait out a usage-limit before touching it.
|
||||
retry = _limit_retry_after()
|
||||
if retry > 0:
|
||||
wait = min(retry + 30, 3600)
|
||||
log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited "
|
||||
f"(retry-after {retry}s) — waiting {wait}s")
|
||||
_time.sleep(wait); continue
|
||||
if not dead:
|
||||
# Alive but session not advancing — confirm really wedged (two misses) before acting.
|
||||
misses += 1
|
||||
if misses < 2:
|
||||
continue
|
||||
if resumes >= MAX_RESUMES:
|
||||
log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)")
|
||||
return
|
||||
why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear"
|
||||
log(f"watchdog: {why} — auto-resuming (#{resumes + 1})")
|
||||
resume("watchdog auto-resume"); resumes += 1; misses = 0
|
||||
except Exception as e:
|
||||
log(f"watchdog: poll error ({type(e).__name__}: {e}) — continuing")
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
Reference in New Issue
Block a user