weekly-run: tree-aware idle signal + crash-proof watchdog

More fixes from live-running the finish of the 2026-07-03 run:
- _session_idle_min(): measure staleness across the whole session TREE (top-level
  run + all descendant subagents), via the opencode server time.updated, NOT the
  tmux log mtime (which freezes when a headless run doesn't stream to the pane).
  A per-recipe subagent deploy runs 20-40min during which the PARENT session's
  updated time is stale — reading the parent alone looked 'idle' and would false-
  resume, killing the productive run. Renamed from _log_idle_min (kept as alias).
- watchdog(): wrap each poll in try/except so a transient server blip/race can
  never kill the watchdog (a dead watchdog silently abandons the run).
- watchdog + supervisor now read the tree-aware session idle instead of log mtime.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01WxbpH3DquKzoSTSwGvGuET
This commit is contained in:
autonomic-bot
2026-07-04 05:04:32 +00:00
parent 399e999978
commit 69dd17833a
2 changed files with 70 additions and 30 deletions

View File

@ -123,7 +123,7 @@ def _gate():
# misreads a headless `opencode run` as idle). Trust the run PROCESS + the session log's mtime:
# a live `opencode run … -s <sid> --attach` proc, or a log touched within the stall window.
pids = lu._run_pids(sid)
idle = lu._log_idle_min()
idle = lu._session_idle_min()
if pids or (idle is not None and idle < lu.STALL_MIN):
via = f"{len(pids)} live run proc(s)" if pids else f"log idle {idle:.0f}m < {lu.STALL_MIN:.0f}m"
return False, sid, f"upgrader run progressing ({via}) — leaving it"

View File

@ -238,12 +238,47 @@ def _session_id():
cands.sort(key=lambda s: (s.get("time") or {}).get("created") or 0, reverse=True)
return cands[0]["id"] if cands else None
def _log_idle_min():
def _session_idle_min():
"""Minutes since the managed run last ADVANCED — measured across the whole session TREE (the
top-level cc-ci-upgrader session AND every descendant subagent). Uses the opencode SERVER's
time.updated (authoritative — bumps on every new message/part) rather than the tmux log mtime,
which FREEZES when a headless `opencode run --continue` doesn't stream to the pane (that froze
signal made the watchdog resume-storm). Crucially it must include CHILDREN: while a per-recipe
subagent runs (a deploy can take 20-40min) the PARENT's updated time is stale, so reading the
parent alone would look 'idle' and trigger a false resume that kills the productive run. Falls
back to the log mtime only when the server is unreachable."""
sid = _session_id()
if not sid:
try:
return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0
except Exception:
return None
rows = _server_get("/session") or []
rows = rows if isinstance(rows, list) else rows.get("data", [])
def _parent(s): return s.get("parentID") or s.get("parentId")
# Transitive closure of sid over parent links → the managed session + all its descendants.
tree = {sid}
for _ in range(8): # bounded fixpoint (subagents nest only a few deep)
grew = False
for s in rows:
if s.get("id") not in tree and _parent(s) in tree:
tree.add(s.get("id")); grew = True
if not grew:
break
newest = 0
for s in rows:
if s.get("id") in tree:
newest = max(newest, (s.get("time") or {}).get("updated") or 0)
if newest:
return max(0.0, (_time.time() * 1000 - newest) / 60000.0)
try:
return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0
except Exception:
return None
# Back-compat alias (older callers / the report watchdog import this name).
_log_idle_min = _session_idle_min
def _go_key():
try:
return (_json.load(open(AUTH_JSON)).get("opencode-go") or {}).get("key")
@ -384,37 +419,42 @@ def watchdog():
model prints DONE_MARKER, or after MAX_RESUMES consecutive resumes fail to get a live proc going
(truly broken — hand back to the hourly supervisor / operator). Spawned by start()/resume()."""
MAX_RESUMES = int(os.environ.get("UPGRADER_MAX_RESUMES", "20"))
log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min session-idle, poll {CHECK_EVERY}s)")
misses = 0; resumes = 0
while True:
_time.sleep(CHECK_EVERY)
if _completed():
log("watchdog: run completed — exiting"); return
sid = _session_id()
pids = _run_pids(sid) if sid else []
idle = _log_idle_min()
dead = not pids
stalled = idle is not None and idle > STALL_MIN
if not dead and not stalled:
misses = 0; resumes = 0; continue # alive + producing output — healthy
# Something's wrong (dead or wedged). Wait out a usage-limit before touching it.
retry = _limit_retry_after()
if retry > 0:
wait = min(retry + 30, 3600)
log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited "
f"(retry-after {retry}s) — waiting {wait}s")
_time.sleep(wait); continue
if not dead:
# Alive but log-static — confirm it's really wedged (two consecutive misses) before acting.
misses += 1
if misses < 2:
continue
if resumes >= MAX_RESUMES:
log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)")
return
why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear"
log(f"watchdog: {why} — auto-resuming (#{resumes + 1})")
resume("watchdog auto-resume"); resumes += 1; misses = 0
# A transient error (server blip, race) must NEVER kill the watchdog — that would silently
# abandon the run. Log and keep polling instead of letting the exception exit the loop.
try:
if _completed():
log("watchdog: run completed — exiting"); return
sid = _session_id()
pids = _run_pids(sid) if sid else []
idle = _session_idle_min()
dead = not pids
stalled = idle is not None and idle > STALL_MIN
if not dead and not stalled:
misses = 0; resumes = 0; continue # alive + session advancing — healthy
# Something's wrong (dead or wedged). Wait out a usage-limit before touching it.
retry = _limit_retry_after()
if retry > 0:
wait = min(retry + 30, 3600)
log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited "
f"(retry-after {retry}s) — waiting {wait}s")
_time.sleep(wait); continue
if not dead:
# Alive but session not advancing — confirm really wedged (two misses) before acting.
misses += 1
if misses < 2:
continue
if resumes >= MAX_RESUMES:
log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)")
return
why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear"
log(f"watchdog: {why} — auto-resuming (#{resumes + 1})")
resume("watchdog auto-resume"); resumes += 1; misses = 0
except Exception as e:
log(f"watchdog: poll error ({type(e).__name__}: {e}) — continuing")
# ── main ──────────────────────────────────────────────────────────────────────