weekly-run: tree-aware idle signal + crash-proof watchdog

More fixes from live-running the finish of the 2026-07-03 run: - _session_idle_min(): measure staleness across the whole session TREE (top-level run + all descendant subagents), via the opencode server time.updated, NOT the tmux log mtime (which freezes when a headless run doesn't stream to the pane). A per-recipe subagent deploy runs 20-40min during which the PARENT session's updated time is stale — reading the parent alone looked 'idle' and would false- resume, killing the productive run. Renamed from _log_idle_min (kept as alias). - watchdog(): wrap each poll in try/except so a transient server blip/race can never kill the watchdog (a dead watchdog silently abandons the run). - watchdog + supervisor now read the tree-aware session idle instead of log mtime. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01WxbpH3DquKzoSTSwGvGuET
2026-07-04 05:04:32 +00:00
parent 399e999978
commit 69dd17833a
2 changed files with 70 additions and 30 deletions
--- a/cc-ci-plan/launch-supervisor.py
+++ b/cc-ci-plan/launch-supervisor.py
@ -123,7 +123,7 @@ def _gate():
    # misreads a headless `opencode run` as idle). Trust the run PROCESS + the session log's mtime:
    # a live `opencode run … -s <sid> --attach` proc, or a log touched within the stall window.
    pids = lu._run_pids(sid)
-    idle = lu._log_idle_min()
+    idle = lu._session_idle_min()
    if pids or (idle is not None and idle < lu.STALL_MIN):
        via = f"{len(pids)} live run proc(s)" if pids else f"log idle {idle:.0f}m < {lu.STALL_MIN:.0f}m"
        return False, sid, f"upgrader run progressing ({via}) — leaving it"
--- a/cc-ci-plan/launch-upgrader.py
+++ b/cc-ci-plan/launch-upgrader.py
@ -238,12 +238,47 @@ def _session_id():
    cands.sort(key=lambda s: (s.get("time") or {}).get("created") or 0, reverse=True)
    return cands[0]["id"] if cands else None

-def _log_idle_min():
+def _session_idle_min():
+    """Minutes since the managed run last ADVANCED — measured across the whole session TREE (the
+    top-level cc-ci-upgrader session AND every descendant subagent). Uses the opencode SERVER's
+    time.updated (authoritative — bumps on every new message/part) rather than the tmux log mtime,
+    which FREEZES when a headless `opencode run --continue` doesn't stream to the pane (that froze
+    signal made the watchdog resume-storm). Crucially it must include CHILDREN: while a per-recipe
+    subagent runs (a deploy can take 20-40min) the PARENT's updated time is stale, so reading the
+    parent alone would look 'idle' and trigger a false resume that kills the productive run. Falls
+    back to the log mtime only when the server is unreachable."""
+    sid = _session_id()
+    if not sid:
+        try:
+            return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0
+        except Exception:
+            return None
+    rows = _server_get("/session") or []
+    rows = rows if isinstance(rows, list) else rows.get("data", [])
+    def _parent(s): return s.get("parentID") or s.get("parentId")
+    # Transitive closure of sid over parent links → the managed session + all its descendants.
+    tree = {sid}
+    for _ in range(8):  # bounded fixpoint (subagents nest only a few deep)
+        grew = False
+        for s in rows:
+            if s.get("id") not in tree and _parent(s) in tree:
+                tree.add(s.get("id")); grew = True
+        if not grew:
+            break
+    newest = 0
+    for s in rows:
+        if s.get("id") in tree:
+            newest = max(newest, (s.get("time") or {}).get("updated") or 0)
+    if newest:
+        return max(0.0, (_time.time() * 1000 - newest) / 60000.0)
    try:
        return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0
    except Exception:
        return None

+# Back-compat alias (older callers / the report watchdog import this name).
+_log_idle_min = _session_idle_min
+
 def _go_key():
    try:
        return (_json.load(open(AUTH_JSON)).get("opencode-go") or {}).get("key")
@ -384,37 +419,42 @@ def watchdog():
    model prints DONE_MARKER, or after MAX_RESUMES consecutive resumes fail to get a live proc going
    (truly broken — hand back to the hourly supervisor / operator). Spawned by start()/resume()."""
    MAX_RESUMES = int(os.environ.get("UPGRADER_MAX_RESUMES", "20"))
-    log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)")
+    log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min session-idle, poll {CHECK_EVERY}s)")
    misses = 0; resumes = 0
    while True:
        _time.sleep(CHECK_EVERY)
-        if _completed():
-            log("watchdog: run completed — exiting"); return
-        sid = _session_id()
-        pids = _run_pids(sid) if sid else []
-        idle = _log_idle_min()
-        dead = not pids
-        stalled = idle is not None and idle > STALL_MIN
-        if not dead and not stalled:
-            misses = 0; resumes = 0; continue          # alive + producing output — healthy
-        # Something's wrong (dead or wedged). Wait out a usage-limit before touching it.
-        retry = _limit_retry_after()
-        if retry > 0:
-            wait = min(retry + 30, 3600)
-            log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited "
-                f"(retry-after {retry}s) — waiting {wait}s")
-            _time.sleep(wait); continue
-        if not dead:
-            # Alive but log-static — confirm it's really wedged (two consecutive misses) before acting.
-            misses += 1
-            if misses < 2:
-                continue
-        if resumes >= MAX_RESUMES:
-            log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)")
-            return
-        why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear"
-        log(f"watchdog: {why} — auto-resuming (#{resumes + 1})")
-        resume("watchdog auto-resume"); resumes += 1; misses = 0
+        # A transient error (server blip, race) must NEVER kill the watchdog — that would silently
+        # abandon the run. Log and keep polling instead of letting the exception exit the loop.
+        try:
+            if _completed():
+                log("watchdog: run completed — exiting"); return
+            sid = _session_id()
+            pids = _run_pids(sid) if sid else []
+            idle = _session_idle_min()
+            dead = not pids
+            stalled = idle is not None and idle > STALL_MIN
+            if not dead and not stalled:
+                misses = 0; resumes = 0; continue      # alive + session advancing — healthy
+            # Something's wrong (dead or wedged). Wait out a usage-limit before touching it.
+            retry = _limit_retry_after()
+            if retry > 0:
+                wait = min(retry + 30, 3600)
+                log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited "
+                    f"(retry-after {retry}s) — waiting {wait}s")
+                _time.sleep(wait); continue
+            if not dead:
+                # Alive but session not advancing — confirm really wedged (two misses) before acting.
+                misses += 1
+                if misses < 2:
+                    continue
+            if resumes >= MAX_RESUMES:
+                log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)")
+                return
+            why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear"
+            log(f"watchdog: {why} — auto-resuming (#{resumes + 1})")
+            resume("watchdog auto-resume"); resumes += 1; misses = 0
+        except Exception as e:
+            log(f"watchdog: poll error ({type(e).__name__}: {e}) — continuing")

 # ── main ──────────────────────────────────────────────────────────────────────