From 69dd17833a603085120b21f66ec1151502ef7006 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Sat, 4 Jul 2026 05:04:32 +0000 Subject: [PATCH] weekly-run: tree-aware idle signal + crash-proof watchdog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit More fixes from live-running the finish of the 2026-07-03 run: - _session_idle_min(): measure staleness across the whole session TREE (top-level run + all descendant subagents), via the opencode server time.updated, NOT the tmux log mtime (which freezes when a headless run doesn't stream to the pane). A per-recipe subagent deploy runs 20-40min during which the PARENT session's updated time is stale — reading the parent alone looked 'idle' and would false- resume, killing the productive run. Renamed from _log_idle_min (kept as alias). - watchdog(): wrap each poll in try/except so a transient server blip/race can never kill the watchdog (a dead watchdog silently abandons the run). - watchdog + supervisor now read the tree-aware session idle instead of log mtime. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01WxbpH3DquKzoSTSwGvGuET --- cc-ci-plan/launch-supervisor.py | 2 +- cc-ci-plan/launch-upgrader.py | 98 +++++++++++++++++++++++---------- 2 files changed, 70 insertions(+), 30 deletions(-) diff --git a/cc-ci-plan/launch-supervisor.py b/cc-ci-plan/launch-supervisor.py index 70ac72c..27efdaf 100644 --- a/cc-ci-plan/launch-supervisor.py +++ b/cc-ci-plan/launch-supervisor.py @@ -123,7 +123,7 @@ def _gate(): # misreads a headless `opencode run` as idle). Trust the run PROCESS + the session log's mtime: # a live `opencode run … -s --attach` proc, or a log touched within the stall window. pids = lu._run_pids(sid) - idle = lu._log_idle_min() + idle = lu._session_idle_min() if pids or (idle is not None and idle < lu.STALL_MIN): via = f"{len(pids)} live run proc(s)" if pids else f"log idle {idle:.0f}m < {lu.STALL_MIN:.0f}m" return False, sid, f"upgrader run progressing ({via}) — leaving it" diff --git a/cc-ci-plan/launch-upgrader.py b/cc-ci-plan/launch-upgrader.py index a4fd360..a4eeb03 100644 --- a/cc-ci-plan/launch-upgrader.py +++ b/cc-ci-plan/launch-upgrader.py @@ -238,12 +238,47 @@ def _session_id(): cands.sort(key=lambda s: (s.get("time") or {}).get("created") or 0, reverse=True) return cands[0]["id"] if cands else None -def _log_idle_min(): +def _session_idle_min(): + """Minutes since the managed run last ADVANCED — measured across the whole session TREE (the + top-level cc-ci-upgrader session AND every descendant subagent). Uses the opencode SERVER's + time.updated (authoritative — bumps on every new message/part) rather than the tmux log mtime, + which FREEZES when a headless `opencode run --continue` doesn't stream to the pane (that froze + signal made the watchdog resume-storm). Crucially it must include CHILDREN: while a per-recipe + subagent runs (a deploy can take 20-40min) the PARENT's updated time is stale, so reading the + parent alone would look 'idle' and trigger a false resume that kills the productive run. Falls + back to the log mtime only when the server is unreachable.""" + sid = _session_id() + if not sid: + try: + return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0 + except Exception: + return None + rows = _server_get("/session") or [] + rows = rows if isinstance(rows, list) else rows.get("data", []) + def _parent(s): return s.get("parentID") or s.get("parentId") + # Transitive closure of sid over parent links → the managed session + all its descendants. + tree = {sid} + for _ in range(8): # bounded fixpoint (subagents nest only a few deep) + grew = False + for s in rows: + if s.get("id") not in tree and _parent(s) in tree: + tree.add(s.get("id")); grew = True + if not grew: + break + newest = 0 + for s in rows: + if s.get("id") in tree: + newest = max(newest, (s.get("time") or {}).get("updated") or 0) + if newest: + return max(0.0, (_time.time() * 1000 - newest) / 60000.0) try: return (_time.time() - os.path.getmtime(LOG_FILE)) / 60.0 except Exception: return None +# Back-compat alias (older callers / the report watchdog import this name). +_log_idle_min = _session_idle_min + def _go_key(): try: return (_json.load(open(AUTH_JSON)).get("opencode-go") or {}).get("key") @@ -384,37 +419,42 @@ def watchdog(): model prints DONE_MARKER, or after MAX_RESUMES consecutive resumes fail to get a live proc going (truly broken — hand back to the hourly supervisor / operator). Spawned by start()/resume().""" MAX_RESUMES = int(os.environ.get("UPGRADER_MAX_RESUMES", "20")) - log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min log-idle, poll {CHECK_EVERY}s)") + log(f"watchdog: watching {SESSION} (proc-death + stall>{STALL_MIN}min session-idle, poll {CHECK_EVERY}s)") misses = 0; resumes = 0 while True: _time.sleep(CHECK_EVERY) - if _completed(): - log("watchdog: run completed — exiting"); return - sid = _session_id() - pids = _run_pids(sid) if sid else [] - idle = _log_idle_min() - dead = not pids - stalled = idle is not None and idle > STALL_MIN - if not dead and not stalled: - misses = 0; resumes = 0; continue # alive + producing output — healthy - # Something's wrong (dead or wedged). Wait out a usage-limit before touching it. - retry = _limit_retry_after() - if retry > 0: - wait = min(retry + 30, 3600) - log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited " - f"(retry-after {retry}s) — waiting {wait}s") - _time.sleep(wait); continue - if not dead: - # Alive but log-static — confirm it's really wedged (two consecutive misses) before acting. - misses += 1 - if misses < 2: - continue - if resumes >= MAX_RESUMES: - log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)") - return - why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear" - log(f"watchdog: {why} — auto-resuming (#{resumes + 1})") - resume("watchdog auto-resume"); resumes += 1; misses = 0 + # A transient error (server blip, race) must NEVER kill the watchdog — that would silently + # abandon the run. Log and keep polling instead of letting the exception exit the loop. + try: + if _completed(): + log("watchdog: run completed — exiting"); return + sid = _session_id() + pids = _run_pids(sid) if sid else [] + idle = _session_idle_min() + dead = not pids + stalled = idle is not None and idle > STALL_MIN + if not dead and not stalled: + misses = 0; resumes = 0; continue # alive + session advancing — healthy + # Something's wrong (dead or wedged). Wait out a usage-limit before touching it. + retry = _limit_retry_after() + if retry > 0: + wait = min(retry + 30, 3600) + log(f"watchdog: {'proc dead' if dead else f'stalled {idle:.0f}min'} + usage-limited " + f"(retry-after {retry}s) — waiting {wait}s") + _time.sleep(wait); continue + if not dead: + # Alive but session not advancing — confirm really wedged (two misses) before acting. + misses += 1 + if misses < 2: + continue + if resumes >= MAX_RESUMES: + log(f"watchdog: {MAX_RESUMES} resumes without completion — giving up (supervisor/operator needed)") + return + why = "run proc exited (turn ended/crashed)" if dead else f"stalled {idle:.0f}min, limit clear" + log(f"watchdog: {why} — auto-resuming (#{resumes + 1})") + resume("watchdog auto-resume"); resumes += 1; misses = 0 + except Exception as e: + log(f"watchdog: poll error ({type(e).__name__}: {e}) — continuing") # ── main ──────────────────────────────────────────────────────────────────────