feat(watchdog): DONE-nudge for ceremony-lag (built-but-unmarked phase) before kill+reboot

Recurring stall: a phase is substantively complete (all DoD gates PASS from both adversaries, no veto) but the builder never writes the done marker, so auto-advance cannot fire and the loops idle. A blunt stall kill+reboot does not fix it (the re-kicked-off agent just goes idle again). On a stall, if the agent is a loop agent and the current phase is NOT marked done, send a one-time DONE-nudge (ping) telling it to write the done marker IF the DoD is met (both adversaries PASS, no veto), giving it a fresh idle window; only escalate to the kill+reboot if it stays stalled. One nudge per session per phase (cleared on phase advance). Gated by [loop].done_nudge (default true); the message uses the configured done_marker and the phase status file.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01UWTdUq2bsic7JZGqJp3nD6
2026-06-24 02:40:14 +00:00
parent e6b53513d4
commit 44bb1da1be
1 changed files with 24 additions and 0 deletions
--- a/agents.py
+++ b/agents.py
@ -466,6 +466,18 @@ def limit_tick(cfg, agent, pane):
 # ── stall detection ──────────────────────────────────────────────────────────────

 _idle_since: dict[str, float] = {}
+_done_nudged: dict[str, bool] = {}   # per-session: sent the one-time "write the done marker" nudge this phase
+
+def _done_nudge_msg(cfg, ph):  # cfg: config mapping (reads cfg["loop"]); ph: phase dict (the caller passes cur_phase(cfg))
+    """Return the DONE-nudge text for a stalled loop agent in an unmarked phase; the DoD check is left to the agent."""
+    dm = cfg["loop"].get("done_marker", "## DONE")   # configured done-marker text; "## DONE" when unset
+    pid = ph.get("id", "")                            # phase id quoted in the message ("" when absent)
+    status = ph.get("status", f"STATUS-{pid}.md")     # status file name; falls back to STATUS-<id>.md
+    sub = _state_subdir(cfg)                          # path prefix for the status file (presumably the state dir; confirm)
+    return (f"watchdog nudge: you've stalled in phase '{pid}', which is NOT yet marked '{dm}'. Resume now "
+            f"— pull any pending review/inbox and continue. If (and ONLY if) every DoD item has a fresh "
+            f"PASS from BOTH adversaries with no standing veto, write '{dm}' to {sub}/{status} and push, "
+            f"so the phase settles and auto-advances. Do not stay idle.")

 def _pane_last_active(session):
    """Unix timestamp of the tmux window's last activity (last output change), or None.
@ -530,6 +542,17 @@ def stall_check_one(cfg, agent):
        if idle < stall_idle:
            return
        reason = f"idle {int(idle)}s with no WAITING-UNTIL marker"
+    # Ceremony-lag guard: a loop agent idling in a phase that's built but NOT marked done won't let the
+    # phase advance (the recurring "all gates PASS but no ## DONE written" stall). Nudge it ONCE per phase
+    # to finalize (write the done marker if the DoD is met) before falling back to the blunt kill+reboot.
+    if (cfg["loop"].get("done_nudge", True) and agent.get("kind") == "loop" and phases(cfg)
+            and not phase_done(cfg, cur_phase(cfg).get("status", "")) and not _done_nudged.get(session)):
+        log(f"stall: {agent['name']} ({session}) {reason} — DONE-nudge (phase built but not marked done)")
+        ping_session(session, _done_nudge_msg(cfg, cur_phase(cfg)),
+                     submit_key=backend_of(cfg, agent).get("submit_key", "Enter"))
+        _done_nudged[session] = True
+        _idle_since[session] = now      # fresh idle window to act on the nudge before reboot escalates
+        return
    log(f"stall: {agent['name']} ({session}) {reason} — kill + reboot")
    start_agent(cfg, agent, force=True)
    _idle_since[session] = 0.0
@ -900,6 +923,7 @@ def phase_advance_check(cfg):
        if marker.exists():
            marker.unlink()   # resuming into a (freshly-appended) phase — clear stale completion
        handoff_reset()
+        _done_nudged.clear()   # fresh DONE-nudge budget for the new phase
        start_loops(cfg)
        return True
    # last phase is DONE → sequence complete