feat(watchdog): DONE-nudge for ceremony-lag (built-but-unmarked phase) before kill+reboot

Recurring stall: a phase is substantively complete (all DoD gates PASS from both
adversaries, no veto) but the builder never writes the done marker, so auto-advance
cannot fire and the loops idle. A blunt stall kill+reboot does not fix it (the
re-kicked-off agent just re-idles).

On a stall, if the agent is a loop agent and the current phase is NOT marked done,
send a one-time DONE-nudge (ping) telling it to write the done marker IF the DoD is
met (both adversaries PASS, no veto), giving a fresh idle window; only escalate to
the kill+reboot if it stays stalled. One nudge per phase (cleared on phase advance).
Gated by [loop].done_nudge (default true); message uses the configured done_marker
and the phase status file.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01UWTdUq2bsic7JZGqJp3nD6
This commit is contained in:
2026-06-24 02:40:14 +00:00
parent e6b53513d4
commit 44bb1da1be

View File

@ -466,6 +466,18 @@ def limit_tick(cfg, agent, pane):
# ── stall detection ──────────────────────────────────────────────────────────────
# Per-session idle bookkeeping for stall detection, keyed by session. stall_check_one sets an entry to
# `now` after sending a DONE-nudge (to open a fresh idle window) and to 0.0 after a kill+reboot;
# presumably it also records when the pane was first seen idle — confirm against the rest of the file.
_idle_since: dict[str, float] = {}
# Set to True for a session once stall_check_one has sent it the DONE-nudge; the whole dict is cleared
# in phase_advance_check so each new phase gets a fresh nudge budget.
_done_nudged: dict[str, bool] = {} # per-session: sent the one-time "write the done marker" nudge this phase
def _done_nudge_msg(cfg, ph):
    """Compose the DONE-nudge text for a stalled loop agent in a not-yet-done phase.

    cfg: watchdog config; reads cfg["loop"]["done_marker"] (falls back to "## DONE")
         and is passed to _state_subdir() to obtain the directory part of the path.
    ph:  mapping describing the current phase; reads "id" (falls back to "") and
         "status" (falls back to "STATUS-<id>.md").

    Returns a single-line string telling the agent to resume and, only if the DoD is
    fully met, to write the done marker into the phase status file and push.
    """
    marker = cfg["loop"].get("done_marker", "## DONE")
    phase_id = ph.get("id", "")
    status_file = ph.get("status", f"STATUS-{phase_id}.md")
    # Where the agent is expected to write the marker: <state subdir>/<status file>.
    status_path = f"{_state_subdir(cfg)}/{status_file}"
    pieces = (
        f"watchdog nudge: you've stalled in phase '{phase_id}', which is NOT yet marked '{marker}'. ",
        "Resume now — pull any pending review/inbox and continue. ",
        "If (and ONLY if) every DoD item has a fresh PASS from BOTH adversaries with no standing veto, ",
        f"write '{marker}' to {status_path} and push, so the phase settles and auto-advances. ",
        "Do not stay idle.",
    )
    return "".join(pieces)
def _pane_last_active(session):
"""Unix timestamp of the tmux window's last activity (last output change), or None.
@ -530,6 +542,17 @@ def stall_check_one(cfg, agent):
if idle < stall_idle:
return
reason = f"idle {int(idle)}s with no WAITING-UNTIL marker"
# Ceremony-lag guard: a loop agent idling in a phase that's built but NOT marked done won't let the
# phase advance (the recurring "all gates PASS but no ## DONE written" stall). Nudge it ONCE per phase
# to finalize (write the done marker if the DoD is met) before falling back to the blunt kill+reboot.
if (cfg["loop"].get("done_nudge", True) and agent.get("kind") == "loop" and phases(cfg)
and not phase_done(cfg, cur_phase(cfg).get("status", "")) and not _done_nudged.get(session)):
log(f"stall: {agent['name']} ({session}) {reason} — DONE-nudge (phase built but not marked done)")
ping_session(session, _done_nudge_msg(cfg, cur_phase(cfg)),
submit_key=backend_of(cfg, agent).get("submit_key", "Enter"))
_done_nudged[session] = True
_idle_since[session] = now # fresh idle window to act on the nudge before reboot escalates
return
log(f"stall: {agent['name']} ({session}) {reason} — kill + reboot")
start_agent(cfg, agent, force=True)
_idle_since[session] = 0.0
@ -900,6 +923,7 @@ def phase_advance_check(cfg):
if marker.exists():
marker.unlink() # resuming into a (freshly-appended) phase — clear stale completion
handoff_reset()
_done_nudged.clear() # fresh DONE-nudge budget for the new phase
start_loops(cfg)
return True
# last phase is DONE → sequence complete