feat(watchdog): DONE-nudge for ceremony-lag (built-but-unmarked phase) before kill+reboot
Recurring stall: a phase is substantively complete (all DoD gates PASS from both adversaries, no veto) but the builder never writes the done marker, so auto-advance cannot fire and the loops idle. A blunt stall kill+reboot does not fix it (the re-kicked-off agent just re-idles). On a stall, if the agent is a loop agent and the current phase is NOT marked done, send a one-time DONE-nudge (ping) telling it to write the done marker IF the DoD is met (both adversaries PASS, no veto), giving a fresh idle window; only escalate to the kill+reboot if it stays stalled. One nudge per phase (cleared on phase advance). Gated by [loop].done_nudge (default true); message uses the configured done_marker and the phase status file. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01UWTdUq2bsic7JZGqJp3nD6
This commit is contained in:
24
agents.py
24
agents.py
@ -466,6 +466,18 @@ def limit_tick(cfg, agent, pane):
|
||||
# ── stall detection ──────────────────────────────────────────────────────────────
|
||||
|
||||
_idle_since: dict[str, float] = {}
|
||||
_done_nudged: dict[str, bool] = {} # per-session: sent the one-time "write the done marker" nudge this phase
|
||||
|
||||
def _done_nudge_msg(cfg, ph):
    """Compose the one-time watchdog message for a stalled loop agent.

    The message names the phase id (``ph["id"]``, default empty), the
    configured done marker (``cfg["loop"]["done_marker"]``, default
    ``## DONE``) and the phase status file (``ph["status"]``, default
    ``STATUS-<id>.md``) located under whatever ``_state_subdir(cfg)`` returns.
    It tells the agent to resume, and to write the marker only when every DoD
    item has a fresh PASS from both adversaries with no standing veto.

    Returns the message as a single string.
    """
    marker = cfg["loop"].get("done_marker", "## DONE")
    phase_id = ph.get("id", "")
    status_file = ph.get("status", f"STATUS-{phase_id}.md")
    state_dir = _state_subdir(cfg)
    # Assemble the sentence fragments in order; each fragment carries its own
    # trailing space so a plain join reproduces the message exactly.
    fragments = (
        f"watchdog nudge: you've stalled in phase '{phase_id}', which is NOT yet marked '{marker}'. Resume now ",
        "— pull any pending review/inbox and continue. If (and ONLY if) every DoD item has a fresh ",
        f"PASS from BOTH adversaries with no standing veto, write '{marker}' to {state_dir}/{status_file} and push, ",
        "so the phase settles and auto-advances. Do not stay idle.",
    )
    return "".join(fragments)
|
||||
|
||||
def _pane_last_active(session):
|
||||
"""Unix timestamp of the tmux window's last activity (last output change), or None.
|
||||
@ -530,6 +542,17 @@ def stall_check_one(cfg, agent):
|
||||
if idle < stall_idle:
|
||||
return
|
||||
reason = f"idle {int(idle)}s with no WAITING-UNTIL marker"
|
||||
# Ceremony-lag guard: a loop agent idling in a phase that's built but NOT marked done won't let the
|
||||
# phase advance (the recurring "all gates PASS but no ## DONE written" stall). Nudge it ONCE per phase
|
||||
# to finalize (write the done marker if the DoD is met) before falling back to the blunt kill+reboot.
|
||||
if (cfg["loop"].get("done_nudge", True) and agent.get("kind") == "loop" and phases(cfg)
|
||||
and not phase_done(cfg, cur_phase(cfg).get("status", "")) and not _done_nudged.get(session)):
|
||||
log(f"stall: {agent['name']} ({session}) {reason} — DONE-nudge (phase built but not marked done)")
|
||||
ping_session(session, _done_nudge_msg(cfg, cur_phase(cfg)),
|
||||
submit_key=backend_of(cfg, agent).get("submit_key", "Enter"))
|
||||
_done_nudged[session] = True
|
||||
_idle_since[session] = now # fresh idle window to act on the nudge before reboot escalates
|
||||
return
|
||||
log(f"stall: {agent['name']} ({session}) {reason} — kill + reboot")
|
||||
start_agent(cfg, agent, force=True)
|
||||
_idle_since[session] = 0.0
|
||||
@ -900,6 +923,7 @@ def phase_advance_check(cfg):
|
||||
if marker.exists():
|
||||
marker.unlink() # resuming into a (freshly-appended) phase — clear stale completion
|
||||
handoff_reset()
|
||||
_done_nudged.clear() # fresh DONE-nudge budget for the new phase
|
||||
start_loops(cfg)
|
||||
return True
|
||||
# last phase is DONE → sequence complete
|
||||
|
||||
Reference in New Issue
Block a user