def _done_nudge_msg(cfg, ph):
    """Compose the DONE-nudge text sent to a stalled loop agent.

    The message names the current phase, tells the agent to resume, and asks it
    to write the done marker to the phase's status file only when the DoD is met.

    cfg: config mapping; reads cfg["loop"]["done_marker"] (default "## DONE") and
         is handed to _state_subdir() to obtain the directory prefix of the path.
    ph:  phase mapping; reads "id" (default "") and "status"
         (default "STATUS-<id>.md").

    Returns the complete nudge message as one str.
    """
    marker = cfg["loop"].get("done_marker", "## DONE")
    phase_id = ph.get("id", "")
    status_file = ph.get("status", f"STATUS-{phase_id}.md")
    state_dir = _state_subdir(cfg)

    # Where the agent is told to write the marker: <state subdir>/<status file>.
    target = f"{state_dir}/{status_file}"

    # Each segment keeps its trailing space so the joined text reads as one paragraph.
    segments = (
        f"watchdog nudge: you've stalled in phase '{phase_id}', which is NOT yet marked '{marker}'. Resume now ",
        "— pull any pending review/inbox and continue. If (and ONLY if) every DoD item has a fresh ",
        f"PASS from BOTH adversaries with no standing veto, write '{marker}' to {target} and push, ",
        "so the phase settles and auto-advances. Do not stay idle.",
    )
    return "".join(segments)
+ if (cfg["loop"].get("done_nudge", True) and agent.get("kind") == "loop" and phases(cfg) + and not phase_done(cfg, cur_phase(cfg).get("status", "")) and not _done_nudged.get(session)): + log(f"stall: {agent['name']} ({session}) {reason} — DONE-nudge (phase built but not marked done)") + ping_session(session, _done_nudge_msg(cfg, cur_phase(cfg)), + submit_key=backend_of(cfg, agent).get("submit_key", "Enter")) + _done_nudged[session] = True + _idle_since[session] = now # fresh idle window to act on the nudge before reboot escalates + return log(f"stall: {agent['name']} ({session}) {reason} — kill + reboot") start_agent(cfg, agent, force=True) _idle_since[session] = 0.0 @@ -900,6 +923,7 @@ def phase_advance_check(cfg): if marker.exists(): marker.unlink() # resuming into a (freshly-appended) phase — clear stale completion handoff_reset() + _done_nudged.clear() # fresh DONE-nudge budget for the new phase start_loops(cfg) return True # last phase is DONE → sequence complete