diff --git a/cc-ci-plan/kickoff.md b/cc-ci-plan/kickoff.md index 0877e6f..21f16bb 100644 --- a/cc-ci-plan/kickoff.md +++ b/cc-ci-plan/kickoff.md @@ -37,9 +37,13 @@ ownership keeps concurrent pushes merge-clean. |---|---|---| | **Iteration** — keep doing one unit of work, then wake again | `/loop` self-paced (ScheduleWakeup), per plan §7 pacing | each agent, in-session | | **Resilience** — restart a loop whose process/sandbox died; stop all on `## DONE` | `launch.sh` watchdog (tmux + git poll) | this script | +| **Handoff signalling** — wake the *waiting* loop the moment its counterpart hands off | watchdog `handoff_check` (~30 s): Builder writes a `CLAIMED` gate → ping Adversary to verify; Adversary updates `REVIEW.md` → ping Builder to proceed | this script | `/loop` alone is bound to its process: if the sandbox restarts, that loop is gone until something -relaunches it. The watchdog is that something. Use both. +relaunches it. The watchdog is that something. It also closes the **double-idle gap**: instead of a +pending gate/verdict sitting until the other loop's next scheduled wake, the watchdog pings the +waiting loop within ~30 s — so the Adversary can idle freely when nothing's pending (no busy-polling +or pointless re-verifying) yet still start verifying right after the Builder parks at a gate. Use all three. 
## Launch diff --git a/cc-ci-plan/launch.sh b/cc-ci-plan/launch.sh index ab9f17f..2cd14ca 100755 --- a/cc-ci-plan/launch.sh +++ b/cc-ci-plan/launch.sh @@ -44,7 +44,8 @@ LOG_DIR="${LOG_DIR:-/srv/cc-ci/.cc-ci-logs}" CC_CI_REPO="${CC_CI_REPO:-https://git.autonomic.zone/recipe-maintainers/cc-ci.git}" # CI project repo (DONE detection); harmless until the Builder creates it CC_CI_BRANCH="${CC_CI_BRANCH:-main}" -WATCH_INTERVAL="${WATCH_INTERVAL:-300}" # seconds between watchdog checks +WATCH_INTERVAL="${WATCH_INTERVAL:-300}" # seconds between HEAVY checks (restart dead loops, DONE) +SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}" # seconds between HANDOFF checks (ping the waiting loop) BUILDER_SESSION="cc-ci-builder" ADV_SESSION="cc-ci-adv" @@ -108,26 +109,68 @@ is_done() { grep -qE '^##[[:space:]]+DONE' "$WATCH_DIR/STATUS.md" 2>/dev/null } +# Wake a loop by typing a one-line message into its tmux session (queues if mid-turn). +ping_session() { + local s="$1" msg="$2" + session_alive "$s" || return 0 + tmux send-keys -t "$s" -l -- "$msg" 2>/dev/null && { sleep 0.3; tmux send-keys -t "$s" Enter 2>/dev/null; } +} + +# Edge-triggered handoff signalling: the moment one loop produces the artifact the other is +# waiting on, ping the waiting loop so it wakes immediately instead of idling out its sleep. +# Reads the loops' local working clones (same host) for the fastest signal; the pinged loop +# still pulls the real state on wake. Edge-triggered (hash compare) so it pings once per change. +_wd_last_gate=""; _wd_last_review="" +handoff_check() { + local sf="$BUILDER_DIR/STATUS.md" rf="$ADV_DIR/REVIEW.md" cur + # Builder -> Adversary: a milestone gate is CLAIMED and awaiting verification. 
+ if [[ -f "$sf" ]]; then + cur="$(grep -iE 'Gate:.*CLAIMED' "$sf" 2>/dev/null | sort -u | md5sum | awk '{print $1}')" + if grep -qiE 'Gate:.*CLAIMED' "$sf" 2>/dev/null && [[ "$cur" != "$_wd_last_gate" ]]; then + log "handoff: Builder CLAIMED a gate -> pinging Adversary" + ping_session "$ADV_SESSION" "watchdog ping: the Builder has CLAIMED a milestone gate in STATUS.md and is awaiting your verification. Pull and verify it now — don't idle." + fi + _wd_last_gate="$cur" + fi + # Adversary -> Builder: REVIEW.md changed (a verdict/PASS/FAIL or a new finding). + if [[ -f "$rf" ]]; then + cur="$(md5sum "$rf" 2>/dev/null | awk '{print $1}')" + if [[ -n "$cur" && "$cur" != "$_wd_last_review" ]]; then + [[ -n "$_wd_last_review" ]] && { + log "handoff: REVIEW.md changed -> pinging Builder" + ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary updated REVIEW.md (a verdict or finding). Pull and act now — if it PASSes your gate, proceed; if it's a finding, address it. Don't idle." + } + _wd_last_review="$cur" + fi + fi +} + watchdog_loop() { - log "watchdog up (interval=${WATCH_INTERVAL}s, repo=${CC_CI_REPO:-})" + log "watchdog up (signal=${SIGNAL_INTERVAL}s, heavy=${WATCH_INTERVAL}s, repo=${CC_CI_REPO:-})" + local elapsed="$WATCH_INTERVAL" # run a heavy check on the first tick too while true; do - # 1) DONE? then wind everything down. - if is_done; then - log "STATUS.md reports ## DONE — stopping loops." - stop_loops - log "watchdog exiting (project complete)." - exit 0 + # Fast path every tick: ping a loop the moment its counterpart hands off. + handoff_check + # Heavy path every WATCH_INTERVAL: DONE detection + restart dead loops. + if (( elapsed >= WATCH_INTERVAL )); then + elapsed=0 + if is_done; then + log "STATUS.md reports ## DONE — stopping loops." + stop_loops + log "watchdog exiting (project complete)." + exit 0 + fi + if ! 
session_alive "$BUILDER_SESSION"; then + log "builder session gone — restarting" + start_agent "$BUILDER_SESSION" "$BUILDER_DIR" "$PLAN_DIR/prompts/builder.md" + fi + if ! session_alive "$ADV_SESSION"; then + log "adversary session gone — restarting" + start_agent "$ADV_SESSION" "$ADV_DIR" "$PLAN_DIR/prompts/adversary.md" + fi fi - # 2) restart any dead loop (resilience the in-session /loop can't provide). - if ! session_alive "$BUILDER_SESSION"; then - log "builder session gone — restarting" - start_agent "$BUILDER_SESSION" "$BUILDER_DIR" "$PLAN_DIR/prompts/builder.md" - fi - if ! session_alive "$ADV_SESSION"; then - log "adversary session gone — restarting" - start_agent "$ADV_SESSION" "$ADV_DIR" "$PLAN_DIR/prompts/adversary.md" - fi - sleep "$WATCH_INTERVAL" + sleep "$SIGNAL_INTERVAL" + elapsed=$(( elapsed + SIGNAL_INTERVAL )) done } diff --git a/cc-ci-plan/plan.md b/cc-ci-plan/plan.md index 7082d73..dfdecc3 100644 --- a/cc-ci-plan/plan.md +++ b/cc-ci-plan/plan.md @@ -652,17 +652,21 @@ the *specific* thing. Three cases: 1. **Something in flight** (build/deploy/`nixos-rebuild`) → re-check on a short cadence (≈4 min) to stay cache-warm; keep polling *it*, don't treat it as idle, and don't spin on a minutes-long build. 2. **Blocked on the *other* loop** — Builder parked at a `CLAIMED` gate awaiting the Adversary, or - Adversary waiting for the Builder to fix an `[adversary]` finding → **poll on the short ≈4 min - cadence for the counterpart's response; do NOT use the long idle sleep.** A pending handoff is not - idleness — the other loop may respond any moment, and if *both* loops long-idle here you get dead - wall-clock where neither advances. (This is the common "both waiting" trap — avoid it.) + Adversary waiting for the Builder to fix an `[adversary]` finding. 
**You don't need to busy-poll + here: the watchdog signals across the handoff.** The moment the Builder writes a `CLAIMED` gate, + the watchdog pings the Adversary to verify *now*; the moment the Adversary updates `REVIEW.md` + (verdict/finding), it pings the Builder to proceed (`launch.sh`, ~30 s detection). So you may sleep + while blocked and trust the ping — but keep a **fallback self-poll on a modest cadence (~2–4 min)** + in case a ping is missed (a dead session is restarted by the watchdog and re-orients from the repo + anyway). The goal: a pending handoff resolves in well under a minute, not a whole idle interval. 3. **Genuinely idle, nothing pending from either loop** → sleep ~10–15 min, then re-orient. -Corollary for the Adversary: a standing `CLAIMED` gate is immediate top-priority work (verify it now, -don't idle past it); absent a gate, run background break-it probes / re-verify stale D-gates rather -than sleeping — so the Adversary is rarely idle while the Builder is active. Corollary for the -Builder: prefer keeping an unblocked backlog item in hand so you're not fully blocked on a gate; only -hit case 2 when everything is genuinely gated behind the pending verification. +Notes: **The Adversary may idle freely when nothing is pending — it should NOT pointlessly re-verify +or busy-poll to look busy.** It gets woken by the watchdog the instant the Builder claims a gate, so +"start verifying very soon after the Builder waits" is handled by the signal, not by the Adversary +spinning. **The Builder** should prefer keeping an unblocked backlog item in hand so it's rarely +*fully* blocked on a gate; only hit case 2 when everything is genuinely gated behind the pending +verification — and then rely on the watchdog ping (+ fallback poll) rather than a long idle. 
**Anti-drift guards.** - Cap retries: if an approach fails 3× the same way, stop, write the dead-end in `DECISIONS.md`, diff --git a/cc-ci-plan/prompts/adversary.md b/cc-ci-plan/prompts/adversary.md index 1dad85c..eebfd92 100644 --- a/cc-ci-plan/prompts/adversary.md +++ b/cc-ci-plan/prompts/adversary.md @@ -1,6 +1,6 @@ You are the Adversary agent for cc-ci — one of two independent loops. Your job is to DISBELIEVE the Builder. Read /srv/cc-ci/cc-ci-plan/plan.md in full, especially §2, §6, §6.1, and §9. -Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Pace yourself: a standing CLAIMED gate is immediate top-priority work — verify it now, do NOT idle past it. Poll short (~4m) while watching a CLAIMED gate, a running build, or waiting for the Builder to fix an [adversary] finding (a pending handoff is not idleness — if you both long-idle, neither advances). Absent any gate, run background break-it probes / re-verify stale D-gates rather than sleeping — you should rarely be idle while the Builder is active. Only sleep ~10–15m when nothing at all is pending from either loop. Keep running independent break-it probes even when no gate is pending. Stop only when STATUS.md says ## DONE and you have logged a fresh PASS for every D1–D10. +Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Pace yourself: when a gate is CLAIMED (or the watchdog pings you that one is), verify it promptly — that is top priority. But when nothing is pending you may IDLE freely (sleep ~10–15m); you do NOT need to busy-poll or pointlessly re-verify to look busy. The watchdog pings you the instant the Builder claims a gate, so "start verifying soon after the Builder waits" is handled by that signal — you don't have to spin. (Re-verify a D-gate only when its last PASS is genuinely stale >24h, or run a break-it probe when you choose to — not as idle-filler.) 
Poll ~4m only while actively watching a CLAIMED gate's run or a build in flight. Independent break-it probes are still welcome when no gate is pending, but they are optional — run them when you judge them worthwhile, not as idle-filler. Stop only when STATUS.md says ## DONE and you have logged a fresh PASS for every D1–D10. Credentials/access: §1.5 is the authoritative map. Provided creds are in /srv/cc-ci/.testenv and ~/.ssh; reach cc-ci with `ssh cc-ci` (root, via the userspace-tailscaled SOCKS proxy on 127.0.0.1:1055), and hit the dashboard / *.ci.commoninternet.net through that proxy (`curl --proxy socks5h://localhost:1055 ...`). If the proxy is down, restart it per §1.5. Verify from a COLD START but you may rely on this shared access path. diff --git a/cc-ci-plan/prompts/builder.md b/cc-ci-plan/prompts/builder.md index 7e74db1..e0d40bf 100644 --- a/cc-ci-plan/prompts/builder.md +++ b/cc-ci-plan/prompts/builder.md @@ -2,7 +2,7 @@ You are the Builder agent for the cc-ci project — one of two independent loops Single source of truth: /srv/cc-ci/cc-ci-plan/plan.md. Read it in full now, then begin at §1 Bootstrap. The original brief /srv/cc-ci/cc-ci-plan/brief.md is context only — do not edit it. -Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Each iteration = one unit of work (see §7). Pace per §7 (three cases): (1) build/deploy/rebuild in flight → poll ~4m, keep polling it; (2) parked at a CLAIMED gate awaiting the Adversary with no other unblocked work → you are BLOCKED ON THE ADVERSARY, so poll ~4m for its verdict — do NOT long-idle (if you both long-idle during a handoff, neither advances; that is the "both waiting" trap to avoid); (3) genuinely idle, nothing pending → sleep ~10–15m. Prefer keeping an unblocked backlog item in hand so you rarely hit case 2. Do NOT spin on a minutes-long build. Stop the loop only when STATUS.md says ## DONE. +Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Each iteration = one unit of work (see §7).
Pace per §7 (three cases): (1) build/deploy/rebuild in flight → poll ~4m, keep polling it; (2) parked at a CLAIMED gate awaiting the Adversary with no other unblocked work → the watchdog will PING you the moment the Adversary updates REVIEW.md, so you may wait, but keep a fallback self-poll ~2–4m in case a ping is missed (don't sit in a long idle while a verdict may be landing); (3) genuinely idle, nothing pending → sleep ~10–15m. Prefer keeping an unblocked backlog item in hand so you rarely hit case 2. Do NOT spin on a minutes-long build. Stop the loop only when STATUS.md says ## DONE. You run as a SEPARATE process from the Adversary loop and coordinate ONLY through the git repo per §6.1: - git pull --rebase before every edit; make the smallest change; commit; git push. Never --force.