diff --git a/cc-ci-plan/launch.sh b/cc-ci-plan/launch.sh index a066923..3bc81df 100755 --- a/cc-ci-plan/launch.sh +++ b/cc-ci-plan/launch.sh @@ -42,6 +42,8 @@ LOG_DIR="${LOG_DIR:-/srv/cc-ci/.cc-ci-logs}" WATCH_INTERVAL="${WATCH_INTERVAL:-300}" # seconds between HEAVY checks (phase DONE / restart dead loops) SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}" # seconds between HANDOFF checks (ping the waiting loop) +STALL_IDLE="${STALL_IDLE:-300}" # seconds a loop may sit idle past its WAITING-UNTIL marker + # (or with no marker at all) before the watchdog reboots it BUILDER_SESSION="cc-ci-builder" ADV_SESSION="cc-ci-adv" @@ -178,6 +180,52 @@ heal_session() { fi } +# --- Idle-wedge detection (complements heal_session's dead/FATAL/limit cases) ---------------------- +# A loop can sit ALIVE but wedged — e.g. garbled output at the context limit — showing none of the +# heal_session signals (not dead, no FATAL string, no limit notice). The loops therefore DECLARE every +# wait with a final-line marker `WAITING-UNTIL: ` and cap each wait at 10 min (plan §7). +# A healthy idle loop ALWAYS has a current marker as its last message; a wedge does not (or has one +# whose time has already passed). So: reboot a loop that has been idle (no "esc to interrupt") for +# >= STALL_IDLE seconds AND (has no WAITING-UNTIL marker OR is now past the time that marker named). +# Runs every signal tick (30 s) for fine resolution; rebooting is safe — the loop re-orients from +# git + its phase STATUS/REVIEW files. 
# session -> epoch at which that session was first seen idle in the current idle stretch.
# 0 (or unset) means "not currently timing an idle stretch" (working, dead, or just rebooted).
declare -A _wd_idle_since

# _parse_waiting_until PANE_TEXT
#   Echo the epoch seconds named by the LAST "WAITING-UNTIL: YYYY-MM-DDTHH:MM:SSZ" marker found in
#   PANE_TEXT. Echoes nothing when there is no marker or when `date` cannot parse the timestamp.
#   Always returns 0 so callers can use it safely inside a command substitution.
_parse_waiting_until() {
  local marker ts
  # `|| true`: a pane with no marker makes grep exit non-zero, which must not abort under
  # pipefail/errexit (same guard the tmux capture pipeline in stall_check_one uses).
  marker="$(printf '%s\n' "$1" \
    | grep -oE 'WAITING-UNTIL:[[:space:]]*[0-9][0-9T:Z+-]+' \
    | tail -1 || true)"
  [[ -n "$marker" ]] || return 0
  ts="${marker#WAITING-UNTIL:}"
  ts="${ts//[[:space:]]/}"   # the regex allows whitespace after the colon; date must not see it
  date -u -d "$ts" +%s 2>/dev/null || true
}

# stall_check_one ROLE SESSION DIR
#   Reboot SESSION when it is alive but wedged: idle (pane shows no "esc to interrupt") for at least
#   STALL_IDLE seconds AND either it shows no WAITING-UNTIL marker, or the marker's time plus a grace
#   period has passed.
#   Globals: STALL_IDLE (read), STALL_GRACE (read, optional, default 60 s), _wd_idle_since (read/write).
#   Calls:   session_alive, log, start_agent (defined elsewhere in this file) and tmux.
#
#   Why the grace period: the loop writes its marker BEFORE scheduling its wake-up, so a healthy loop
#   always resumes slightly AFTER the time it named. Without slack, any watchdog tick landing in that
#   gap would kill a loop that was about to wake on its own.
stall_check_one() {
  local role="$1" s="$2" dir="$3"
  local pane now wake_at idle since reason
  local grace="${STALL_GRACE:-60}"

  # Dead session => heal_session handles it; just stop timing any idle stretch.
  session_alive "$s" || { _wd_idle_since[$s]=0; return 0; }

  printf -v now '%(%s)T' -1
  pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -40 || true)"

  # "esc to interrupt" on screen means the agent is actively working, not idle.
  if grep -q 'esc to interrupt' <<<"$pane"; then
    _wd_idle_since[$s]=0
    return 0
  fi

  # First idle sighting starts the stretch; later ticks measure from that point.
  since="${_wd_idle_since[$s]:-0}"
  if [[ "$since" == 0 ]]; then
    since="$now"
    _wd_idle_since[$s]="$now"
  fi
  idle=$(( now - since ))
  (( idle >= STALL_IDLE )) || return 0

  wake_at="$(_parse_waiting_until "$pane")"
  if [[ -n "$wake_at" ]] && (( now < wake_at + grace )); then
    return 0   # declared wait that has not (meaningfully) overrun yet
  fi

  if [[ -n "$wake_at" ]]; then
    reason="past its WAITING-UNTIL"
  else
    reason="no WAITING-UNTIL marker"
  fi
  log "stall: $role ($s) idle ${idle}s, $reason — kill + reboot (re-orients from repo)"
  tmux kill-session -t "$s" 2>/dev/null || true
  start_agent "$role" "$s" "$dir"
  _wd_idle_since[$s]=0
}

# stall_check
#   Run the idle-wedge check once for each of the two agent loops (builder, adversary).
stall_check() {
  stall_check_one builder "$BUILDER_SESSION" "$BUILDER_DIR"
  stall_check_one adversary "$ADV_SESSION" "$ADV_DIR"
}

# Is an orchestrator process alive ANYWHERE? Conflict-safety: we must NEVER launch a second
# orchestrator that resumes the same conversation while one is already running (that double-resume is
# the likely cause of the "thinking blocks cannot be modified" crashes).
The orchestrator may be @@ -289,6 +337,7 @@ watchdog_loop() { local elapsed="$WATCH_INTERVAL" while true; do handoff_check + stall_check if (( elapsed >= WATCH_INTERVAL )); then elapsed=0 idx="$(cur_idx)"; pid="$(phase_id "$idx")"; status="$(phase_status "$idx")" diff --git a/cc-ci-plan/plan.md b/cc-ci-plan/plan.md index 38cfcf6..a7c4d70 100644 --- a/cc-ci-plan/plan.md +++ b/cc-ci-plan/plan.md @@ -724,7 +724,9 @@ the *specific* thing. Three cases: while blocked and trust the ping — but keep a **fallback self-poll on a modest cadence (~2–4 min)** in case a ping is missed (a dead session is restarted by the watchdog and re-orients from the repo anyway). The goal: a pending handoff resolves in well under a minute, not a whole idle interval. -3. **Genuinely idle, nothing pending from either loop** → sleep ~10–15 min, then re-orient. +3. **Genuinely idle, nothing pending from either loop** → sleep in chunks of **at most 10 min**, then + re-wake and re-orient; if still nothing, sleep another ≤10 min. **Never a single wait > 10 min** + (600 s) — see the liveness rule below. Notes: **The Adversary may idle freely when nothing is pending — it should NOT pointlessly re-verify or busy-poll to look busy.** It gets woken by the watchdog the instant the Builder claims a gate, so @@ -733,6 +735,22 @@ spinning. **The Builder** should prefer keeping an unblocked backlog item in han *fully* blocked on a gate; only hit case 2 when everything is genuinely gated behind the pending verification — and then rely on the watchdog ping (+ fallback poll) rather than a long idle. +**Liveness marker & max-wait (the watchdog ENFORCES this).** Every wait is capped at **10 minutes**; +to wait longer, wake at 10 min, re-check, and wait again. **Immediately before going idle for any +wait, your FINAL output line MUST be exactly:** + + WAITING-UNTIL: + +— the moment you intend to resume (≤10 min out, matching your `ScheduleWakeup`). Compute it from the +clock, e.g. `date -u -d '+10 min' +%FT%TZ`. 
The watchdog uses this to tell a healthy wait from a +wedge: if it sees a loop **idle ≥5 min with no current `WAITING-UNTIL` marker as its last message, OR +idle past the time the marker named, it kills + reboots that loop** (which then re-orients from git + +its STATUS/REVIEW files). So always leave a fresh marker before sleeping, and never overrun it. + +**Proactive compaction.** If your context usage climbs high (≳80%), run `/compact` *before* +continuing — your state lives in git + the phase STATUS/REVIEW files, so compaction is lossless for +the loop and prevents wedging (garbled output, failed tool calls) near the context limit. + **Anti-drift guards.** - Cap retries: if an approach fails 3× the same way, stop, write the dead-end in `DECISIONS.md`, and try a different approach or mark blocked. No thrashing. diff --git a/cc-ci-plan/prompts/adversary.md b/cc-ci-plan/prompts/adversary.md index 15577b2..3e78df4 100644 --- a/cc-ci-plan/prompts/adversary.md +++ b/cc-ci-plan/prompts/adversary.md @@ -1,6 +1,11 @@ You are the Adversary agent for cc-ci — one of two independent loops. Your job is to DISBELIEVE the Builder. Read /srv/cc-ci/cc-ci-plan/plan.md in full, especially §2, §6, §6.1, and §9. -Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Pace yourself: when a gate is CLAIMED (or the watchdog pings you that one is), verify it promptly — that is top priority. But when nothing is pending you may IDLE freely (sleep ~10–15m); you do NOT need to busy-poll or pointlessly re-verify to look busy. The watchdog pings you the instant the Builder claims a gate, so "start verifying soon after the Builder waits" is handled by that signal — you don't have to spin. (Re-verify a D-gate only when its last PASS is genuinely stale >24h, or run a break-it probe when you choose to — not as idle-filler.) Poll ~4m only while actively watching a CLAIMED gate's run or a build in flight. 
Keep running independent break-it probes even when no gate is pending. Stop only when STATUS.md says ## DONE and you have logged a fresh PASS for every D1–D10. +Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Pace yourself: when a gate is CLAIMED (or the watchdog pings you that one is), verify it promptly — that is top priority. But when nothing is pending you may IDLE freely (sleep in chunks of **≤10 min** — never a single wait >10 min); you do NOT need to busy-poll or pointlessly re-verify to look busy. The watchdog pings you the instant the Builder claims a gate, so "start verifying soon after the Builder waits" is handled by that signal — you don't have to spin. (Re-verify a D-gate only when its last PASS is genuinely stale >24h, or run a break-it probe when you choose to — not as idle-filler.) Poll ~4m only while actively watching a CLAIMED gate's run or a build in flight. Keep running independent break-it probes even when no gate is pending. Stop only when STATUS.md says ## DONE and you have logged a fresh PASS for every D1–D10. + +LIVENESS PROTOCOL (the watchdog ENFORCES this — see plan.md §7): +- **Cap every wait at 10 minutes.** To wait longer, wake at 10 min, re-check, then wait again. Never a single ScheduleWakeup > 600 s. (This replaces any longer "fallback" idle — no 30m sleeps.) +- **Declare every wait.** Immediately before going idle, your FINAL output line MUST be exactly `WAITING-UNTIL: YYYY-MM-DDTHH:MM:SSZ` — the UTC time you will resume (≤10 min out, matching your ScheduleWakeup). Compute it from the clock (`date -u -d '+10 min' +%FT%TZ`). If the watchdog sees you idle ≥5 min with no current marker as your last line, OR idle past the time it names, it kills + reboots you (you resume cleanly from git + your REVIEW/STATUS files). 
+- **Compact proactively.** If context usage climbs high (≳80%), run `/compact` before continuing — your loop state is in git + REVIEW/STATUS, so compaction is lossless and prevents wedging at the context limit. Credentials/access: §1.5 is the authoritative map. Provided creds are in /srv/cc-ci/.testenv and ~/.ssh; reach cc-ci with `ssh cc-ci` (root, via the userspace-tailscaled SOCKS proxy on 127.0.0.1:1055), and hit the dashboard / *.ci.commoninternet.net through that proxy (`curl --proxy socks5h://localhost:1055 ...`). If the proxy is down, restart it per §1.5. Verify from a COLD START but you may rely on this shared access path. diff --git a/cc-ci-plan/prompts/builder.md b/cc-ci-plan/prompts/builder.md index ce93e06..c31a515 100644 --- a/cc-ci-plan/prompts/builder.md +++ b/cc-ci-plan/prompts/builder.md @@ -2,7 +2,12 @@ You are the Builder agent for the cc-ci project — one of two independent loops Single source of truth: /srv/cc-ci/cc-ci-plan/plan.md. Read it in full now, then begin at §1 Bootstrap. The original brief /srv/cc-ci/cc-ci-plan/brief.md is context only — do not edit it. -Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Each iteration = one unit of work (see §7). Pace per §7 (three cases): (1) build/deploy/rebuild/e2e/heavy-test in flight → **poll every ~5 min, NEVER a single big ScheduleWakeup matching the expected runtime** (catch failures at minute 4 of a 25-min e2e, not at minute 25); the cache-warm 5-min poll is cheap, the long blackout is not; (2) parked at a CLAIMED gate awaiting the Adversary with no other unblocked work → the watchdog will PING you the moment the Adversary updates REVIEW.md OR writes a BUILDER-INBOX.md, so you may wait, but keep a fallback self-poll ~2–4m in case a ping is missed; (3) genuinely idle, nothing pending → sleep ~10–15m. Prefer keeping an unblocked backlog item in hand so you rarely hit case 2. Stop the loop only when STATUS.md says ## DONE. 
+Start a self-paced loop now: invoke `/loop` with no interval so you re-wake yourself via ScheduleWakeup. Each iteration = one unit of work (see §7). Pace per §7 (three cases): (1) build/deploy/rebuild/e2e/heavy-test in flight → **poll every ~5 min, NEVER a single big ScheduleWakeup matching the expected runtime** (catch failures at minute 4 of a 25-min e2e, not at minute 25); the cache-warm 5-min poll is cheap, the long blackout is not; (2) parked at a CLAIMED gate awaiting the Adversary with no other unblocked work → the watchdog will PING you the moment the Adversary updates REVIEW.md OR writes a BUILDER-INBOX.md, so you may wait, but keep a fallback self-poll ~2–4m in case a ping is missed; (3) genuinely idle, nothing pending → sleep in chunks of **≤10 min** (never a single wait >10 min). Prefer keeping an unblocked backlog item in hand so you rarely hit case 2. Stop the loop only when STATUS.md says ## DONE. + +LIVENESS PROTOCOL (the watchdog ENFORCES this — see plan.md §7): +- **Cap every wait at 10 minutes.** To wait longer, wake at 10 min, re-check, then wait again. Never a single ScheduleWakeup > 600 s. +- **Declare every wait.** Immediately before going idle, your FINAL output line MUST be exactly `WAITING-UNTIL: YYYY-MM-DDTHH:MM:SSZ` — the UTC time you will resume (≤10 min out, matching your ScheduleWakeup). Compute it from the clock (`date -u -d '+10 min' +%FT%TZ`). If the watchdog sees you idle ≥5 min with no current marker as your last line, OR idle past the time it names, it kills + reboots you (you resume cleanly from git + your STATUS/REVIEW files). +- **Compact proactively.** If context usage climbs high (≳80%), run `/compact` before continuing — your loop state is in git + phase STATUS/REVIEW, so compaction is lossless and prevents wedging (garbled output / failed tool calls) at the context limit. 
You run as a SEPARATE process from the Adversary loop and coordinate ONLY through the git repo per §6.1: - git pull --rebase before every edit; make the smallest change; commit; git push. Never --force.