Watchdog handoff signalling: ping the waiting loop on gate-claim / verdict (kill double-idle)
launch.sh watchdog now runs a fast (~30s) handoff_check alongside the heavy (300s) restart/DONE check: when the Builder writes a CLAIMED gate it pings the Adversary to verify now; when the Adversary updates REVIEW.md it pings the Builder to proceed (edge-triggered, reads local clones). So a pending handoff resolves in <~30s instead of a whole idle interval. Pacing revised: the Adversary may idle freely when nothing's pending (no pointless re-verify/busy-poll) and is woken by the watchdog; Builder waits on the ping + a fallback ~2-4m self-poll. kickoff documents the new "handoff signalling" role. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -44,7 +44,8 @@ LOG_DIR="${LOG_DIR:-/srv/cc-ci/.cc-ci-logs}"
|
||||
CC_CI_REPO="${CC_CI_REPO:-https://git.autonomic.zone/recipe-maintainers/cc-ci.git}" # CI project repo (DONE detection); harmless until the Builder creates it
|
||||
CC_CI_BRANCH="${CC_CI_BRANCH:-main}"
|
||||
|
||||
# Watchdog cadences (both overridable from the environment):
#   WATCH_INTERVAL  - heavy path: DONE detection + restarting dead loops.
#   SIGNAL_INTERVAL - fast path: handoff check that pings the waiting loop.
WATCH_INTERVAL="${WATCH_INTERVAL:-300}"          # seconds between HEAVY checks (restart dead loops, DONE)
SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}"         # seconds between HANDOFF checks (ping the waiting loop)
|
||||
|
||||
BUILDER_SESSION="cc-ci-builder"
|
||||
ADV_SESSION="cc-ci-adv"
|
||||
@ -108,26 +109,68 @@ is_done() {
|
||||
grep -qE '^##[[:space:]]+DONE' "$WATCH_DIR/STATUS.md" 2>/dev/null
|
||||
}
|
||||
|
||||
# Wake a loop by typing a one-line message into its tmux session (queues if mid-turn).
#
# Arguments: $1 - tmux session name
#            $2 - message text; sent literally (-l) so tmux does not treat it as key names
# Returns:   always 0. The ping is best-effort: a missing session, or one that
#            disappears between the liveness check and send-keys, must never
#            propagate a failure into the caller (which may run under errexit).
ping_session() {
  local s="$1" msg="$2"
  session_alive "$s" || return 0
  if tmux send-keys -t "$s" -l -- "$msg" 2>/dev/null; then
    # Short pause between the text and Enter, as in the original sequence.
    sleep 0.3
    tmux send-keys -t "$s" Enter 2>/dev/null || true
  fi
  return 0
}
|
||||
|
||||
# Edge-triggered handoff signalling: the moment one loop produces the artifact the other is
# waiting on, ping the waiting loop so it wakes immediately instead of idling out its sleep.
# Reads the loops' local working clones (same host) for the fastest signal; the pinged loop
# still pulls the real state on wake. Edge-triggered (hash compare) so it pings once per change.
#
# Globals:   BUILDER_DIR, ADV_DIR, BUILDER_SESSION, ADV_SESSION (read)
#            _wd_last_gate, _wd_last_review (read/written: last-seen hashes)
# Arguments: none
# Returns:   0; every external command is guarded so a no-match grep or an
#            unreadable file cannot abort a caller running under errexit/pipefail.
_wd_last_gate=""; _wd_last_review=""
handoff_check() {
  local sf="$BUILDER_DIR/STATUS.md" rf="$ADV_DIR/REVIEW.md" cur claimed

  # Builder -> Adversary: a milestone gate is CLAIMED and awaiting verification.
  if [[ -f "$sf" ]]; then
    # Read the claimed-gate lines once. grep exits 1 when nothing is claimed
    # (the normal idle case); `|| true` keeps that from failing the pipeline.
    claimed="$(grep -iE 'Gate:.*CLAIMED' "$sf" 2>/dev/null | sort -u || true)"
    cur=""
    if [[ -n "$claimed" ]]; then
      cur="$(printf '%s\n' "$claimed" | md5sum | awk '{print $1}' || true)"
      if [[ "$cur" != "$_wd_last_gate" ]]; then
        log "handoff: Builder CLAIMED a gate -> pinging Adversary"
        ping_session "$ADV_SESSION" "watchdog ping: the Builder has CLAIMED a milestone gate in STATUS.md and is awaiting your verification. Pull and verify it now — don't idle." || true
      fi
    fi
    # Remember the state even when nothing is claimed, so a later re-claim of
    # the same gate text counts as a fresh edge.
    _wd_last_gate="$cur"
  fi

  # Adversary -> Builder: REVIEW.md changed (a verdict/PASS/FAIL or a new finding).
  if [[ -f "$rf" ]]; then
    cur="$(md5sum "$rf" 2>/dev/null | awk '{print $1}' || true)"
    if [[ -n "$cur" && "$cur" != "$_wd_last_review" ]]; then
      # The first observation only records a baseline; pings start on later changes.
      if [[ -n "$_wd_last_review" ]]; then
        log "handoff: REVIEW.md changed -> pinging Builder"
        ping_session "$BUILDER_SESSION" "watchdog ping: the Adversary updated REVIEW.md (a verdict or finding). Pull and act now — if it PASSes your gate, proceed; if it's a finding, address it. Don't idle." || true
      fi
      _wd_last_review="$cur"
    fi
  fi
  return 0
}
|
||||
|
||||
# Main watchdog loop. Runs forever on two cadences:
#   - every SIGNAL_INTERVAL seconds: handoff_check (ping a loop whose counterpart handed off);
#   - every WATCH_INTERVAL seconds:  the heavy path (DONE detection, restart dead loops).
#
# Globals:   WATCH_INTERVAL, SIGNAL_INTERVAL, CC_CI_REPO, BUILDER_SESSION, ADV_SESSION,
#            BUILDER_DIR, ADV_DIR, PLAN_DIR (read)
# Arguments: none
# Returns:   does not return; exits the process with status 0 once is_done succeeds.
watchdog_loop() {
  log "watchdog up (signal=${SIGNAL_INTERVAL}s, heavy=${WATCH_INTERVAL}s, repo=${CC_CI_REPO:-<unset: DONE-detection disabled>})"
  local elapsed="$WATCH_INTERVAL"   # run a heavy check on the first tick too
  while true; do
    # Fast path every tick: ping a loop the moment its counterpart hands off.
    handoff_check

    # Heavy path every WATCH_INTERVAL: DONE detection + restart dead loops.
    if (( elapsed >= WATCH_INTERVAL )); then
      elapsed=0
      if is_done; then
        log "STATUS.md reports ## DONE — stopping loops."
        stop_loops
        log "watchdog exiting (project complete)."
        exit 0
      fi
      # Restart any dead loop (resilience the in-session /loop can't provide).
      if ! session_alive "$BUILDER_SESSION"; then
        log "builder session gone — restarting"
        start_agent "$BUILDER_SESSION" "$BUILDER_DIR" "$PLAN_DIR/prompts/builder.md"
      fi
      if ! session_alive "$ADV_SESSION"; then
        log "adversary session gone — restarting"
        start_agent "$ADV_SESSION" "$ADV_DIR" "$PLAN_DIR/prompts/adversary.md"
      fi
    fi

    # Sleep only the short interval; `elapsed` decides when the heavy path is due.
    sleep "$SIGNAL_INTERVAL"
    elapsed=$(( elapsed + SIGNAL_INTERVAL ))
  done
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user