diff --git a/cc-ci-plan/launch.sh b/cc-ci-plan/launch.sh index 3bc81df..1c55671 100755 --- a/cc-ci-plan/launch.sh +++ b/cc-ci-plan/launch.sh @@ -42,8 +42,12 @@ LOG_DIR="${LOG_DIR:-/srv/cc-ci/.cc-ci-logs}" WATCH_INTERVAL="${WATCH_INTERVAL:-300}" # seconds between HEAVY checks (phase DONE / restart dead loops) SIGNAL_INTERVAL="${SIGNAL_INTERVAL:-30}" # seconds between HANDOFF checks (ping the waiting loop) -STALL_IDLE="${STALL_IDLE:-300}" # seconds a loop may sit idle past its WAITING-UNTIL marker - # (or with no marker at all) before the watchdog reboots it +STALL_IDLE="${STALL_IDLE:-300}" # NO-marker case: seconds a loop may sit idle (turn ended + # without declaring a wait) before the watchdog reboots it +STALL_GRACE="${STALL_GRACE:-180}" # marker case: seconds PAST a loop's WAITING-UNTIL before + # reboot. The real ScheduleWakeup fires AT the stated time; + # grace covers wake+start latency + marker/scheduler skew so + # the watchdog never RACES (pre-empts) a healthy self-wake. BUILDER_SESSION="cc-ci-builder" ADV_SESSION="cc-ci-adv" @@ -200,7 +204,7 @@ _parse_waiting_until() { # arg1 = pane text; echoes epoch seconds of the last } stall_check_one() { - local role="$1" s="$2" dir="$3" pane now until idle since + local role="$1" s="$2" dir="$3" pane now until idle since reason session_alive "$s" || { _wd_idle_since[$s]=0; return 0; } # dead => heal_session handles it now="$(printf '%(%s)T' -1)" pane="$(tmux capture-pane -pt "$s" 2>/dev/null | tail -40 || true)" @@ -210,12 +214,19 @@ stall_check_one() { since="${_wd_idle_since[$s]:-0}" if [[ "$since" == 0 ]]; then since="$now"; _wd_idle_since[$s]="$now"; fi idle=$(( now - since )) - (( idle >= STALL_IDLE )) || return 0 until="$(_parse_waiting_until "$pane")" - if [[ -n "$until" ]] && (( now < until )); then - return 0 # legitimately waiting, before its time + if [[ -n "$until" ]]; then + # Declared wait: the loop's own ScheduleWakeup fires AT 'until'. Reboot ONLY once we are + # STALL_GRACE seconds PAST it — i.e. the self-wake genuinely failed. Never reboot before/at + # 'until' (that races and pre-empts the healthy wake — the original false-reboot bug). + (( now > until + STALL_GRACE )) || return 0 + reason="past its WAITING-UNTIL by $(( now - until ))s — self-wake did not fire" + else + # No declared wait: a turn ended without scheduling/declaring. Treat as a wedge once idle a while. + (( idle >= STALL_IDLE )) || return 0 + reason="idle ${idle}s with no WAITING-UNTIL marker" fi - log "stall: $role ($s) idle ${idle}s, $([[ -n "$until" ]] && echo "past its WAITING-UNTIL" || echo "no WAITING-UNTIL marker") — kill + reboot (re-orients from repo)" + log "stall: $role ($s) $reason — kill + reboot (re-orients from repo)" tmux kill-session -t "$s" 2>/dev/null || true start_agent "$role" "$s" "$dir" _wd_idle_since[$s]=0