Files
cc-ci-orchestrator/cc-ci-plan/overnight-run.sh
autonomic-bot d6e1a704da watchdog: parse limit-reset time, never reboot limit-stalled sessions; rename orch session
Replace the blind every-300s 'limit appears lifted' nudge (claude) and the
opencode-only _maybe_nudge_limit with one unified limit_tick state machine:

- parse the reset time from the limit banner (last match wins; stale banners
  whose time already passed fall back rather than waiting ~a day)
- arm a quiet window until reset+45s; parse failure -> flat 5-minute probe
  loop (operator-specified; not exponential backoff)
- while armed, suppress ALL healing: a limit-stalled session is NEVER
  kill+rebooted (this was the conc-phase churn: claude limit stalls fell
  through to the generic idle reboot, losing the banner and re-hitting
  the limit fresh)
- at window end send ONE nudge as a self-verifying probe: spinner clears
  the state; a re-printed banner re-arms from the fresh reset time
- dedupe: never stack a probe while our own text is visible in the pane
- state persisted per session in LOG_DIR (.limited-<session>) so watchdog
  restarts keep the window
- orchestrator gets the same treatment: limit_tick in heal_orchestrator,
  a per-signal-tick orch_limit_check, and hourly wakes deferred during
  limit windows
- loud WARNING at 3 probes, then continue flat probes forever

Also rename the orchestrator session default cc-ci-orchestrator-vm ->
cc-ci-orchestrator (launch.py ORCH_SESSION, launch-orchestrator.py SESSION,
docs/scripts references).
2026-06-11 00:55:07 +00:00

93 lines
4.9 KiB
Bash

#!/usr/bin/env bash
# Overnight runner — after the Assistant finishes consolidating recipe PRs, run the weekly
# /upgrade-all (full test run over every enrolled recipe) and collect a morning report.
#
# Polls in pure bash (NO claude) so it doesn't burn the shared claude usage budget — only the
# /upgrade-all run itself spends tokens. Runs detached in tmux session `cc-ci-overnight`.
#
# tmux new-session -d -s cc-ci-overnight 'bash /srv/cc-ci/cc-ci-plan/overnight-run.sh'
# Shell options: unset variables are errors and a pipeline fails if any stage fails.
# `-e` is not enabled, so an individual failing command does not abort the run.
set -u
set -o pipefail

# Fixed locations and names used throughout the script.
logs_root="/srv/cc-ci/.cc-ci-logs"
PLAN="/srv/cc-ci/cc-ci-plan"   # directory holding launch.py / launch-upgrader.py
ORCH="cc-ci-orchestrator"      # tmux target that receives the completion message
LOG="${logs_root}/overnight-run.log"
DONE_MARKER="${logs_root}/pr-consolidation.done"  # Assistant writes this when consolidation is finished

# Date-stamped outputs. The stamp is the UTC date at start-up.
# NOTE(review): TODAY is computed once here; if the run crosses 00:00 UTC, REPORT and SUMMARY
# keep the start date — confirm that matches the date the upgrader uses for its summary file.
TODAY="$(date -u +%Y-%m-%d)"
REPORT="${logs_root}/overnight-report-${TODAY}.md"
SUMMARY="${logs_root}/upgrades/upgrade-all-${TODAY}.md"
# Append one line to the run log ($LOG), prefixed with "[overnight <UTC timestamp>]".
# Arguments: the message; all arguments are joined into one string via "$*".
# Returns:   the status of the printf that appends to $LOG.
log() {
  local stamp
  stamp=$(date -u +%FT%TZ)
  printf '[overnight %s] %s\n' "$stamp" "$*" >>"$LOG"
}
# Report whether a tmux pane looks busy.
# "Busy" means the captured pane text contains the literal string 'esc to interrupt'.
# Arguments: $1 - tmux target handed to `capture-pane -t`
# Returns:   0 when the marker is present; non-zero otherwise, including when tmux itself
#            fails (its error output is discarded).
active() {
  local pane_text
  # Capture first, match second. Piping straight into `grep -q` is unsafe under
  # `set -o pipefail`: grep -q exits at the first hit, the writer can then be killed by
  # SIGPIPE, and the pipeline reports failure even though the marker was found.
  pane_text=$(tmux capture-pane -pt "$1" 2>/dev/null) || return 1
  [[ $pane_text == *'esc to interrupt'* ]]
}
log "=== overnight runner started ==="

# --- Gate A: wait for the Assistant's PR-consolidation done-marker (max 4h) ---
# Up to 48 sleeps of 300 s (= 4 h). A timeout does not stop the run; only the log line differs.
log "Gate A: waiting for assistant done-marker ($DONE_MARKER)..."
gate_a_polls=48
until [[ -f "$DONE_MARKER" ]] || (( gate_a_polls == 0 )); do
  sleep 300
  gate_a_polls=$(( gate_a_polls - 1 ))
done
if [[ ! -f "$DONE_MARKER" ]]; then
  log "Gate A TIMED OUT (4h) — proceeding without confirmation."
else
  # The first line of the marker file is echoed into the log.
  log "Gate A passed — assistant done: $(head -n 1 "$DONE_MARKER" 2>/dev/null)"
fi
# --- Gate B: only run /upgrade-all after the usage-limit reset (~03:30 UTC) so it has budget ---
# `now` is the UTC wall clock as an integer HHMM; the 10# prefix forces base 10 so values with
# leading zeros (e.g. 0059) are not read as octal.
# The outcome is recorded so the final log line says what actually happened instead of always
# claiming "passed" (matching how Gates A and C report their timeouts).
log "Gate B: waiting until >= 03:35 UTC (post usage-limit reset)..."
gate_b_result="TIMED OUT (~3h cap) before 03:35 UTC — proceeding anyway"
for _ in {1..36}; do # cap ~3h (36 x 300 s)
  now=$((10#$(date -u +%H%M)))
  if (( now >= 1200 )); then
    # safety: never wait into the afternoon
    gate_b_result="skipped — already 12:00 UTC or later, not waiting for the next reset"
    break
  fi
  if (( now >= 335 )); then
    # morning window, past reset
    gate_b_result="passed"
    break
  fi
  sleep 300
done
log "Gate B ${gate_b_result} (now $(date -u +%FT%TZ))."
# --- Gate C: let the build loops go idle so /upgrade-all doesn't contend on the Swarm (max 1h) ---
# Polls `active` for the builder and adversary sessions, sleeping 300 s between polls,
# at most 12 times (1 h).
log "Gate C: waiting for builder/adversary to be idle (Swarm free)..."
for _ in {1..12}; do
  active cc-ci-builder || active cc-ci-adv || break # neither session busy -> stop waiting
  sleep 300
done
# Probe once more to choose the log line. An explicit if/else is used instead of
# `a && b || c`, which would also run `c` whenever `b` itself fails.
if active cc-ci-builder || active cc-ci-adv; then
  log "Gate C: loops still active after 1h — proceeding anyway."
else
  log "Gate C passed — loops idle."
fi
# --- Run the weekly /upgrade-all (DEFAULT: recipe PRs verified by !testme, NEVER merges) ---
# The launcher's stdout/stderr go to the run log. A non-zero exit is logged but does not stop
# the run: the status poll below still decides when to move on.
log "launching /upgrade-all via launch-upgrader.py fresh ..."
launch_rc=0
LOOP_BACKEND=claude LOOP_MODEL=sonnet python3 "$PLAN/launch-upgrader.py" fresh >>"$LOG" 2>&1 || launch_rc=$?
if (( launch_rc != 0 )); then
  log "WARNING: launch-upgrader.py fresh exited with status ${launch_rc} — see its output above; polling status anyway."
fi

# --- Wait for it to finish: the upgrader self-terminates and writes a dated summary (max ~8h) ---
# Up to 96 polls, 300 s apart (= 8 h). The loop ends when the `status` output contains
# "stopped" (case-insensitive).
# NOTE(review): only the status text is polled here; the presence of $SUMMARY is not checked.
log "waiting for /upgrade-all to complete (summary $SUMMARY or upgrader stop)..."
upgrader_stopped=0
for _ in {1..96}; do
  sleep 300
  st=$(python3 "$PLAN/launch-upgrader.py" status 2>/dev/null)
  if grep -qi 'stopped' <<<"$st"; then
    log "upgrader reports stopped."
    upgrader_stopped=1
    break
  fi
done
if (( ! upgrader_stopped )); then
  log "WARNING: upgrader did not report stopped within ~8h — collecting the report anyway."
fi
# --- Collect the morning report ---
# Everything printed inside the group (stdout and stderr) becomes the report file.
# Sections: the /upgrade-all summary, open PRs per recipe repo (from the Gitea API), and the
# output of `launch.py status`.
log "writing report $REPORT ..."
{
  echo "# cc-ci overnight run — ${TODAY}"
  echo
  echo "Generated $(date -u +%FT%TZ) by overnight-run.sh. See $LOG for the run trace."
  echo
  echo "## /upgrade-all summary"
  if [[ -f "$SUMMARY" ]]; then
    cat "$SUMMARY"
  else
    echo "(no dated summary at $SUMMARY — /upgrade-all may have stalled on the usage limit or errored; check the cc-ci-upgrader session + $LOG)"
  fi
  echo
  echo "## Open PRs per recipe (post-run)"
  # Load Gitea credentials; `set -a` exports whatever the env file assigns.
  set -a
  # shellcheck disable=SC1091
  . /srv/cc-ci/.testenv 2>/dev/null
  set +a
  # Guard the credentials: the script runs with `set -u`, so expanding an unset GITEA_* variable
  # would abort the whole run here, before the done-flag and the orchestrator ping.
  if [[ -z "${GITEA_USERNAME:-}" || -z "${GITEA_PASSWORD:-}" || -z "${GITEA_URL:-}" ]]; then
    echo "(Gitea credentials unavailable — /srv/cc-ci/.testenv missing or incomplete; PR listing skipped)"
  else
    # NOTE(review): credentials are embedded in the URL and therefore appear in curl's argv.
    gitea_api="https://${GITEA_USERNAME}:${GITEA_PASSWORD}@${GITEA_URL}/api/v1"
    repo_names=$(curl -fsS "$gitea_api/orgs/recipe-maintainers/repos?limit=100" 2>/dev/null \
      | python3 -c "import sys,json;print('\n'.join(sorted(x['name'] for x in json.load(sys.stdin))))" 2>/dev/null)
    if [[ -z "$repo_names" ]]; then
      echo "(could not list recipe-maintainers repos — Gitea request failed or returned no data)"
    fi
    # One repo name per line; read line-wise instead of word-splitting a command substitution.
    while IFS= read -r r; do
      [[ -n "$r" ]] || continue
      # Skip the non-recipe repos.
      case "$r" in
        cc-ci | cc-ci-orchestrator | cc-ci-secrets | archived-*) continue ;;
      esac
      # First output line is the open-PR count; the remaining lines are one bullet per PR.
      n=$(curl -fsS "$gitea_api/repos/recipe-maintainers/$r/pulls?state=open&limit=50" 2>/dev/null \
        | python3 -c "import sys,json;d=json.load(sys.stdin);print(len(d));[print(' - #%d %s'%(p['number'],p['title'][:60])) for p in d]" 2>/dev/null)
      pr_count=${n%%$'\n'*}
      # "?" marks a repo whose PR query failed, instead of printing an empty count.
      echo " - $r: ${pr_count:-?} open"
      printf '%s\n' "$n" | tail -n +2
    done <<<"$repo_names"
  fi
  echo
  echo "## Phase / loop status"
  python3 "$PLAN/launch.py" status 2>&1
} > "$REPORT" 2>&1
# Drop the completion flag file; the message below asks the orchestrator to remove it again.
touch /srv/cc-ci/.cc-ci-logs/.overnight-done
log "report written. pinging orchestrator to notify the operator."

# Ask the orchestrator (which has PushNotification) to deliver the morning notification + journal it.
# The message is assembled piecewise for readability; the resulting text is a single line.
MSG="OVERNIGHT RUN COMPLETE — read ${REPORT} and the /upgrade-all summary, "
MSG+="then send the operator a proactive PushNotification with the headline "
MSG+="(did /upgrade-all complete, how many recipe PRs, overall CI state), "
MSG+="append a completion event to cc-ci-plan/JOURNAL.md, "
MSG+="and rm /srv/cc-ci/.cc-ci-logs/.overnight-done."

# Type the text literally (-l) into the orchestrator's tmux target, pause a second, then send
# Enter as a separate key press. tmux errors are discarded and do not stop the script.
tmux send-keys -t "$ORCH" -l -- "$MSG" 2>/dev/null
sleep 1
tmux send-keys -t "$ORCH" Enter 2>/dev/null

log "=== overnight runner finished ==="