From ca6e68c08d29ce9a98a59929b9e0c1c9449e4488 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Mon, 1 Jun 2026 21:46:20 +0000 Subject: [PATCH] feat(orchestrator): fold hourly supervision wake into the watchdog The standalone ai-progress-monitor.sh waker pinged a hardcoded orchestrator session every 15m. Move that into the watchdog loop: ORCH_WAKE_INTERVAL (default 3600s) types the supervision prompt into the live orchestrator session, retrying each tick until it lands so a busy or briefly-absent orchestrator is never interrupted and no hour is skipped. Delete the now-redundant waker script; the prompt file is now driven by the watchdog. Reboot-safe by inheritance (the watchdog is started by cc-ci-loops.service). Co-Authored-By: Claude Opus 4.8 --- .gitignore | 4 ++ cc-ci-plan/ai-progress-monitor-prompt.txt | 46 ++++++++--------------- cc-ci-plan/ai-progress-monitor.sh | 24 ------------ cc-ci-plan/launch.py | 40 ++++++++++++++++++++ 4 files changed, 60 insertions(+), 54 deletions(-) delete mode 100755 cc-ci-plan/ai-progress-monitor.sh diff --git a/.gitignore b/.gitignore index d2c6162..c0bb182 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,7 @@ master-age.txt # Runtime completion markers /cc-ci-plan/phase6-phase7.done + +# Python bytecode cache +__pycache__/ +*.pyc diff --git a/cc-ci-plan/ai-progress-monitor-prompt.txt b/cc-ci-plan/ai-progress-monitor-prompt.txt index 9faab86..96338ad 100644 --- a/cc-ci-plan/ai-progress-monitor-prompt.txt +++ b/cc-ci-plan/ai-progress-monitor-prompt.txt @@ -1,35 +1,21 @@ -You are the cc-ci orchestrator and this is your scheduled wake-up prompt. +You are the cc-ci orchestrator and this is your scheduled hourly wake-up. Supervise the two worker loops, nudge anything stalled, confirm progress, and otherwise stay hands-off. Do NOT make unrelated code changes. -Every time you receive this prompt: - -1. Check the current state of the three worker sessions: -- `cc-ci-builder` -- `cc-ci-adv` -- `cc-ci-assistant` - -2. Use the real workspace and live state: +1. Check live state: - `python3 cc-ci-plan/launch.py status` -- `tmux capture-pane -pt cc-ci-builder` -- `tmux capture-pane -pt cc-ci-adv` -- `tmux capture-pane -pt cc-ci-assistant` -- `ssh cc-ci hostname` +- `tmux capture-pane -pt cc-ci-builder` and `tmux capture-pane -pt cc-ci-adv` (the two loops; both run the claude backend) +- `tmux capture-pane -pt cc-ci-watchdog` (the watchdog that heals/pings the loops) +- backend sanity: `cat /srv/cc-ci/.cc-ci-logs/.loop-backend` should say `claude` and `.loop-model` should say `sonnet` +- `ssh cc-ci hostname` to confirm the CI server is reachable -3. Keep them moving: -- If Builder is stalled or waiting past its stated wait, nudge it to continue phase 5. -- If Adversary is stale, behind, or still parked on old evidence, nudge it to re-orient to the current phase-5 state. -- If Assistant is stalled, nudge it to continue phase 6 and 7. -- If Assistant is not running, restart it via `python3 cc-ci-plan/launch-assistant.py start` with the opencode backend and re-send its assignment. +2. Keep them moving (phase 5 = upgrade-flow verify, plan-phase5-verify-upgrade-flow.md): +- If Builder is stalled or idle past its stated WAITING-UNTIL with no active work, nudge it to continue phase 5. +- If Adversary is stale or parked on old evidence, nudge it to re-orient to the current phase-5 state and verify outstanding claims. +- The watchdog already heals dead sessions and pings on claim()/review() commits — only intervene where it cannot (e.g. a wedged-but-alive loop, or genuine drift). +- If a loop session is missing entirely, restart the loops with: `RESUME_PHASE=1 LOOP_BACKEND=claude LOOP_MODEL=sonnet python3 cc-ci-plan/launch.py start` -4. Completion behavior: -- Treat phase 5 as complete when `/srv/cc-ci/cc-ci/machine-docs/STATUS-5.md` contains `## DONE`. -- Treat phase 6/7 as complete when `/srv/cc-ci-orch/cc-ci-plan/phase6-phase7.done` exists. -- When BOTH are complete: - - stop the watchdog (`tmux kill-session -t cc-ci-watchdog` if it exists) - - tell Builder, Adversary, and Assistant to remain idle - - append a completion event to `/srv/cc-ci-orch/cc-ci-plan/JOURNAL.md` - - then exit successfully +3. Completion: +- Phase 5 is complete when `/srv/cc-ci/cc-ci/machine-docs/STATUS-5.md` contains `## DONE` (every D-gate Adversary-verified PASS, no standing VETO). +- Phase 6/7 are already complete (marker `/srv/cc-ci-orch/cc-ci-plan/phase6-phase7.done` exists). +- When phase 5 reaches DONE: stop the watchdog (`tmux kill-session -t cc-ci-watchdog`), tell Builder and Adversary to remain idle, append a completion event to `/srv/cc-ci-orch/cc-ci-plan/JOURNAL.md`, and send a proactive PushNotification. -5. Do not make unrelated code changes. Only supervise, nudge, restart if needed, and record concise journal notes when appropriate. - -6. Be proactive and decisive. If everything is healthy and active, make no unnecessary changes. -7. If work is already in progress from the previous wake, just continue from the live state instead of restarting the analysis from scratch. +4. Be decisive but minimal. If everything is healthy and active, make no changes — just note the state. If work from a prior wake is still in progress, continue from the live state instead of restarting your analysis. diff --git a/cc-ci-plan/ai-progress-monitor.sh b/cc-ci-plan/ai-progress-monitor.sh deleted file mode 100755 index 5a97acb..0000000 --- a/cc-ci-plan/ai-progress-monitor.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SESSION="${AI_MONITOR_ORCH_SESSION:-cc-ci-orchestrator-oc}" -PROMPT_FILE="${AI_MONITOR_PROMPT_FILE:-/srv/cc-ci-orch/cc-ci-plan/ai-progress-monitor-prompt.txt}" -LOG_FILE="${AI_MONITOR_LOG_FILE:-/srv/cc-ci/.cc-ci-logs/ai-progress-monitor.log}" -SLEEP_SECS="${AI_MONITOR_INTERVAL:-900}" - -mkdir -p "$(dirname "$LOG_FILE")" - -while true; do - ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" - { - printf '[%s] wake tick\n' "$ts" - if tmux has-session -t "$SESSION" 2>/dev/null; then - tmux send-keys -t "$SESSION" -l -- "$(cat "$PROMPT_FILE")" - tmux send-keys -t "$SESSION" C-m - printf '[%s] prompted %s\n' "$ts" "$SESSION" - else - printf '[%s] session missing: %s\n' "$ts" "$SESSION" - fi - } >>"$LOG_FILE" 2>&1 || true - sleep "$SLEEP_SECS" -done diff --git a/cc-ci-plan/launch.py b/cc-ci-plan/launch.py index bfb730d..45c1651 100644 --- a/cc-ci-plan/launch.py +++ b/cc-ci-plan/launch.py @@ -26,6 +26,8 @@ Env (all optional — defaults shown): PHASE_IDX_FILE $LOG_DIR/.phase-idx WATCH_INTERVAL 300 (seconds between heavy checks: phase DONE / heal sessions) SIGNAL_INTERVAL 30 (seconds between handoff / stall checks) + ORCH_WAKE_INTERVAL 3600 (seconds between supervision wakes typed into the orchestrator session) + ORCH_WAKE_PROMPT $PLAN_DIR/ai-progress-monitor-prompt.txt (the supervision prompt) STALL_IDLE 300 (idle seconds without a WAITING-UNTIL before reboot) STALL_GRACE 180 (seconds past a WAITING-UNTIL before reboot) """ @@ -78,6 +80,11 @@ WATCHDOG_SESSION = "cc-ci-watchdog" WATCH_INTERVAL = int(os.environ.get("WATCH_INTERVAL", 300)) SIGNAL_INTERVAL = int(os.environ.get("SIGNAL_INTERVAL", 30)) + +# Hourly supervision wake: the watchdog types this prompt into the orchestrator session +# so it reviews the loops and nudges as needed (replaces the standalone ai-progress-monitor waker). +ORCH_WAKE_INTERVAL = int(os.environ.get("ORCH_WAKE_INTERVAL", 3600)) +ORCH_WAKE_PROMPT = os.environ.get("ORCH_WAKE_PROMPT", f"{PLAN_DIR}/ai-progress-monitor-prompt.txt") STALL_IDLE = int(os.environ.get("STALL_IDLE", 300)) STALL_GRACE = int(os.environ.get("STALL_GRACE", 180)) @@ -438,6 +445,31 @@ def heal_orchestrator(): log(f"orchestrator not running — restarting via {ORCH_LAUNCHER}") subprocess.run([ORCH_LAUNCHER, "start"], capture_output=True) +def wake_orchestrator(): + """Hourly supervision nudge: type the progress-monitor prompt into the orchestrator + session so it reviews the loops. Returns True when the wake was delivered (or is moot), + False when it should be retried on a later tick. + + Skips (retry later) if the orchestrator is absent — heal_orchestrator restarts it — or + actively working, so we never interrupt a turn; the wake lands the moment it goes idle. + """ + if not WATCH_ORCHESTRATOR: + return True # feature off — treat as handled so the timer doesn't spin + if not session_alive(ORCH_SESSION): + return False + if ACTIVE_RE.search(capture_pane(ORCH_SESSION, 25)): + return False # busy — don't interrupt; retry when idle + try: + msg = " ".join(Path(ORCH_WAKE_PROMPT).read_text().split()) + except FileNotFoundError: + log(f"orchestrator wake skipped — prompt file missing: {ORCH_WAKE_PROMPT}") + return True + if not msg: + return True + log(f"waking orchestrator ({ORCH_SESSION}) for scheduled supervision pass") + ping_session(ORCH_SESSION, msg, submit_key=_SUBMIT) + return True + # ── handoff signalling ──────────────────────────────────────────────────────── _last_sha = "" @@ -529,10 +561,17 @@ def watchdog_loop(): f"seq='{all_ids()}' signal={SIGNAL_INTERVAL}s heavy={WATCH_INTERVAL}s") elapsed = WATCH_INTERVAL # force a heavy check on the first tick + wake_elapsed = 0 # first orchestrator wake fires after a full interval, not at startup while True: handoff_check() stall_check() + if wake_elapsed >= ORCH_WAKE_INTERVAL: + # Reset only once the wake actually lands; if the orchestrator is busy/absent, + # leave the timer tripped so we retry each tick until it's idle. + if wake_orchestrator(): + wake_elapsed = 0 + if elapsed >= WATCH_INTERVAL: elapsed = 0 idx = cur_idx() @@ -563,6 +602,7 @@ def watchdog_loop(): time.sleep(SIGNAL_INTERVAL) elapsed += SIGNAL_INTERVAL + wake_elapsed += SIGNAL_INTERVAL def start_watchdog(): if session_alive(WATCHDOG_SESSION):