From dea6359bcd4aea8e6a6b6f9752ba9018537d27d2 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Fri, 12 Jun 2026 15:56:03 +0000 Subject: [PATCH] plan: queue proxy and ghost follow-up phases --- cc-ci-plan/JOURNAL.md | 20 +++++ cc-ci-plan/launch.py | 41 ++++++---- cc-ci-plan/plan-phase-ghost-reeval.md | 70 ++++++++++++++++ ...n-phase-pvcheck-post-proxy-verification.md | 67 +++++++++++++++ cc-ci-plan/plan-phase-pvfix-swarm-proxy.md | 81 +++++++++++++++++++ 5 files changed, 262 insertions(+), 17 deletions(-) create mode 100644 cc-ci-plan/plan-phase-ghost-reeval.md create mode 100644 cc-ci-plan/plan-phase-pvcheck-post-proxy-verification.md create mode 100644 cc-ci-plan/plan-phase-pvfix-swarm-proxy.md diff --git a/cc-ci-plan/JOURNAL.md b/cc-ci-plan/JOURNAL.md index 00f455e..c2d0567 100644 --- a/cc-ci-plan/JOURNAL.md +++ b/cc-ci-plan/JOURNAL.md @@ -558,3 +558,23 @@ session cc-ci-orchestrator-stale can be killed; recipe-mirrors org still private though `opencode attach` has no `--model` flag: it now injects `OPENCODE_CONFIG_CONTENT` for the session. Verified live: `cc-ci-orchestrator-oc` tmux session running on `backend=opencode model=openai/gpt-5.4`, visible through the shared web server. + +## 2026-06-12 ~15:55 — OpenCode GPT-5.4 loops resumed; durable proxy phases queued +- Operator switched orchestration from Claude to OpenCode/GPT-5.4 and requested the remaining work be + made explicit: durable Swarm proxy fix, post-proxy verification, and ghost re-evaluation. +- Added phase plans: + `plan-phase-pvfix-swarm-proxy.md`, `plan-phase-pvcheck-post-proxy-verification.md`, + `plan-phase-ghost-reeval.md`. +- Persisted phase queue is now: + `rcust;shot;lvl5;bsky;dstamp;mailu;kuma;mailu;drone;cfold;pvfix;pvcheck;ghost`, with idx still + `9` (`cfold`) so the loops finish the already-started custom-folder phase before the proxy and ghost + follow-ups. +- Replaced stale Claude `cc-ci-orchestrator` tmux session (was parked on a weekly-limit banner) with an + OpenCode session using `openai/gpt-5.4`; builder/adversary restarted with + `LOOP_BACKEND=opencode LOOP_MODEL=openai/gpt-5.4 ADV_MODEL=openai/gpt-5.4 RESUME_PHASE=1`. +- Watchdog bug fixed in `launch.py`: it now treats only the configured `ORCH_SESSION` tmux session as + orchestrator liveness and restarts it if the pane command does not match the expected backend. This + prevents stale Claude one-shot/report sessions from masking a missing OpenCode orchestrator. +- Verified live tmux mapping: `cc-ci-builder`, `cc-ci-adv`, and `cc-ci-orchestrator` are all `opencode`; + `cc-ci-watchdog` is running. The watchdog will hourly-wake `cc-ci-orchestrator` via the existing + `ORCH_WAKE_INTERVAL=3600` path and will apply the existing limit-window nudge/restart handling. diff --git a/cc-ci-plan/launch.py b/cc-ci-plan/launch.py index 0fa3298..60f32dc 100644 --- a/cc-ci-plan/launch.py +++ b/cc-ci-plan/launch.py @@ -152,6 +152,13 @@ def session_alive(name): capture_output=True ).returncode == 0 +def session_command(name): + r = subprocess.run( + ["tmux", "display-message", "-p", "-t", name, "#{pane_current_command}"], + capture_output=True, text=True + ) + return r.stdout.strip() if r.returncode == 0 else "" + def kill_session(name): subprocess.run(["tmux", "kill-session", "-t", name], capture_output=True) @@ -573,26 +580,21 @@ def stall_check(): # ── orchestrator healing ────────────────────────────────────────────────────── def orchestrator_alive(): + """True only for the configured orchestrator tmux session. + + Older versions treated any non-loop Claude process as an orchestrator, which meant idle + report/assistant sessions could mask a missing or stale orchestrator. The watchdog wakes + and heals ORCH_SESSION, so ORCH_SESSION is the only valid liveness signal here. """ - True if an orchestrator process is running anywhere. - Conflict-safety: never launch a second orchestrator resuming the same session - (double-resume causes "thinking blocks cannot be modified" crashes). - """ - for line in subprocess.run("pgrep -x claude || true", shell=True, - capture_output=True, text=True).stdout.splitlines(): - pid = line.strip() - if not pid: - continue - try: - cmdline = Path(f"/proc/{pid}/cmdline").read_bytes().decode(errors="replace").replace("\0", " ") - # Skip the loop sessions and the upgrader — they're not the orchestrator. - if re.search(r"--remote-control\s+'?cc-ci-(builder|adv|upgrader)'?", cmdline): - continue - return True - except Exception: - pass return session_alive(ORCH_SESSION) +def orchestrator_backend_mismatch(): + cmd = session_command(ORCH_SESSION) + if not cmd: + return False + expected = "opencode" if BACKEND == "opencode" else "claude" + return cmd != expected + def heal_orchestrator(): if not WATCH_ORCHESTRATOR: return @@ -601,6 +603,11 @@ def heal_orchestrator(): if orchestrator_alive(): if session_alive(ORCH_SESSION): + if orchestrator_backend_mismatch(): + log(f"orchestrator ({ORCH_SESSION}) is {session_command(ORCH_SESSION)!r}, expected {BACKEND} — kill + restart") + kill_session(ORCH_SESSION) + subprocess.run([ORCH_LAUNCHER, "start"], capture_output=True) + return pane = capture_pane(ORCH_SESSION, 25) if ACTIVE_RE.search(pane): return diff --git a/cc-ci-plan/plan-phase-ghost-reeval.md b/cc-ci-plan/plan-phase-ghost-reeval.md new file mode 100644 index 0000000..2b04b99 --- /dev/null +++ b/cc-ci-plan/plan-phase-ghost-reeval.md @@ -0,0 +1,70 @@ +# Phase `ghost` — re-evaluate ghost after proxy fix and leave one clean PR + +**Mission:** re-evaluate the `ghost` upgrade failure after the Swarm proxy/IPAM infra +confound has been removed, then leave exactly one operator-ready ghost PR: green if the +recipe is sound, or clearly explained with the minimum required recipe fix/comment if a real +Ghost/MySQL upgrade issue remains. + +State files live under `machine-docs/`: `STATUS-ghost.md`, `BACKLOG-ghost.md`, +`REVIEW-ghost.md`, `JOURNAL-ghost.md`. + +## Context + +The 2026-06-12 `/upgrade-all` recorded `ghost` as the only failed recipe, but the evidence +was mixed: + +- One failure was definitely infra: shared `proxy` overlay VIP exhaustion left tasks stuck + in Swarm `New` state. +- A later failure may be recipe-specific: MySQL 8.0 to 8.4 data-dir upgrade timing under + Swarm's default update monitor, producing `UpdateStatus=paused` under load. +- A previous run on 2026-06-05 passed the Ghost/MySQL path under lighter load. +- Duplicate ghost subagent churn may have left branch/PR/comment state messy. + +Existing focused plan/background: `/srv/cc-ci/cc-ci-plan/plan-ghostpr-debug-fix.md`. + +## Required Work + +1. **Inventory PR state.** On `recipe-maintainers/ghost`, list all open PRs and branches + related to the upgrade. Identify the correct PR, expected to be ghost PR `#4`, and close + or clearly mark any duplicate only if it is truly superseded. Never merge recipe PRs. +2. **Separate infra from recipe behavior.** After `pvfix` and `pvcheck`, trigger a fresh + `!testme` on the correct ghost PR and watch the run. Do not count pre-proxy failures as + current recipe evidence. +3. **If green:** record that the prior failure was infra/timing-confounded, ensure no stale + stacks/volumes remain, and leave the PR ready for operator review. +4. **If red for a real recipe reason:** make the smallest recipe PR change needed. The + suspected fix is a longer Swarm update monitor/start grace around the MySQL 8.0 to 8.4 + data-dir migration, e.g. `update_config.monitor: 300s` and related minimal service health + timing. Validate the hypothesis with logs; do not cargo-cult timing knobs. +5. **If the test is genuinely stale:** default recipe-upgrade policy applies: leave an + explanatory PR comment for the operator. Do not edit cc-ci tests in this phase unless the + operator explicitly asks for a test-update phase. +6. **Deduplicate and clean up.** Ensure exactly one relevant open ghost upgrade PR remains, + comments explain the final state, and no `ghos-*`/`dev-ghost` stacks or volumes leak. + +## Gates + +**M1 — State inventory and clean retry.** Builder documents PR/branch/comment/build state, +identifies the correct PR, and runs one clean post-proxy `!testme`. Adversary verifies that +pre-proxy infra failures were not misclassified as current recipe failures. + +**M2 — Operator-ready outcome.** The ghost PR is green, or it has the minimal justified +recipe fix/comment and a clear current blocker. Duplicate PR/branch mess is resolved and +no ghost resources leak. Adversary verifies live PR state, build evidence, and cleanup. + +## Guardrails + +- Recipe PRs are never merged by agents. +- Do not weaken tests to get green. +- Do not re-run ghost during proxy maintenance or while `cfold` owns a broad CI sweep. +- Keep iterations bounded: at most three fresh post-proxy `!testme` attempts unless the + operator authorizes more. +- Preserve useful failure evidence in PR comments and `machine-docs/STATUS-ghost.md`. + +## Definition of Done + +Exactly one ghost upgrade PR is operator-ready, with a fresh post-proxy verdict and clear +classification of the 2026-06-12 failure. Any real recipe fix is minimal and verified; +otherwise the PR is green or has a precise operator-facing explanation. Adversary has +signed off on M1 and M2 in `machine-docs/REVIEW-ghost.md`; Builder writes `## DONE` only +after both gates have fresh Adversary PASSes. diff --git a/cc-ci-plan/plan-phase-pvcheck-post-proxy-verification.md b/cc-ci-plan/plan-phase-pvcheck-post-proxy-verification.md new file mode 100644 index 0000000..4934082 --- /dev/null +++ b/cc-ci-plan/plan-phase-pvcheck-post-proxy-verification.md @@ -0,0 +1,67 @@ +# Phase `pvcheck` — post-proxy verification and regression proof + +**Mission:** prove that the durable `proxy` overlay fix is actually safe in production: +the network has the intended headroom, routing works, real recipe CI still deploys through +Traefik, and the IPAM/VIP exhaustion signature no longer threatens the weekly upgrade path. + +State files live under `machine-docs/`: `STATUS-pvcheck.md`, `BACKLOG-pvcheck.md`, +`REVIEW-pvcheck.md`, `JOURNAL-pvcheck.md`. + +## Preconditions + +- Phase `pvfix` is `## DONE`. +- `docker network inspect proxy` shows the intended `/16` subnet. +- Core control-plane services are back after the proxy recreation. + +## Verification Scope + +1. **Host/network facts.** Capture and record: + - `docker network inspect proxy` subnet and endpoint count + - `docker stack ls` + - Traefik, Drone, bridge, dashboard, and report service health + - recent dockerd journal lines for VIP/IPAM errors +2. **Routing checks.** Verify externally visible routes still work: + - Drone UI/API route + - dashboard route + - bridge/poller health if exposed locally + - report site route +3. **Real deploy proof.** Trigger one low-risk enrolled recipe `!testme` or equivalent + harness run that joins `proxy`, completes all expected tiers, and tears down cleanly. + Prefer a small stable recipe unless `cfold` needs a broader sweep at the same time. Do + not duplicate an active `cfold` sweep. +4. **Allocator-headroom proof.** Run a bounded reproduction derived from + `plan-proxy-vip-exhaustion-fix.md`: + - deploy/remove a small batch of throwaway published-port stacks, preferably in the same + concurrent pattern that previously leaked endpoints + - confirm leaked endpoint count, if any, is tiny relative to `/16` headroom + - confirm no fresh `could not find an available IP while allocating VIP` errors + - prune throwaway networks/stacks and verify no residue +5. **Upgrade safety check.** Confirm the `/upgrade-all` Step-0 guard still exists and would + detect/recover the known VIP exhaustion signature if it ever recurs. + +## Gates + +**M1 — Control plane and routing verified.** All cc-ci control-plane routes/services are +healthy after the proxy recreation, with before/after evidence in `STATUS-pvcheck.md`. +Adversary verifies independently from live commands, not just Builder notes. + +**M2 — Real CI and allocator proof verified.** At least one real recipe deploy/test passes +through `proxy` and tears down cleanly; bounded allocator reproduction does not threaten the +new `/16`; no VIP exhaustion signature remains in fresh logs. Adversary verifies all claims +and checks for leaks. + +## Guardrails + +- Do not run a large recipe sweep here if `cfold` already owns that proof. This phase is the + proxy-specific post-change proof. +- Keep concurrency bounded. The point is to prove headroom, not stress the host into a new + unrelated failure. +- Clean up every throwaway stack/network. Zero residue is part of the acceptance criteria. +- If any core route is down, stop new test traffic and fix routing first. + +## Definition of Done + +Control-plane routes are healthy, one real proxy-joining recipe CI run succeeds and cleans +up, bounded allocator reproduction is documented, fresh logs show no VIP exhaustion, and +Adversary has signed off on M1 and M2 in `machine-docs/REVIEW-pvcheck.md`. Builder writes +`## DONE` only after both gates have fresh Adversary PASSes. diff --git a/cc-ci-plan/plan-phase-pvfix-swarm-proxy.md b/cc-ci-plan/plan-phase-pvfix-swarm-proxy.md new file mode 100644 index 0000000..52d2e4c --- /dev/null +++ b/cc-ci-plan/plan-phase-pvfix-swarm-proxy.md @@ -0,0 +1,81 @@ +# Phase `pvfix` — durable Swarm `proxy` overlay VIP exhaustion fix + +**Mission:** eliminate the recurring Docker Swarm `proxy` overlay VIP exhaustion class by +making the shared `proxy` network large enough for the cc-ci workload, while preserving the +already-added per-run safety net. This is an infra phase: coordinate carefully, because +recreating `proxy` briefly disrupts routing for Traefik, Drone, dashboard, bridge, reports, +and any live recipe deploys. + +State files live under `machine-docs/`: `STATUS-pvfix.md`, `BACKLOG-pvfix.md`, +`REVIEW-pvfix.md`, `JOURNAL-pvfix.md`. + +## Context + +The 2026-06-12 weekly upgrade exposed a real infra failure mode: + +- The shared `proxy` overlay was using Docker's default `/24` allocation (`10.0.1.0/24`, + 254 VIPs). +- Every recipe deploy joins `proxy` for Traefik routing. +- Concurrent stack removal can race Swarm endpoint GC (`key modified`, `network proxy + remove failed`) and leak endpoint/VIP allocations. +- After 11 days of dockerd uptime the allocator exhausted the `/24`, producing + `could not find an available IP while allocating VIP` and leaving tasks stuck in Swarm + `New` state. +- A docker restart rebuilt allocator state and cleared the symptom, proving the issue was + infra, not the affected recipes. + +Existing runbook/background: `/srv/cc-ci/cc-ci-plan/plan-proxy-vip-exhaustion-fix.md`. + +## Required Fix + +1. Confirm the current host state is quiet enough for a disruptive network maintenance + window. No live `/upgrade-all`, no active recipe `!testme` runs, no phase CI sweep in + progress. +2. Update `nix/modules/swarm.nix` in the cc-ci repo so the `proxy` overlay is created with + an explicit `/16`, for example: + + ```bash + docker network create --driver overlay --attachable --subnet 10.10.0.0/16 proxy + ``` + + Use a subnet clear of `ingress` and existing Docker allocations. If `10.10.0.0/16` is + unsuitable on the live host, choose a different documented `/16` and explain why. +3. Keep the upgrade Step-0 safety net in place: prune leaked overlays and restart Docker + when VIP-allocation failure signatures are detected. The durable `/16` fix is headroom; + the guard is still useful as a future self-healing belt-and-braces mechanism. +4. Recreate the live `proxy` network safely. The network cannot be resized in place. + Plan the exact live-host steps before executing them. The expected sequence is: + - capture current `proxy` inspect output and joined services + - stop or drain live recipe stacks as needed + - remove/recreate `proxy` with the `/16` + - redeploy/reconcile Traefik and the cc-ci control-plane services so they rejoin + - run `nixos-rebuild switch` using the canonical live cc-ci deploy checkout +5. Commit and push the cc-ci repo change. Do not commit secrets. Do not merge recipe PRs. + +## Gates + +**M1 — Plan and patch ready.** Builder produces the minimal `swarm.nix` patch, records the +exact maintenance procedure, and proves from live inspection that the chosen `/16` is safe. +Adversary cold-reviews the patch and live procedure before any disruptive action. + +**M2 — Live durable fix applied.** The live host has `proxy` recreated as `/16`, the NixOS +configuration has been switched, and Traefik/Drone/dashboard/bridge/reports are reachable. +Adversary verifies from the host that `docker network inspect proxy` reports the intended +subnet and that the control-plane services are healthy. + +## Guardrails + +- Maintenance window only. Do not recreate `proxy` while recipe CI, `/upgrade-all`, or + `cfold` sweep runs are active. +- No force-pushes. No secret values in logs, plans, commits, or comments. +- Prefer the smallest host change: one explicit `--subnet` plus the minimum live + reconciliation needed to restore routing. +- If the host topology differs from the runbook, stop and record the actual state before + changing anything. + +## Definition of Done + +`proxy` is explicitly configured and live as a `/16`, the change is committed and pushed to +cc-ci, core routes are healthy after the maintenance action, and Adversary has signed off on +M1 and M2 in `machine-docs/REVIEW-pvfix.md`. Builder writes `## DONE` only after both gates +have fresh Adversary PASSes.