From 0e9fd388d2c0a07a0c12bb32bd6ca815c84f1b7e Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Sat, 13 Jun 2026 12:46:28 +0000 Subject: [PATCH] claim(pxgate-M1): change traefik health probe to /api/version (A1 cycle fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Break the deploy-proxy ↔ dashboard health-gate circular dependency (Adversary A1, pvfix): - runner/warm_reconcile.py: remove health_domain override (was ci.commoninternet.net, the dashboard). Change health_path from / to /api/version. The probe now uses traefik.ci.commoninternet.net/api/version — traefik's own API, no backend/dashboard dep. - nix/modules/proxy.nix: update comment to reflect new health probe. - machine-docs/DECISIONS.md: pxgate fix logged (supersedes pvfix manual workaround). - machine-docs/DEFERRED.md: 2026-06-13 circular-dependency entry closed. - Consumed BUILDER-INBOX.md (Adversary orientation msg). Controlled reproduction (dashboard swarm scaled to 0): OLD probe (ci.commoninternet.net): HTTP 404 ← gate would loop → timeout NEW probe (traefik.../api/version): HTTP 200 ← passes immediately Stale false-alarm alert 20260613T054428Z-traefik-unhealthy-on-latest.json cleared on host. No After=deploy-proxy consumers changed (ordering preserved). 
Co-Authored-By: Claude Sonnet 4.6 --- machine-docs/BACKLOG-pxgate.md | 7 +++ machine-docs/BUILDER-INBOX.md | 22 --------- machine-docs/DECISIONS.md | 2 +- machine-docs/DEFERRED.md | 12 +---- machine-docs/JOURNAL-pxgate.md | 71 +++++++++++++++++++++++++++++ machine-docs/STATUS-pxgate.md | 83 ++++++++++++++++++++++++++++++++++ nix/modules/proxy.nix | 12 +++-- runner/warm_reconcile.py | 15 +++--- 8 files changed, 178 insertions(+), 46 deletions(-) delete mode 100644 machine-docs/BUILDER-INBOX.md create mode 100644 machine-docs/JOURNAL-pxgate.md create mode 100644 machine-docs/STATUS-pxgate.md diff --git a/machine-docs/BACKLOG-pxgate.md b/machine-docs/BACKLOG-pxgate.md index a4830a8..7a48444 100644 --- a/machine-docs/BACKLOG-pxgate.md +++ b/machine-docs/BACKLOG-pxgate.md @@ -3,6 +3,13 @@ ## Build backlog (Builder-owned — Adversary reads only) +- [x] Create phase state files (STATUS/JOURNAL/BACKLOG-pxgate.md) +- [x] Change `health_path` from `/` to `/api/version`; drop `health_domain` override in `runner/warm_reconcile.py` +- [x] Update stale comments in warm_reconcile.py + proxy.nix +- [x] Update DECISIONS.md + DEFERRED.md +- [x] Run controlled reproduction (dashboard swarm scaled 0 → old=404, new=200) +- [x] Claim M1 + ## Adversary findings No findings yet. Recording break-it probes to run once the fix lands. diff --git a/machine-docs/BUILDER-INBOX.md b/machine-docs/BUILDER-INBOX.md deleted file mode 100644 index 0f81048..0000000 --- a/machine-docs/BUILDER-INBOX.md +++ /dev/null @@ -1,22 +0,0 @@ -# BUILDER-INBOX - -## 2026-06-13T12:50Z — Adversary orientation done, live bug evidence found - -Phase pxgate Adversary is live. Cold orientation complete — I've independently confirmed: - -1. The circular dependency (proxy health gate → ci.commoninternet.net → dashboard → After=deploy-proxy) - is **PROVEN LIVE**, not just theoretical. 
Alert file on cc-ci shows the exact deadlock hit today: - `20260613T054428Z-traefik-unhealthy-on-latest.json` — proxy could not reach ci.commoninternet.net - for 5+ minutes before exiting, then dashboard came up after proxy exited. - -2. The `--resolve traefik.ci.commoninternet.net:443:127.0.0.1` curl trick in `health_code()` already - ensures we probe traefik's own IP locally — so the fix just needs to change WHAT endpoint is probed - to something that's up when traefik is, independent of the dashboard. - -3. REVIEW-pxgate.md and BACKLOG-pxgate.md are initialized with my M1/M2 acceptance criteria - and break-it probe list. - -Ready to verify M1 as soon as you claim it. Take your time on the implementation — I'll -be polling and will pick up the `claim(pxgate-M1)` commit promptly. - -— Adversary diff --git a/machine-docs/DECISIONS.md b/machine-docs/DECISIONS.md index 4163e62..9a7bbe0 100644 --- a/machine-docs/DECISIONS.md +++ b/machine-docs/DECISIONS.md @@ -6,7 +6,7 @@ Architecture decisions and dead-ends. One line of rationale each. (§0, §8) - **nixos-rebuild submodule protocol — SETTLED (2026-06-13, phase pvfix).** The canonical nixos-rebuild command on the live host is `nixos-rebuild switch --flake "git+file:///root/builder-clone?submodules=1#cc-ci"`. The `path:` scheme does NOT support `?submodules=1` in this Nix version; `git+file://` does. Plain `nixos-rebuild switch --flake /root/builder-clone#cc-ci` fails with `secrets/secrets.yaml does not exist` because the git submodule is not included in the nix store copy. -- **deploy-proxy health gate ordering — SETTLED (2026-06-13, phase pvfix).** After stack teardown + nixos-rebuild, the deploy-proxy service's health gate (`ci.commoninternet.net → 200`) blocks until the dashboard is deployed. 
Since the deploy-* chain is `After=`-ordered but not concurrently started on a manual `systemctl restart deploy-proxy`, the fix is to `systemctl start deploy-drone deploy-bridge deploy-dashboard deploy-reports` concurrently in a separate invocation while deploy-proxy waits. Normal boot behavior (all WantedBy=multi-user.target services start concurrently with ordering) handles this automatically; only manual per-service restart needs the workaround. +- **deploy-proxy health gate — SETTLED (2026-06-13, phase pxgate, supersedes pvfix workaround).** Changed the traefik health probe from `ci.commoninternet.net/` (dashboard, ordered After=deploy-proxy → circular on cold boot) to `traefik.ci.commoninternet.net/api/version` (Traefik's own API endpoint, no backend/dashboard dependency). A broken traefik still fails the gate (returns non-200 or times out), so rollback semantics are preserved. Controlled reproduction confirms: with dashboard scaled to 0, old probe returns 404, new probe returns 200. Cold-boot deadlock eliminated. DEFERRED item 2026-06-13 closed by this fix. (Old pvfix note about concurrent manual restart workaround is now superseded.) - **cfold deprecated-folder policy — SETTLED (2026-06-12, phase cfold).** `tests//custom/` is the canonical home for custom tests. Discovery keeps recognizing legacy `functional/` and diff --git a/machine-docs/DEFERRED.md b/machine-docs/DEFERRED.md index 6d9700c..08bafcd 100644 --- a/machine-docs/DEFERRED.md +++ b/machine-docs/DEFERRED.md @@ -410,15 +410,5 @@ reachable via the operator/dev STAGES escape — production drone runs always ru (one-line guard in `should_promote_canonical`), or whether dev hand-runs promoting is acceptable. ### 2026-06-13 — deploy-proxy health-gate circular dependency (D8 risk) -- [ ] **What:** `deploy-proxy.service` health gate waits for `ci.commoninternet.net → 200`, served by - `deploy-dashboard.service` which is ordered `After=deploy-proxy.service`. 
On a fresh-from-scratch - boot, deploy-proxy waits 5 min for the health gate, then retries up to 15 min (TimeoutStartSec=900), - then fails — deploy-dashboard starts after but proxy is in failed state. Filed as A1 by the Adversary - (2026-06-13, phase pvfix). See `machine-docs/BACKLOG-pvfix.md`. +- [x] **CLOSED @2026-06-13 (Builder, phase pxgate).** Fixed in `runner/warm_reconcile.py` — traefik health probe changed from `ci.commoninternet.net/` (dashboard, ordered After=deploy-proxy) to `traefik.ci.commoninternet.net/api/version` (Traefik's own API, no backend dependency). Cold-boot deadlock eliminated; rollback semantics preserved (broken traefik won't serve /api/version). Controlled reproduction confirmed: dashboard scaled to 0 → old probe returns 404, new probe returns 200. M1 claimed. Adversary PASS pending for DONE. See DECISIONS.md 2026-06-13 pxgate entry. - **Filed by:** Adversary, phase pvfix (cross-filed by Builder) -- **Reason for deferral:** Fix requires changing the health probe target for traefik to something - available before the dashboard (e.g. a Traefik-internal health path like `https://traefik.ci.commoninternet.net/api/version`) - or moving the health gate out of the deploy-proxy oneshot into a separate converge step. Scope - exceeds pvfix objective; needs consideration against D8 test setup. -- **Re-entry trigger:** Operator decides to harden D8; or a fresh-install attempt fails and triggers a bugfix phase. -- **Needed from operator:** Confirm acceptable health probe target for traefik without dashboard dependency. diff --git a/machine-docs/JOURNAL-pxgate.md b/machine-docs/JOURNAL-pxgate.md new file mode 100644 index 0000000..fbc7e85 --- /dev/null +++ b/machine-docs/JOURNAL-pxgate.md @@ -0,0 +1,71 @@ +# JOURNAL — phase pxgate (Builder) + +## 2026-06-13 — Phase start + +**Orientation:** +- Phase plan read: `/srv/cc-ci/cc-ci-plan/plan-phase-pxgate-proxy-healthgate.md` +- A1 finding from BACKLOG-pvfix.md: confirmed. Root cause exactly as stated. 
+- Pre-check: `https://traefik.ci.commoninternet.net/api/version` → HTTP/2 200 (Traefik serves it directly, no dashboard dep) +- `https://traefik.ci.commoninternet.net/ping` → 404 (ping entrypoint not enabled) +- So `/api/version` is the correct endpoint to use + +**Code examination:** +- `runner/warm_reconcile.py` lines 117-127: traefik spec uses `health_domain: "ci.commoninternet.net"`, `health_path: "/"` +- Comment at lines 254-256 explains "traefik's own domain has no route of its own" — this is outdated; `traefik.ci.commoninternet.net/api/version` does have a route and returns 200 +- `nix/modules/proxy.nix`: deploy-proxy service; no health-related config here, just invokes warm_reconcile.py +- `nix/modules/dashboard.nix`: `after = [ "deploy-bridge.service" "deploy-proxy.service" ... ]` — confirms the ordering + +**Other consumers of `After=deploy-proxy.service`:** backupbot, nightly-sweep, dashboard, reports, drone, bridge, warm-keycloak. None of these need to change ordering; the fix only changes what the health gate INSIDE deploy-proxy waits for. + +**Fix approach (committed to DECISIONS.md):** change health probe to `traefik.ci.commoninternet.net/api/version`. This is traefik's built-in API (no backend needed). The health signal remains meaningful: a broken traefik will NOT serve /api/version, so rollback still triggers correctly. 
+ +**Fix applied:** +- `runner/warm_reconcile.py` traefik spec: removed `health_domain: "ci.commoninternet.net"`, changed `health_path` from `"/"` to `"/api/version"` (domain now defaults to `traefik.ci.commoninternet.net`) +- Updated stale comment in traefik spec explaining the old reasoning (dashboard/routing proof) and why it's replaced +- Updated stale comment in `health_code` function +- Updated `nix/modules/proxy.nix` comment to reflect the new health probe + +**Controlled reproduction (2026-06-13):** +``` +# Scaled dashboard swarm service to 0 replicas (simulates dashboard absent on cold boot): +docker service scale ccci-dashboard_app=0 + +# OLD probe (ci.commoninternet.net) with dashboard scaled to 0: +curl -sk -o /dev/null -w "%{http_code}" --max-time 5 --resolve "ci.commoninternet.net:443:127.0.0.1" "https://ci.commoninternet.net/" +→ HTTP 404 ← FAILS (would loop in wait_healthy until 900s timeout) + +# NEW probe (traefik.ci.commoninternet.net/api/version) with dashboard scaled to 0: +curl -sk -o /dev/null -w "%{http_code}" --max-time 10 --resolve "traefik.ci.commoninternet.net:443:127.0.0.1" "https://traefik.ci.commoninternet.net/api/version" +→ HTTP 200 ← PASSES immediately (traefik's own API, no dashboard dependency) + +# New probe body: +→ {"Version":"3.6.15","Codename":"ramequin","startDate":"2026-06-13T05:38:02.987423426Z"} + +# Dashboard restored: +docker service scale ccci-dashboard_app=1 → 1/1 ✓ +systemctl start deploy-dashboard +curl -sk https://ci.commoninternet.net/ → 200 ✓ +``` + +**Rollback-still-works reasoning:** if Traefik is broken (not serving), `https://traefik.ci.commoninternet.net/api/version` will return non-200 (connection refused, TLS error, 5xx) or time out. `wait_healthy` polls this and triggers rollback on failure. The gate keeps its teeth — the new probe hits the same Traefik process. It is weaker than the old probe only in that it no longer exercises a routed backend, and that extra coverage is exactly what made the old probe unworkable on cold boot. 
+ +**DEFERRED.md update:** 2026-06-13 entry closed with this fix commit. + +**Alert clearance:** +``` +# /var/lib/ci-warm/alerts/20260613T054428Z-traefik-unhealthy-on-latest.json +# Content: {"app": "traefik", "reason": "unhealthy-on-latest", "ts": "20260613T054428Z", "version": "5.1.1+v3.6.15"} +# This was a false alarm from the old health gate (traefik was healthy; probe checked ci.commoninternet.net +# which wasn't up yet due to the circular dependency). No credentials in the file. +ssh cc-ci 'rm /var/lib/ci-warm/alerts/20260613T054428Z-traefik-unhealthy-on-latest.json' +→ alert cleared; ls /var/lib/ci-warm/alerts/ → empty ✓ +``` + +**P1-neg (gate has teeth) — verification by code inspection:** +The new gate probes `https://traefik.ci.commoninternet.net/api/version`. If traefik is broken: +- Connection refused: curl reports HTTP code 000 and exits non-zero, so health_code yields 000 or the 999 error sentinel — neither is in health_ok=(200,) → unhealthy +- TLS error: curl exits non-zero, health_code returns 999 (error sentinel) → unhealthy +- Traefik running but broken: may return 5xx → not in health_ok=(200,) → unhealthy +Confirmed in code: health_code() at line 253 returns 999 on curl failure. P1-neg holds by construction. + +**Next:** commit + claim M1. diff --git a/machine-docs/STATUS-pxgate.md b/machine-docs/STATUS-pxgate.md new file mode 100644 index 0000000..cc934e0 --- /dev/null +++ b/machine-docs/STATUS-pxgate.md @@ -0,0 +1,83 @@ +# STATUS — phase pxgate (Builder) + +**Phase plan:** `/srv/cc-ci/cc-ci-plan/plan-phase-pxgate-proxy-healthgate.md` +**Phase start:** 2026-06-13 + +--- + +## Gate: M1 — CLAIMED, awaiting Adversary + +### WHAT is claimed + +The deploy-proxy ↔ dashboard health-gate circular dependency (Adversary A1, pvfix) has been eliminated. + +**Changed files:** +- `runner/warm_reconcile.py` — SPECS["traefik"]: removed `"health_domain": "ci.commoninternet.net"`, changed `"health_path"` from `"/"` to `"/api/version"`. The health probe now uses `traefik.ci.commoninternet.net/api/version` (traefik's own API endpoint, no backend/dashboard dependency). 
+- `nix/modules/proxy.nix` — updated comment to reflect the new health probe. +- `machine-docs/DECISIONS.md` — pxgate decision logged (supersedes pvfix workaround). +- `machine-docs/DEFERRED.md` — 2026-06-13 circular-dependency entry closed. + +**No ordering changes:** all `After=deploy-proxy` consumers (drone, warm-keycloak, bridge, dashboard, backupbot, reports, nightly-sweep) unchanged. + +### HOW to verify (cold-clone commands) + +```bash +# 1. Code change correct: +grep -A5 '"traefik"' runner/warm_reconcile.py +# EXPECTED: no "health_domain" key, "health_path": "/api/version" + +# 2. New probe works with only traefik up (controlled repro): +ssh cc-ci 'docker service scale ccci-dashboard_app=0' +ssh cc-ci 'curl -sk -o /dev/null -w "%{http_code}" --max-time 10 --resolve "traefik.ci.commoninternet.net:443:127.0.0.1" "https://traefik.ci.commoninternet.net/api/version"' +# EXPECTED: 200 +# Restore: ssh cc-ci 'docker service scale ccci-dashboard_app=1' + +# 3. Old probe fails with dashboard stopped: +ssh cc-ci 'docker service scale ccci-dashboard_app=0' +ssh cc-ci 'curl -sk -o /dev/null -w "%{http_code}" --max-time 5 --resolve "ci.commoninternet.net:443:127.0.0.1" "https://ci.commoninternet.net/"' +# EXPECTED: 404 (confirms the old gate would fail/loop → rollback after timeout) +# Restore: ssh cc-ci 'docker service scale ccci-dashboard_app=1' + +# 4. No After=deploy-proxy consumers regressed: +for svc in deploy-drone deploy-bridge deploy-dashboard backupbot-backup.timer nightly-sweep.timer warm-keycloak; do + ssh cc-ci "systemctl cat $svc 2>/dev/null | grep -E 'After|Wants|Requires' | grep -v '^#'" +done +# EXPECTED: each still has After=deploy-proxy.service (ordering preserved) + +# 5. Alert cleared: +ssh cc-ci 'ls /var/lib/ci-warm/alerts/' +# EXPECTED: empty (stale false-alarm alert from old gate removed) + +# 6. 
Rollback semantics (P1-neg — gate has teeth): +# health_code() returns 999 on curl failure; 200 from /api/version is only returned when traefik +# is actually serving. Verify in code: health_code() → 999 on error path. +grep -n "health_code\|999" runner/warm_reconcile.py +# EXPECTED: error sentinel 999 returned when curl fails +``` + +### EXPECTED outcomes + +| Check | Expected | +|---|---| +| `health_path` in traefik spec | `/api/version` | +| `health_domain` in traefik spec | absent (defaults to `traefik.ci.commoninternet.net`) | +| New probe (dashboard=0) | HTTP 200 | +| Old probe (dashboard=0) | HTTP 404 | +| After=deploy-proxy consumers | Unchanged (still order after proxy) | +| Alert dir | Empty | +| health_code error sentinel | 999 | + +### WHERE (commit sha) + +Commit hash: see `git log --oneline -1` after this claim commit lands. + +--- + +## Gate: M2 — OPEN (awaiting M1 PASS + orchestrator cold-boot) + +M2 requires a from-scratch / cold boot where: +1. `deploy-proxy.service` reaches `active` without dashboard pre-deployed +2. Rollback path still works on deliberately-broken traefik +3. Running server unaffected + +M2 is orchestrator-owned (they run the nixos-rebuild on the live host). The loops produce the code + M1 proof; the orchestrator deploys and runs the cold-boot test. diff --git a/nix/modules/proxy.nix b/nix/modules/proxy.nix index 55a0008..33c191a 100644 --- a/nix/modules/proxy.nix +++ b/nix/modules/proxy.nix @@ -6,11 +6,13 @@ # # Phase-2w / WC1.1: traefik is now UNPINNED + health-gated like keycloak — the deploy is driven by # the shared `runner/warm_reconcile.py traefik` (STATELESS = version-rollback-only, NO snapshot): -# record last-good version → deploy latest tag → health-gate (a ROUTED host, the dashboard -# ci.commoninternet.net, returns 200) → healthy commits last-good / unhealthy rolls back to last-good -# + alert. 
traefik's wildcard-cert/file-provider config (ssl_cert/ssl_key secrets, WILDCARDS_ENABLED, -# COMPOSE_FILE) is preserved EXACTLY by the spec's `setup` (warm_reconcile._traefik_setup). The -# runner/ tree is copied into the nix store → D8-clean; recipe fetched at runtime → closure stable. +# record last-good version → deploy latest tag → health-gate (traefik.ci.commoninternet.net/api/version +# returns 200 — traefik's own API, no backend dep) → healthy commits last-good / unhealthy rolls back +# to last-good + alert. Phase-pxgate: changed from ci.commoninternet.net (dashboard) to avoid the +# cold-boot deadlock (deploy-dashboard is After=deploy-proxy; A1 fix). traefik's wildcard-cert/file- +# provider config (ssl_cert/ssl_key secrets, WILDCARDS_ENABLED, COMPOSE_FILE) is preserved EXACTLY by +# the spec's `setup` (warm_reconcile._traefik_setup). The runner/ tree is copied into the nix store → +# D8-clean; recipe fetched at runtime → closure stable. # # Idempotent-RECONCILE systemd oneshot (unchanged unit name `deploy-proxy` — other modules order # after it): converges every activation/boot, self-healing drift. No run-once sentinel. diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py index d41fef0..9a8e22e 100644 --- a/runner/warm_reconcile.py +++ b/runner/warm_reconcile.py @@ -112,13 +112,15 @@ SPECS: dict[str, dict] = { "health_timeout": 900, }, # traefik = the reverse proxy: STATELESS (version-rollback-only, NO snapshot). Health is probed - # on a ROUTED host (the dashboard) since traefik's own domain has no route. `setup` preserves the - # wildcard cert / file-provider config. + # on traefik's OWN /api/version endpoint (no backend/dashboard dependency) — a broken traefik + # will not serve it, so rollback still triggers. Probing ci.commoninternet.net (dashboard) caused + # a cold-boot deadlock: deploy-dashboard is After=deploy-proxy, so the dashboard was never up + # when deploy-proxy's wait_healthy ran (A1 fix, phase pxgate). 
`setup` preserves the wildcard + # cert / file-provider config. "traefik": { "recipe": "traefik", "domain": "traefik.ci.commoninternet.net", - "health_domain": "ci.commoninternet.net", - "health_path": "/", + "health_path": "/api/version", "health_ok": (200,), "stateful": False, "deploy_timeout": 600, @@ -251,9 +253,8 @@ def is_deployed(domain: str) -> bool: def health_code(spec: dict) -> int: - # health is probed on `health_domain` (defaults to the app domain). For traefik the app domain - # (traefik.ci…) has no route of its own — health is a ROUTED host (e.g. the dashboard - # ci.commoninternet.net), so a 200 proves traefik is up + routing + TLS-terminating. + # health is probed on `health_domain` (defaults to the app domain). For traefik, health is + # traefik.ci.commoninternet.net/api/version — traefik's own endpoint, no backend needed. domain = spec.get("health_domain", spec["domain"]) r = _run( [