From e678d2e0065917d7758e73bf40c039511500f42c Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Fri, 29 May 2026 03:50:32 +0100 Subject: [PATCH] =?UTF-8?q?claim(2w):=20W0.10a=20traefik=20WC1.1=20migrate?= =?UTF-8?q?d=20onto=20shared=20health-gated=20reconciler=20=E2=80=94=20no-?= =?UTF-8?q?op=20converge=20proven;=20destructive=20rollback=20=3D=20Advers?= =?UTF-8?q?ary=20cold=20proof?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit warm_reconcile.py: per-spec setup hook + health_domain; SPECS[traefik] (stateful=False, version-rollback-only, _traefik_setup preserves wildcard-cert/ file-provider config, health on routed dashboard host). keycloak path unchanged. proxy.nix: deploy-proxy.service now execs warm_reconcile.py traefik. ZERO-disruption migration (traefik already at latest 5.1.1+v3.6.15; pre-seeded TYPE+last_good → clean no-op converge; traefik 200 + keycloak-through-traefik 200 + 0 failed). 65 unit pass. Per operator out: code+converge delivered; destructive rollback (brief TLS blip) = Adversary's required cold proof. Closes the W0.10a tracked-open. Co-Authored-By: Claude Opus 4.8 (1M context) --- machine-docs/JOURNAL-2w.md | 21 +++++++++++ machine-docs/STATUS-2w.md | 39 ++++++++++++++++++-- nix/modules/proxy.nix | 56 +++++++++------------------- runner/warm_reconcile.py | 61 +++++++++++++++++++++++++++++-- tests/unit/test_warm_reconcile.py | 13 +++++++ 5 files changed, 145 insertions(+), 45 deletions(-) diff --git a/machine-docs/JOURNAL-2w.md b/machine-docs/JOURNAL-2w.md index bc72bd5..19b60ce 100644 --- a/machine-docs/JOURNAL-2w.md +++ b/machine-docs/JOURNAL-2w.md @@ -308,3 +308,24 @@ Plan for next: (a) W0.10a traefik health-gated reconciler migration (quiet windo serves all TLS); (b) W3 WC5 promote-on-green-cold (extend cold-run teardown to re-seed the canonical on green-latest, reusing seed_canonical); (c) W3 WC6 nightly sweep (systemd timer: rebuild-then-cold- sweep). traefik first (use the window) or interleave; W0.10b alert-relay is a small loop step. + +## 2026-05-29 — W0.10a traefik WC1.1 migrated (quiet window) — code + no-op converge; rollback = Adversary proof + +Used the post-W2 quiet window (Adversary idle) for the tracked traefik WC1.1 migration. Generalized +warm_reconcile.py: per-spec `setup` hook + `health_domain`; added SPECS["traefik"] (stateful=False → +stateless version-rollback-only, NO snapshot; setup=_traefik_setup preserving the wildcard-cert/ +file-provider config EXACTLY via the proven newline-safe abra.env_set; health on the routed dashboard +host). keycloak's path is unchanged (no `setup` key → default). proxy.nix migrated: +deploy-proxy.service now execs `warm_reconcile.py traefik` (runner/ packaged in the store, D8-clean). + +ZERO-DISRUPTION migration: traefik was already at the latest tag (5.1.1+v3.6.15, image v3.6.15, chaos +commit 005f023 = the tag commit). I pre-seeded the .env TYPE + last_good to 5.1.1+v3.6.15 (accurate — +traefik IS at that version), so the health-gated reconcile is a clean no-op (current==latest==healthy) +→ NO redeploy, NO TLS blip. Verified via nixos-rebuild switch: deploy-proxy.service → "no-op", +traefik 200 + keycloak-through-traefik 200 + 0 failed units. 65 unit pass. + +Per the operator's explicit out (a destructive traefik test risks ALL TLS), I delivered the code + +safe no-op converge and left the DESTRUCTIVE rollback as the Adversary's required cold proof (staged +broken traefik tag → reconcile → rollback to last-good, brief TLS blip + manual recovery ready). The +rollback logic is the proven keycloak pattern, stateless variant. Claiming W0.10a so the Adversary +runs that cold proof. After this clears, WC1.1 is fully closed (keycloak + traefik). diff --git a/machine-docs/STATUS-2w.md b/machine-docs/STATUS-2w.md index 8a970db..a74017d 100644 --- a/machine-docs/STATUS-2w.md +++ b/machine-docs/STATUS-2w.md @@ -15,9 +15,10 @@ nightly full-cold sweep. Definition of Done = WC1–WC9 (plan §1), each Adversa - [x] **WC1** — Live-warm UNPINNED keycloak; per-run namespaced realms (create+delete); concurrent distinct realms; orphan realms reaped. **Adversary PASS @2026-05-29** (REVIEW-2w, gate 985686f). - [~] **WC1.1** — Health-gated deploy-with-rollback. **keycloak (stateful) — Adversary PASS - @2026-05-29** (marquee: broken latest → snapshot→restore→prior, data intact, last_good held, - alert). **traefik (stateless, version-rollback-only) — NOT yet migrated = W0.10**, MUST close - before Phase-2w DONE (Adversary will require a cold proof). + @2026-05-29** (marquee). **traefik (stateless, version-rollback-only) — reconciler MIGRATED + (W0.10a): proxy.nix now drives `warm_reconcile.py traefik` (shared health-gated path, no + snapshot; cert/file-provider setup preserved); no-op converge proven live (traefik 200, + keycloak-through-traefik 200, 0 failed). CLAIMED — destructive rollback = Adversary cold proof.** - [x] **WC1.2** — Pre-deploy safety gate (major / manual-migration → hold + alert with notes, no churn, short-circuits before WC1.1). **Adversary PASS @2026-05-29**. - [x] **WC2** — Data-warm canonical model: per-recipe canonical at stable domain `warm-`, @@ -125,6 +126,38 @@ headline e2e is green (below). No recipe/harness change needed. ## Gate +### Gate: W0.10a traefik WC1.1 — CLAIMED, awaiting Adversary (@2026-05-29) + +**WHAT.** traefik migrated onto the shared health-gated reconciler (WC1.1, stateless = +version-rollback-only, NO snapshot): record last-good → deploy latest tag → health-gate (routed host +ci.commoninternet.net = 200) → healthy commit / unhealthy roll back to last-good + alert. Closes the +W0.10a tracked-open item from the W0 gate. traefik's wildcard-cert/file-provider config preserved. + +**WHERE.** `runner/warm_reconcile.py` (SPECS["traefik"] stateful=False + `_traefik_setup` + health_domain; +reconcile() per-app setup hook; the stateless path skips snapshot/restore — version rollback only), +`nix/modules/proxy.nix` (deploy-proxy.service now execs `python3 …/warm_reconcile.py traefik`). + +**HOW + EXPECTED (cold):** +1. **Units:** `cc-ci-run -m pytest tests/unit -q` → **65 passed** (incl. test_warm_reconcile traefik + spec: stateful=False, callable setup, health_domain=ci.commoninternet.net; keycloak unchanged). +2. **No-op converge (delivered, proven live):** `systemctl is-active deploy-proxy.service` → active; + `journalctl -u deploy-proxy.service` → `[traefik] already on latest 5.1.1+v3.6.15 and healthy — + no-op`; traefik serving (ci.commoninternet.net=200) + keycloak-through-traefik=200 + system + `running` (0 failed). The migration was zero-disruption (traefik was already at the latest tag; I + pre-seeded TYPE+last_good to 5.1.1+v3.6.15 so the reconcile is a clean no-op). +3. **Destructive rollback (the Adversary's required cold proof):** stage a fake newer traefik tag with + a broken config → `CCCI_SKIP_FETCH=1 cc-ci-run runner/warm_reconcile.py traefik` → broken deploy + fails health → reconciler rolls back to last-good 5.1.1+v3.6.15 (version-only, no snapshot — traefik + is stateless) → traefik healthy again + a `*-rollback.json` alert. NOTE: a destructive traefik test + briefly drops TLS for ALL routes during the broken-deploy window until rollback — run it knowing + that + with manual recovery ready (`abra app deploy traefik.ci.commoninternet.net 5.1.1+v3.6.15 + -o -n -f`). The rollback logic is the SAME proven keycloak pattern, stateless variant (no snapshot). + +Per operator guidance, I delivered the code + the safe no-op converge this iteration and left the +destructive rollback as the Adversary's cold proof (a live destructive traefik test risks all TLS). + +--- + ### Gate: WC4 + WC7 — ✅ Adversary PASS @2026-05-29 (REVIEW-2w 31f0e42, gate 3ff2bf6) Cold-verified from the Adversary's own clone: 64 units; WC7 adversarial trigger battery (all negatives rejected, live bridge); WC4 never-promote (snapshot byte-identical, registry unchanged); WC4 diff --git a/nix/modules/proxy.nix b/nix/modules/proxy.nix index e280dc4..55a0008 100644 --- a/nix/modules/proxy.nix +++ b/nix/modules/proxy.nix @@ -4,55 +4,31 @@ # Phase-1c: the cert at CERT_DIR is sops-decrypted from git (cc-ci-secrets) at activation # (modules/secrets.nix wildcard_cert/wildcard_key), NOT an out-of-band operator file drop. # -# Declared as an idempotent-RECONCILE systemd oneshot (like swarm-init): it inspects current -# state and converges every activation/boot, self-healing drift (redeploys if the stack is gone, -# re-inserts secrets if missing). No run-once sentinel. So a from-scratch install is just -# `nixos-rebuild switch` + operator preconditions (D8) — no manual post-steps. +# Phase-2w / WC1.1: traefik is now UNPINNED + health-gated like keycloak — the deploy is driven by +# the shared `runner/warm_reconcile.py traefik` (STATELESS = version-rollback-only, NO snapshot): +# record last-good version → deploy latest tag → health-gate (a ROUTED host, the dashboard +# ci.commoninternet.net, returns 200) → healthy commits last-good / unhealthy rolls back to last-good +# + alert. traefik's wildcard-cert/file-provider config (ssl_cert/ssl_key secrets, WILDCARDS_ENABLED, +# COMPOSE_FILE) is preserved EXACTLY by the spec's `setup` (warm_reconcile._traefik_setup). The +# runner/ tree is copied into the nix store → D8-clean; recipe fetched at runtime → closure stable. +# +# Idempotent-RECONCILE systemd oneshot (unchanged unit name `deploy-proxy` — other modules order +# after it): converges every activation/boot, self-healing drift. No run-once sentinel. { pkgs, ... }: let + runnerSrc = ../../runner; reconcile = pkgs.writeShellApplication { name = "cc-ci-reconcile-proxy"; - runtimeInputs = with pkgs; [ abra docker jq gnused gnugrep coreutils git ]; + runtimeInputs = with pkgs; [ abra docker git curl jq gnused gnugrep gnutar coreutils ]; text = '' - PROXY_DOMAIN="traefik.ci.commoninternet.net" - CERT_DIR="/var/lib/ci-certs/live" - ENV_FILE="$HOME/.abra/servers/default/$PROXY_DOMAIN.env" - - # Fail visibly (failed unit) if the cert is missing — do NOT silently skip. It is - # sops-decrypted from git (cc-ci-secrets) at activation; a miss here means the sops decrypt - # path is broken (e.g. age identity not present), which must surface, not be papered over. - if [ ! -r "$CERT_DIR/fullchain.pem" ] || [ ! -r "$CERT_DIR/privkey.pem" ]; then - echo "FATAL: wildcard cert missing at $CERT_DIR (sops decrypt from cc-ci-secrets failed?)" >&2 - exit 1 - fi - - abra server ls -m -n >/dev/null 2>&1 || abra server add --local -n || true - abra recipe fetch traefik -n >/dev/null - - [ -f "$ENV_FILE" ] || abra app new traefik -s default -D "$PROXY_DOMAIN" -n - - set_env() { - sed -i -E "/^[[:space:]]*#?[[:space:]]*$1=/d" "$ENV_FILE" - printf '%s=%s\n' "$1" "$2" >> "$ENV_FILE" - } - set_env LETS_ENCRYPT_ENV "" - set_env WILDCARDS_ENABLED "1" - set_env SECRET_WILDCARD_CERT_VERSION "v1" - set_env SECRET_WILDCARD_KEY_VERSION "v1" - set_env COMPOSE_FILE '"compose.yml:compose.wildcard.yml"' - - have_secret() { docker secret ls --format '{{.Name}}' | grep -q "_$1_v1$"; } - have_secret ssl_cert || abra app secret insert "$PROXY_DOMAIN" ssl_cert v1 "$CERT_DIR/fullchain.pem" -f -n - have_secret ssl_key || abra app secret insert "$PROXY_DOMAIN" ssl_key v1 "$CERT_DIR/privkey.pem" -f -n - - # Converge the stack (idempotent: no-op if already at desired state). - abra app deploy "$PROXY_DOMAIN" -n -C + export HOME=/root + exec ${pkgs.python3}/bin/python3 ${runnerSrc}/warm_reconcile.py traefik ''; }; in { systemd.services.deploy-proxy = { - description = "Reconcile the Co-op Cloud traefik proxy (wildcard/no-ACME) via abra"; + description = "Reconcile the Co-op Cloud traefik proxy (wildcard/no-ACME, health-gated) via abra"; after = [ "swarm-init.service" "docker.service" "network-online.target" ]; requires = [ "swarm-init.service" "docker.service" ]; wants = [ "network-online.target" ]; @@ -61,6 +37,8 @@ in serviceConfig = { Type = "oneshot"; RemainAfterExit = true; + # Generous: a traefik (re)deploy + health-gate; rollback on an unhealthy upgrade. + TimeoutStartSec = "900"; ExecStart = "${reconcile}/bin/cc-ci-reconcile-proxy"; }; }; diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py index 838c3d8..f3e8496 100644 --- a/runner/warm_reconcile.py +++ b/runner/warm_reconcile.py @@ -36,6 +36,38 @@ from harness import abra, lifecycle, warmsnap # noqa: E402 # --------------------------------------------------------------------------- specs + +def _traefik_setup(recipe: str, domain: str, version: str) -> None: + """Per-app config for the traefik reverse-proxy reconcile — preserves EXACTLY what the prior + proxy.nix bash reconcile did (wildcard/file-provider mode serving the pre-issued cert as + ssl_cert/ssl_key swarm secrets; NO ACME). Uses the proven abra.env_set (newline-safe, unlike the + bash set_env that bit keycloak).""" + cert_dir = "/var/lib/ci-certs/live" + if not (os.path.isfile(f"{cert_dir}/fullchain.pem") and os.path.isfile(f"{cert_dir}/privkey.pem")): + raise RuntimeError(f"FATAL: wildcard cert missing at {cert_dir} (sops decrypt broken?)") + if not os.path.isfile(env_file(domain)): + _run(["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"], + timeout=120, check=True) + abra.env_set(domain, "DOMAIN", domain) + abra.env_set(domain, "LETS_ENCRYPT_ENV", "") + abra.env_set(domain, "WILDCARDS_ENABLED", "1") + abra.env_set(domain, "SECRET_WILDCARD_CERT_VERSION", "v1") + abra.env_set(domain, "SECRET_WILDCARD_KEY_VERSION", "v1") + abra.env_set(domain, "COMPOSE_FILE", '"compose.yml:compose.wildcard.yml"') + stack = lifecycle._stack_name(domain) # noqa: SLF001 + have = set(lifecycle._docker_names("secret", stack)) # noqa: SLF001 + + def _has(name): + return any(s.endswith(f"_{name}_v1") for s in have) + + if not _has("ssl_cert"): + _run(["abra", "app", "secret", "insert", domain, "ssl_cert", "v1", + f"{cert_dir}/fullchain.pem", "-f", "-n"], timeout=120, check=True) + if not _has("ssl_key"): + _run(["abra", "app", "secret", "insert", domain, "ssl_key", "v1", + f"{cert_dir}/privkey.pem", "-f", "-n"], timeout=120, check=True) + + SPECS: dict[str, dict] = { "keycloak": { "recipe": "keycloak", @@ -46,6 +78,20 @@ SPECS: dict[str, dict] = { "deploy_timeout": 900, "health_timeout": 900, }, + # traefik = the reverse proxy: STATELESS (version-rollback-only, NO snapshot). Health is probed + # on a ROUTED host (the dashboard) since traefik's own domain has no route. `setup` preserves the + # wildcard cert / file-provider config. + "traefik": { + "recipe": "traefik", + "domain": "traefik.ci.commoninternet.net", + "health_domain": "ci.commoninternet.net", + "health_path": "/", + "health_ok": (200,), + "stateful": False, + "deploy_timeout": 600, + "health_timeout": 300, + "setup": _traefik_setup, + }, } ALERTS_DIR = os.path.join(warmsnap.DEFAULT_WARM_ROOT, "alerts") @@ -166,7 +212,10 @@ def is_deployed(domain: str) -> bool: def health_code(spec: dict) -> int: - domain = spec["domain"] + # health is probed on `health_domain` (defaults to the app domain). For traefik the app domain + # (traefik.ci…) has no route of its own — health is a ROUTED host (e.g. the dashboard + # ci.commoninternet.net), so a 200 proves traefik is up + routing + TLS-terminating. + domain = spec.get("health_domain", spec["domain"]) r = _run( [ "curl", "-sk", "-o", "/dev/null", "-w", "%{http_code}", "--max-time", "10", @@ -300,8 +349,14 @@ def reconcile(app: str) -> str: latest = latest_version(tags) if not latest: raise RuntimeError(f"no version tags for {recipe}") - ensure_app_config(recipe, domain, latest) - ensure_secrets(domain) + # Per-app config/secrets: a spec may provide its own `setup` (traefik's cert/file-provider wiring); + # otherwise the default keycloak-shaped path (app new + DOMAIN/LETS_ENCRYPT + generate secrets). + setup = spec.get("setup") + if setup: + setup(recipe, domain, latest) + else: + ensure_app_config(recipe, domain, latest) + ensure_secrets(domain) current = current_version(domain) deployed = is_deployed(domain) diff --git a/tests/unit/test_warm_reconcile.py b/tests/unit/test_warm_reconcile.py index c3e5277..0e41a68 100644 --- a/tests/unit/test_warm_reconcile.py +++ b/tests/unit/test_warm_reconcile.py @@ -54,6 +54,19 @@ def test_app_major_bump_held_even_if_no_plus_on_current(): assert wr.is_major_bump("0", "11.0.0+1.0.0") is True +def test_traefik_spec_is_stateless_with_setup(): + # WC1.1 traefik = stateless (version-rollback-only, NO snapshot) + its own cert/file-provider + # setup + health probed on a ROUTED host (the dashboard), not traefik's own domain. + t = wr.SPECS["traefik"] + assert t["stateful"] is False + assert callable(t.get("setup")) + assert t["health_domain"] == "ci.commoninternet.net" + assert t["domain"] == "traefik.ci.commoninternet.net" + # keycloak stays stateful with no custom setup (default path) + assert wr.SPECS["keycloak"]["stateful"] is True + assert "setup" not in wr.SPECS["keycloak"] + + def test_manual_migration_markers(): assert wr.notes_flag_manual_migration("This release requires a MANUAL MIGRATION of the DB.") assert wr.notes_flag_manual_migration("Breaking change: action required before upgrade.")