diff --git a/nix/modules/warm-keycloak.nix b/nix/modules/warm-keycloak.nix index 43eb8e3..b26d59e 100644 --- a/nix/modules/warm-keycloak.nix +++ b/nix/modules/warm-keycloak.nix @@ -1,90 +1,35 @@ -# Phase 2w / WC1 — a live-warm, shared keycloak SSO provider, deployed via abra at a STABLE domain -# (distinct from cold per-run `-<6hex>`; see DECISIONS.md Phase-2w). SSO-dependent -# recipe runs use this one instance (creating + deleting a per-run namespaced realm) instead of -# co-deploying a fresh keycloak each run — the highest-ROI warm layer (W0). +# Phase 2w / WC1+WC1.1+WC1.2 — a live-warm, shared keycloak SSO provider, auto-updating to LATEST +# with a pre-deploy safety gate + post-deploy health-gated rollback. Deployed via abra at a STABLE +# domain (distinct from cold per-run `-<6hex>`; see DECISIONS.md Phase-2w). SSO-dependent +# recipe runs use this one instance (per-run namespaced realm, created+deleted) instead of +# co-deploying a fresh keycloak each run. # -# Declared as an idempotent-RECONCILE systemd oneshot (like deploy-proxy / swarm-init): it inspects -# current state and converges every activation/boot, self-healing drift (redeploys if the stack is -# gone). No run-once sentinel. So a from-scratch install re-warms keycloak with just -# `nixos-rebuild switch` (D8 / WC8 "re-warmable from scratch"). The keycloak is declarative INFRA -# (in the D8 closure); only warm *volumes/snapshots* (W1+) are cache excluded from D8. Its realm -# data is ephemeral per-run. +# The reconcile logic lives in `runner/warm_reconcile.py` (Python — reuses warmsnap/abra/lifecycle so +# there is ONE snapshot impl, also used by the runner for WC5). The runner/ tree is copied into the +# nix store, so this is D8-clean (no dependence on the /root/cc-ci checkout) and the recipe is fetched +# at *runtime* → the nix closure stays byte-identical regardless of which keycloak version is live +# (UNPINNED; the kcVersion pin is gone). 
# -# Secrets are generated ONLY if missing — never rotated — so a reconcile against a running provider -# does not invalidate the admin/db creds the harness reads from inside the container. +# Idempotent RECONCILE oneshot (like deploy-proxy / swarm-init): converges every activation/boot. +# WC1.2 safety gate (major / manual-migration → hold + alert, no churn) runs BEFORE WC1.1's +# health-gated upgrade-with-rollback (snapshot keycloak's data volume before upgrade; restore + +# redeploy prior version on an unhealthy upgrade). Alerts are sentinel JSON under +# /var/lib/ci-warm/alerts/ relayed by the Builder loop (see DECISIONS). { pkgs, ... }: let - # Pinned known-good keycloak version (latest published as of 2026-05-28). Bump deliberately. - kcVersion = "10.7.1+26.6.2"; + runnerSrc = ../../runner; reconcile = pkgs.writeShellApplication { name = "cc-ci-reconcile-warm-keycloak"; - runtimeInputs = with pkgs; [ abra docker jq gnused gnugrep coreutils git curl ]; + runtimeInputs = with pkgs; [ abra docker git curl jq gnused gnugrep gnutar coreutils ]; text = '' - DOMAIN="warm-keycloak.ci.commoninternet.net" - VERSION="${kcVersion}" - ENV_FILE="$HOME/.abra/servers/default/$DOMAIN.env" - RECIPE_DIR="$HOME/.abra/recipes/keycloak" - - abra server ls -m -n >/dev/null 2>&1 || abra server add --local -n || true - abra recipe fetch keycloak -n >/dev/null - - # Create the app config once (records ENV VERSION). No -S here: secrets are generated below, - # guarded, so a reconcile never rotates a running provider's creds. - [ -f "$ENV_FILE" ] || abra app new keycloak -s default -D "$DOMAIN" "$VERSION" -o -n - - set_env() { - sed -i -E "/^[[:space:]]*#?[[:space:]]*$1=/d" "$ENV_FILE" - # Ensure the file ends in a newline before appending — keycloak's .env.sample ends with a - # newline-less comment line (#COMPOSE_FILE=...), so a bare append would glue the var onto - # that comment (commenting it out → KC_HOSTNAME=https:// with no host → crash). 
`$(tail -c1)` - # is empty iff the last byte is already a newline. (Same bite as backupbot.nix.) - if [ -s "$ENV_FILE" ] && [ -n "$(tail -c1 "$ENV_FILE")" ]; then printf '\n' >> "$ENV_FILE"; fi - printf '%s=%s\n' "$1" "$2" >> "$ENV_FILE" - } - set_env DOMAIN "$DOMAIN" - set_env LETS_ENCRYPT_ENV "" - - # Pin the on-disk recipe to the version tag so a non-chaos deploy genuinely deploys VERSION - # (a chaos deploy would ignore ENV VERSION and use the current checkout — see abra.recipe_checkout). - git -C "$RECIPE_DIR" checkout --quiet "$VERSION" - - # Generate secrets only if absent (idempotent; never rotate a live provider). - have_secret() { docker secret ls --format '{{.Name}}' | grep -q "_$1_v1$"; } - if ! have_secret admin_password; then - abra app secret generate "$DOMAIN" --all -m -o -n - fi - - health() { - curl -sk -o /dev/null -w '%{http_code}' --max-time 10 \ - --resolve "$DOMAIN:443:127.0.0.1" "https://$DOMAIN/realms/master" 2>/dev/null || true - } - - # Converge WITHOUT churning a healthy provider: only (re)deploy if it is not already serving. - # This makes every activation/boot a true no-op when keycloak is up (no JVM restart blip), and - # self-heals when the stack is gone or crash-looping. (To roll a new kcVersion, `abra app - # undeploy` first so this redeploys — a deliberate, rare op; keycloak is the SSO dep, not under - # test.) `-f` because a plain non-chaos deploy FATALs "already deployed". - stack="warm-keycloak_ci_commoninternet_net" - if [ "$(health)" = "200" ] && docker service ls --format '{{.Name}}' | grep -q "^''${stack}_app$"; then - echo "warm keycloak already healthy ($DOMAIN) — no-op converge" - exit 0 - fi - abra app deploy "$DOMAIN" -o -n -f - - # Wait until keycloak actually answers /realms/master (JVM + DB migration is slow). Surface a - # failed unit if it never comes up rather than reporting success on a half-booted provider. 
- for _ in $(seq 1 90); do - [ "$(health)" = "200" ] && { echo "warm keycloak healthy ($DOMAIN)"; exit 0; } - sleep 10 - done - echo "FATAL: warm keycloak $DOMAIN did not become healthy" >&2 - exit 1 + export HOME=/root + exec ${pkgs.python3}/bin/python3 ${runnerSrc}/warm_reconcile.py keycloak ''; }; in { systemd.services.warm-keycloak = { - description = "Reconcile the live-warm shared keycloak SSO provider (WC1) via abra"; + description = "Reconcile the live-warm shared keycloak SSO provider (WC1/WC1.1/WC1.2) via abra"; after = [ "deploy-proxy.service" "swarm-init.service" "docker.service" "network-online.target" ]; requires = [ "swarm-init.service" "docker.service" ]; wants = [ "deploy-proxy.service" "network-online.target" ]; @@ -93,8 +38,9 @@ in serviceConfig = { Type = "oneshot"; RemainAfterExit = true; - # Generous: a cold keycloak boot (JVM + DB migration) can take ~10min on this 2-vCPU node. - TimeoutStartSec = "1200"; + # Generous: a cold keycloak boot (JVM + DB migration) can take ~10min, and a health-gated + # upgrade may snapshot + deploy + (rollback) within one run. + TimeoutStartSec = "1800"; ExecStart = "${reconcile}/bin/cc-ci-reconcile-warm-keycloak"; }; }; diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py new file mode 100644 index 0000000..738897f --- /dev/null +++ b/runner/warm_reconcile.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +"""Warm/infra auto-update reconciler (Phase 2w / WC1 + WC1.1 + WC1.2). + +Invoked by the per-app systemd reconcile unit (nix/modules/warm-keycloak.nix) at every +activation/boot (and nightly via WC6). For one warm/infra app it converges to the LATEST published +recipe version, gated TWICE: + + WC1.2 (pre-deploy SAFETY gate, runs FIRST): only auto-apply non-major (patch/minor) recipe bumps + with no manual-migration release notes. 
A MAJOR recipe/app version bump, or a target whose + releaseNotes flag a manual migration → DO NOT deploy: stay on current + write an alert sentinel + carrying the notes (operator upgrades manually). No snapshot/deploy/rollback churn on a hold. + + WC1.1 (post-deploy HEALTH gate, for upgrades we DO apply): record running version = last-good → + [stateful: undeploy → snapshot data volume] → deploy latest → health-check → + healthy: commit last-good := latest; + unhealthy: [stateful: restore snapshot] → redeploy last-good → health-check → ALERT. + +The reconciler is UNPINNED (keycloak floats to latest like traefik); the nix closure stays +byte-identical because the recipe is fetched at runtime. Alerts are sentinel JSON files under +/var/lib/ci-warm/alerts/ that the Builder loop relays via PushNotification (see DECISIONS Phase-2w). + +Run as root on cc-ci (direct docker/volume access). CLI: `warm_reconcile.py <app>` (app = keycloak). +""" + +from __future__ import annotations + +import json +import os +import re +import subprocess +import sys +import time + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from harness import abra, lifecycle, warmsnap # noqa: E402 + +# --------------------------------------------------------------------------- specs + +SPECS: dict[str, dict] = { + "keycloak": { + "recipe": "keycloak", + "domain": "warm-keycloak.ci.commoninternet.net", + "health_path": "/realms/master", + "health_ok": (200,), + "stateful": True, + "deploy_timeout": 900, + "health_timeout": 900, + }, +} + +ALERTS_DIR = os.path.join(warmsnap.DEFAULT_WARM_ROOT, "alerts") + + +# --------------------------------------------------------------------------- pure version helpers + + +# A coop-cloud version tag is "<recipe-semver>+<app-version>" (observed: keycloak 10.7.1+26.6.2 -> +# image :26.6.2; n8n 3.2.0+2.20.6 -> image :2.20.6). The RECIPE semver is the part BEFORE '+'. 
+_VER_RE = re.compile(r"^\d+(\.\d+)*(\+.+)?$")
+
+
+def is_version_tag(tag: str) -> bool:
+    """True for a coop-cloud version tag (leading numeric semver, optional +app part)."""
+    return bool(_VER_RE.match(tag.strip()))
+
+
+def sort_versions(tags) -> list[str]:
+    """Sort coop-cloud version tags ascending by (recipe-semver tuple, app-version tuple).
+
+    Entries that are not version tags (branch names etc.) are dropped.
+    """
+
+    def key(t: str):
+        recipe, _, app = t.partition("+")
+        return (_numtuple(recipe), _numtuple(app))
+
+    return sorted((t for t in tags if is_version_tag(t)), key=key)
+
+
+def _numtuple(s: str) -> tuple:
+    """Leading integer of each dot-separated part, as a tuple; a part with no leading digits is 0.
+
+    "26.6.2" -> (26, 6, 2); "" -> (0,).
+    """
+    out = []
+    for part in s.split("."):
+        m = re.match(r"^\d+", part)
+        out.append(int(m.group()) if m else 0)
+    return tuple(out)
+
+
+def latest_version(tags) -> str | None:
+    """Highest version tag in `tags` (per sort_versions), or None if none is a version tag."""
+    s = sort_versions(tags)
+    return s[-1] if s else None
+
+
+def _major(semver: str) -> int:
+    """First numeric component of `semver`; 0 for an empty string."""
+    return _numtuple(semver)[0] if semver else 0
+
+
+def is_major_bump(current: str, latest: str) -> bool:
+    """True if current→latest bumps the MAJOR of either the recipe-semver (pre-'+') or the
+    app-version (post-'+'). Conservative: an app-major bump (e.g. keycloak 25→26) is exactly when
+    manual DB migrations happen, so it must also be held. A genuine patch/minor (neither major moves)
+    is never held by this rule."""
+    cr, _, ca = current.partition("+")
+    lr, _, la = latest.partition("+")
+    return _major(lr) > _major(cr) or _major(la) > _major(ca)
+
+
+_MIGRATION_MARKERS = re.compile(
+    r"manual migration|manual action|manual step|action required|by hand|manually|breaking change",
+    re.IGNORECASE,
+)
+
+
+def notes_flag_manual_migration(text: str) -> bool:
+    """True if release-notes text contains a manual-migration marker (heuristic, err toward holding)."""
+    return bool(_MIGRATION_MARKERS.search(text or ""))
+
+
+# --------------------------------------------------------------------------- integration helpers
+
+
+def _run(cmd, timeout=120, check=False):
+    """Run `cmd` (an argv list) capturing stdout/stderr as text.
+
+    Raises subprocess.TimeoutExpired after `timeout` seconds, and subprocess.CalledProcessError
+    on a non-zero exit when `check` is true.
+    """
+    return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=check)
+
+
+def _recipe_dir(recipe: str) -> str:
+    """Path of the local abra clone of `recipe` (~/.abra/recipes/<recipe>)."""
+    return os.path.expanduser(f"~/.abra/recipes/{recipe}")
+
+
+def recipe_tags(recipe: str) -> list[str]:
+    """All git tags of the local recipe clone (empty when `git tag` prints nothing)."""
+    r = _run(["git", "-C", _recipe_dir(recipe), "tag"], timeout=30)
+    # str.split() with no argument never yields empty or whitespace-only items.
+    return r.stdout.split()
+
+
+def fetch_recipe(recipe: str) -> None:
+    """Refresh the local recipe clone via `abra recipe fetch` (best-effort).
+
+    A failed fetch is logged but does not abort: the reconcile continues against whatever the
+    local clone already holds.
+    """
+    # CCCI_SKIP_FETCH=1 lets a test/Adversary stage a fake "latest" tag (a simulated major bump /
+    # manual-migration / broken release) in the local recipe clone without it being clobbered by a
+    # re-fetch. Never set in production (the systemd unit does not set it).
+    if os.environ.get("CCCI_SKIP_FETCH") == "1":
+        print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True)
+        return
+    r = _run(["abra", "recipe", "fetch", recipe, "-n"], timeout=300)
+    if r.returncode != 0:
+        # Previously ignored silently, which hid the fact that "latest" might be stale.
+        print(
+            f"[fetch] WARNING: abra recipe fetch {recipe} exited {r.returncode}: "
+            f"{r.stderr.strip()[:300]} — continuing with the local clone",
+            flush=True,
+        )
+
+
+def env_file(domain: str) -> str:
+    """Path of the abra app .env for `domain` on the `default` server."""
+    return os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
+
+
+def current_version(domain: str) -> str | None:
+    """Read the deployed version from the app .env. abra records it in `TYPE=<recipe>:<version>`
+    (updated on each `app new`/`app deploy <version>`). Returns the `<version>` part, or None."""
+    path = env_file(domain)
+    if not os.path.isfile(path):
+        return None
+    with open(path, encoding="utf-8", errors="replace") as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("TYPE="):
+                val = line.split("=", 1)[1].strip().strip('"').strip("'")
+                # "<recipe>:<version>" → version (everything after the first ':')
+                if ":" in val:
+                    v = val.split(":", 1)[1].strip()
+                    return v or None
+    return None
+
+
+def is_deployed(domain: str) -> bool:
+    """True if docker lists at least one service for the app's stack."""
+    stack = lifecycle._stack_name(domain)  # noqa: SLF001
+    return bool(lifecycle._docker_names("service", stack))  # noqa: SLF001
+
+
+def health_code(spec: dict) -> int:
+    """HTTP status of the app's health endpoint, probed via curl against 127.0.0.1:443.
+
+    Returns 0 when curl prints nothing or something that is not an integer.
+    """
+    domain = spec["domain"]
+    r = _run(
+        [
+            "curl", "-sk", "-o", "/dev/null", "-w", "%{http_code}", "--max-time", "10",
+            "--resolve", f"{domain}:443:127.0.0.1", f"https://{domain}{spec['health_path']}",
+        ],
+        timeout=20,
+    )
+    try:
+        return int(r.stdout.strip() or "0")
+    except ValueError:
+        return 0
+
+
+def wait_healthy(spec: dict, timeout: int | None = None) -> bool:
+    """Poll the health endpoint every 10s until it returns an accepted status or time runs out.
+
+    `timeout` is in seconds; None means spec["health_timeout"] (an explicit 0 is honoured as
+    "probe once"). The endpoint is always probed at least once, and the deadline uses the
+    monotonic clock so a wall-clock step (e.g. time sync during boot) cannot distort the wait.
+    """
+    budget = spec["health_timeout"] if timeout is None else timeout
+    deadline = time.monotonic() + budget
+    accepted = tuple(spec["health_ok"])
+    while True:
+        if health_code(spec) in accepted:
+            return True
+        if time.monotonic() >= deadline:
+            return False
+        time.sleep(10)
+
+
+def release_notes(recipe: str, version: str) -> str:
+    """Text of the first releaseNotes file found for `version` (tries `<version>.md`, then
+    `<version>`); "" when neither exists or the file cannot be read.
+
+    NOTE(review): an unreadable notes file yields "", which lets the manual-migration gate pass.
+    """
+    base = os.path.join(_recipe_dir(recipe), "releaseNotes")
+    for name in (f"{version}.md", version):
+        p = os.path.join(base, name)
+        if os.path.isfile(p):
+            try:
+                # errors="replace": a stray non-UTF-8 byte must not crash the reconcile.
+                with open(p, encoding="utf-8", errors="replace") as f:
+                    return f.read()
+            except OSError:
+                return ""
+    return ""
+
+
+def deploy_version(recipe: str, domain: str, version: str, timeout: int) -> None:
+    """Deploy a specific published version: checkout the tag (so the on-disk tree matches) then a
+    pinned non-chaos redeploy with the version positional (so abra records TYPE=<recipe>:<version>).
+    `-f` makes it idempotent against an already-deployed app.
+
+    Raises RuntimeError when `abra app deploy` exits non-zero, and subprocess.TimeoutExpired when
+    it runs longer than `timeout` seconds.
+    """
+    abra.recipe_checkout(recipe, version)
+    r = _run(["abra", "app", "deploy", domain, version, "-o", "-n", "-f"], timeout=timeout)
+    if r.returncode != 0:
+        raise RuntimeError(f"deploy {domain} {version} failed: {r.stderr.strip()[:300]}")
+
+
+# --------------------------------------------------------------------------- last-good + alerts
+
+
+def last_good_path(recipe: str) -> str:
+    """Path of the file recording the last version that passed the health gate."""
+    return os.path.join(warmsnap.app_dir(recipe), "last_good")
+
+
+def read_last_good(recipe: str) -> str | None:
+    """Recorded last-good version, or None when the file is missing, unreadable or empty."""
+    try:
+        with open(last_good_path(recipe), encoding="utf-8") as f:
+            return f.read().strip() or None
+    except OSError:
+        return None
+
+
+def write_last_good(recipe: str, version: str) -> None:
+    """Record `version` as last-good (written to a temp file, then renamed into place)."""
+    os.makedirs(warmsnap.app_dir(recipe), exist_ok=True)
+    tmp = last_good_path(recipe) + ".tmp"
+    with open(tmp, "w", encoding="utf-8") as f:
+        f.write(version)
+    os.replace(tmp, last_good_path(recipe))
+
+
+def write_alert(app: str, reason: str, **fields) -> str:
+    """Write a sentinel JSON alert under ALERTS_DIR for the Builder loop to relay.
+
+    The record holds `app`, `reason`, a UTC timestamp and every extra keyword in `fields`.
+    Returns the path of the file written.
+    """
+    os.makedirs(ALERTS_DIR, exist_ok=True)
+    ts = time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())
+    rec = {"app": app, "reason": reason, "ts": ts, **fields}
+    path = os.path.join(ALERTS_DIR, f"{ts}-{app}-{reason}.json")
+    tmp = path + ".tmp"
+    with open(tmp, "w", encoding="utf-8") as f:
+        json.dump(rec, f, indent=2)
+    os.replace(tmp, path)
+    print(f"ALERT[{reason}] {app}: {fields}", flush=True)
+    return path
+
+
+# --------------------------------------------------------------------------- reconcile
+
+
+def ensure_server() -> None:
+    """Add the local abra server if `abra server ls` fails."""
+    if _run(["abra", "server", "ls", "-m", "-n"], timeout=30).returncode != 0:
+        _run(["abra", "server", "add", "--local", "-n"], timeout=60)
+
+
+def ensure_app_config(recipe: str, domain: str, version: str) -> None:
+    """Create the app .env with `abra app new` if it is missing, then (re)set DOMAIN and
+    LETS_ENCRYPT_ENV in it. Raises subprocess.CalledProcessError if `abra app new` fails."""
+    if not os.path.isfile(env_file(domain)):
+        _run(["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
+             timeout=120, check=True)
+    abra.env_set(domain, "DOMAIN", domain)
+    abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
+
+
+def ensure_secrets(domain: str) -> None:
+    """Generate the app's secrets only if no `*_admin_password_v1` secret exists for its stack."""
+    stack = lifecycle._stack_name(domain)  # noqa: SLF001
+    names = lifecycle._docker_names("secret", stack)  # noqa: SLF001
+    if not any(n.endswith("_admin_password_v1") for n in names):
+        abra.secret_generate(domain)
+
+
+def reconcile(app: str) -> str:
+    """Converge one warm app (a key of SPECS) toward the latest published recipe version.
+
+    Returns "<outcome>:<version(s)>" where outcome is one of deployed-fresh, noop-healthy,
+    redeployed-healthy, unhealthy, held-major, held-manual-migration, upgraded, rolled-back.
+
+    Raises RuntimeError when the recipe has no version tags, when a fresh deploy never becomes
+    healthy, or when a rollback does not recover. Errors from a fresh deploy, a redeploy on
+    latest, or the rollback itself propagate (the latter two after an alert is written).
+    """
+    spec = SPECS[app]
+    recipe, domain = spec["recipe"], spec["domain"]
+    dt, stateful = spec["deploy_timeout"], spec["stateful"]
+
+    ensure_server()
+    fetch_recipe(recipe)
+    tags = recipe_tags(recipe)
+    latest = latest_version(tags)
+    if not latest:
+        raise RuntimeError(f"no version tags for {recipe}")
+    ensure_app_config(recipe, domain, latest)
+    ensure_secrets(domain)
+
+    current = current_version(domain)
+    deployed = is_deployed(domain)
+
+    # Fresh deploy (nothing running) — deploy current-pinned (or latest if never deployed).
+    if not deployed:
+        target = current or latest
+        print(f"[{app}] not deployed → fresh deploy {target}", flush=True)
+        deploy_version(recipe, domain, target, dt)
+        if not wait_healthy(spec):
+            raise RuntimeError(f"{app} fresh deploy {target} did not become healthy")
+        write_last_good(recipe, target)
+        return f"deployed-fresh:{target}"
+
+    # Deployed & already on latest → converge to a no-op (commit last-good if healthy).
+    if current == latest:
+        if wait_healthy(spec, timeout=60):
+            write_last_good(recipe, latest)
+            print(f"[{app}] already on latest {latest} and healthy — no-op", flush=True)
+            return f"noop-healthy:{latest}"
+        # On latest but unhealthy: try a redeploy; if still bad, alert (rollback target unknown/same).
+        print(f"[{app}] on latest {latest} but UNHEALTHY → redeploy", flush=True)
+        try:
+            deploy_version(recipe, domain, latest, dt)
+        except Exception:
+            # A failed redeploy is still "unhealthy on latest": alert before the unit fails.
+            write_alert(app, "unhealthy-on-latest", version=latest)
+            raise
+        if wait_healthy(spec):
+            write_last_good(recipe, latest)
+            return f"redeployed-healthy:{latest}"
+        write_alert(app, "unhealthy-on-latest", version=latest)
+        return f"unhealthy:{latest}"
+
+    # --- An upgrade current→latest is available ---
+    # WC1.2 pre-deploy SAFETY gate (runs BEFORE any snapshot/deploy).
+    notes = release_notes(recipe, latest)
+    if is_major_bump(current or "0", latest):
+        write_alert(app, "held-major", current=current, latest=latest, release_notes=notes[:4000])
+        return f"held-major:{current}->{latest}"
+    if notes_flag_manual_migration(notes):
+        write_alert(app, "held-manual-migration", current=current, latest=latest,
+                    release_notes=notes[:4000])
+        return f"held-manual-migration:{current}->{latest}"
+
+    # WC1.1 health-gated upgrade with rollback.
+    last_good = current
+    print(f"[{app}] auto-upgrade {last_good} → {latest} (health-gated)", flush=True)
+    if stateful:
+        abra.undeploy(domain)
+        warmsnap.snapshot(recipe, domain, version=last_good)
+    # snapshot requires undeployed; now bring up latest. A deploy that raises (abra error or
+    # timeout) counts as an unhealthy upgrade: without this the rollback below was skipped and a
+    # stateful app was left undeployed.
+    try:
+        deploy_version(recipe, domain, latest, dt)
+    except Exception as exc:  # noqa: BLE001 — every deploy failure must reach the rollback
+        print(f"[{app}] deploy of {latest} FAILED: {type(exc).__name__}: {exc}", flush=True)
+        healthy = False
+    else:
+        healthy = wait_healthy(spec)
+    if healthy:
+        write_last_good(recipe, latest)
+        print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True)
+        return f"upgraded:{last_good}->{latest}"
+
+    # Unhealthy → roll back.
+    print(f"[{app}] latest {latest} UNHEALTHY → rolling back to {last_good}", flush=True)
+
+    def alert_rollback(recovered: bool) -> None:
+        write_alert(app, "rollback", last_good=last_good, attempted=latest, recovered=recovered,
+                    release_notes=notes[:2000])
+
+    try:
+        if stateful:
+            abra.undeploy(domain)
+            warmsnap.restore(recipe, domain)
+        deploy_version(recipe, domain, last_good, dt)
+        recovered = wait_healthy(spec)
+    except Exception:
+        # The rollback itself blew up: the operator still needs the alert; then fail the unit.
+        alert_rollback(False)
+        raise
+    alert_rollback(recovered)
+    if not recovered:
+        raise RuntimeError(f"{app} rollback to {last_good} did not become healthy")
+    return f"rolled-back:{latest}->{last_good}"
+
+
+def main(argv) -> int:
+    """CLI entry point: `warm_reconcile.py <app>`. Returns 2 on bad usage, 0 after a reconcile."""
+    if len(argv) != 2 or argv[1] not in SPECS:
+        print(f"usage: warm_reconcile.py <{'|'.join(SPECS)}>", file=sys.stderr)
+        return 2
+    result = reconcile(argv[1])
+    print(f"RECONCILE RESULT: {result}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv))
diff --git a/tests/unit/test_warm_reconcile.py b/tests/unit/test_warm_reconcile.py
new file mode 100644
index 0000000..c3e5277
--- /dev/null
+++ b/tests/unit/test_warm_reconcile.py
@@ -0,0 +1,62 @@
+"""Unit tests for the WC1.2 safety-gate + version helpers in runner/warm_reconcile.py.
+
+Pure logic only (no abra/docker). The reconcile flow itself is proven live on cc-ci against the warm
+keycloak (W0.6). These lock the gate's correctness: which bumps auto-apply vs hold, and the
+manual-migration marker scan.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+import warm_reconcile as wr  # noqa: E402
+
+
+def test_is_version_tag():
+    assert wr.is_version_tag("10.7.1+26.6.2")
+    assert wr.is_version_tag("3.2.0")
+    assert not wr.is_version_tag("main")
+    assert not wr.is_version_tag("latest")
+    assert not wr.is_version_tag("")
+
+
+def test_sort_and_latest():
+    tags = ["10.6.0+26.5.4", "10.7.1+26.6.2", "10.5.1+26.4.5", "main", "10.7.0+26.6.1"]
+    assert wr.latest_version(tags) == "10.7.1+26.6.2"
+    assert wr.sort_versions(tags)[0] == "10.5.1+26.4.5"
+
+
+def test_latest_none_when_no_tags():
+    assert wr.latest_version(["main", "feature-x"]) is None
+
+
+def test_minor_patch_bump_not_major():
+    # recipe-semver 10.7.0 -> 10.7.1 (patch); app 26.6.1 -> 26.6.2 (patch). Auto-apply.
+    assert wr.is_major_bump("10.7.0+26.6.1", "10.7.1+26.6.2") is False
+    # minor recipe bump 10.7.1 -> 10.8.0. Auto-apply.
+    assert wr.is_major_bump("10.7.1+26.6.2", "10.8.0+26.6.2") is False
+
+
+def test_recipe_major_bump_held():
+    # recipe-semver 10.x -> 11.0 (major). HELD.
+    assert wr.is_major_bump("10.7.1+26.6.2", "11.0.0+26.6.2") is True
+
+
+def test_app_major_bump_held():
+    # app version 26.x -> 27.0 (major, e.g. keycloak DB migration era). HELD (conservative).
+    assert wr.is_major_bump("10.7.1+26.6.2", "10.8.0+27.0.0") is True
+
+
+def test_app_major_bump_held_even_if_no_plus_on_current():
+    assert wr.is_major_bump("0", "11.0.0+1.0.0") is True
+
+
+def test_manual_migration_markers():
+    assert wr.notes_flag_manual_migration("This release requires a MANUAL MIGRATION of the DB.")
+    assert wr.notes_flag_manual_migration("Breaking change: action required before upgrade.")
+    assert wr.notes_flag_manual_migration("You must run the migration by hand.")
+    assert not wr.notes_flag_manual_migration("Routine patch. Automatic, no action needed.")
+    assert not wr.notes_flag_manual_migration("")