#!/usr/bin/env python3
"""Warm/infra auto-update reconciler (Phase 2w / WC1 + WC1.1 + WC1.2).

Invoked by the per-app systemd reconcile unit (nix/modules/warm-keycloak.nix) at every
activation/boot (and nightly via WC6). For one warm/infra app it converges to the LATEST published
recipe version, gated TWICE:

  WC1.2 (pre-deploy SAFETY gate, runs FIRST): only auto-apply non-major (patch/minor) recipe bumps
    with no manual-migration release notes. A MAJOR recipe/app version bump, or a target whose
    releaseNotes flag a manual migration → DO NOT deploy: stay on current + write an alert sentinel
    carrying the notes (operator upgrades manually). No snapshot/deploy/rollback churn on a hold.

  WC1.1 (post-deploy HEALTH gate, for upgrades we DO apply):
    record running version = last-good → [stateful: undeploy → snapshot data volume] → deploy
    latest → health-check → healthy: commit last-good := latest; unhealthy: [stateful: restore
    snapshot] → redeploy last-good → health-check → ALERT.

The reconciler is UNPINNED (keycloak floats to latest like traefik); the nix closure stays
byte-identical because the recipe is fetched at runtime. Alerts are sentinel JSON files under
/var/lib/ci-warm/alerts/ that the Builder loop relays via PushNotification (see DECISIONS
Phase-2w).

Run as root on cc-ci (direct docker/volume access).

CLI: `warm_reconcile.py <app>` (app = one of the SPECS keys: keycloak, traefik).
"""

from __future__ import annotations

import json
import os
import re
import subprocess
import sys
import time

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from harness import abra, lifecycle, warmsnap  # noqa: E402

# --------------------------------------------------------------------------- specs


def _traefik_setup(recipe: str, domain: str, version: str) -> None:
    """Per-app config for the traefik reverse-proxy reconcile.

    Preserves EXACTLY what the prior proxy.nix bash reconcile did (wildcard/file-provider mode
    serving the pre-issued cert as ssl_cert/ssl_key swarm secrets; NO ACME). Uses the proven
    abra.env_set (newline-safe, unlike the bash set_env that bit keycloak).

    Raises RuntimeError if the wildcard cert/key pair is missing on disk, and
    subprocess.CalledProcessError if `abra app new` / `abra app secret insert` fails.
    """
    cert_dir = "/var/lib/ci-certs/live"
    if not (
        os.path.isfile(f"{cert_dir}/fullchain.pem")
        and os.path.isfile(f"{cert_dir}/privkey.pem")
    ):
        raise RuntimeError(f"FATAL: wildcard cert missing at {cert_dir} (sops decrypt broken?)")

    # Create the app config only when no .env exists for this domain yet.
    if not os.path.isfile(env_file(domain)):
        _run(
            ["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
            timeout=120,
            check=True,
        )
    # NOTE(review): the env keys below are applied on every run, not only right after
    # `app new` — confirm this matches the intended nesting.
    abra.env_set(domain, "DOMAIN", domain)
    abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
    abra.env_set(domain, "WILDCARDS_ENABLED", "1")
    abra.env_set(domain, "SECRET_WILDCARD_CERT_VERSION", "v1")
    abra.env_set(domain, "SECRET_WILDCARD_KEY_VERSION", "v1")
    abra.env_set(domain, "COMPOSE_FILE", '"compose.yml:compose.wildcard.yml"')

    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    have = set(lifecycle._docker_names("secret", stack))  # noqa: SLF001

    def _has(name):
        # A secret counts as present when any existing secret name ends in `_<name>_v1`.
        return any(s.endswith(f"_{name}_v1") for s in have)

    if not _has("ssl_cert"):
        _run(
            ["abra", "app", "secret", "insert", domain, "ssl_cert", "v1",
             f"{cert_dir}/fullchain.pem", "-f", "-n"],
            timeout=120,
            check=True,
        )
    if not _has("ssl_key"):
        _run(
            ["abra", "app", "secret", "insert", domain, "ssl_key", "v1",
             f"{cert_dir}/privkey.pem", "-f", "-n"],
            timeout=120,
            check=True,
        )


SPECS: dict[str, dict] = {
    "keycloak": {
        "recipe": "keycloak",
        "domain": "warm-keycloak.ci.commoninternet.net",
        "health_path": "/realms/master",
        "health_ok": (200,),
        "stateful": True,
        "deploy_timeout": 900,
        "health_timeout": 900,
    },
    # traefik = the reverse proxy: STATELESS (version-rollback-only, NO snapshot). Health is probed
    # on a ROUTED host (the dashboard) since traefik's own domain has no route. `setup` preserves
    # the wildcard cert / file-provider config.
    "traefik": {
        "recipe": "traefik",
        "domain": "traefik.ci.commoninternet.net",
        "health_domain": "ci.commoninternet.net",
        "health_path": "/",
        "health_ok": (200,),
        "stateful": False,
        "deploy_timeout": 600,
        "health_timeout": 300,
        "setup": _traefik_setup,
    },
}

ALERTS_DIR = os.path.join(warmsnap.DEFAULT_WARM_ROOT, "alerts")

# --------------------------------------------------------------------------- pure version helpers
# A coop-cloud version tag is "<recipe-semver>+<app-version>" (observed: keycloak 10.7.1+26.6.2 ->
# image :26.6.2; n8n 3.2.0+2.20.6 -> image :2.20.6). The RECIPE semver is the part BEFORE '+'.

_VER_RE = re.compile(r"^\d+(\.\d+)*(\+.+)?$")


def is_version_tag(tag: str) -> bool:
    """True for a coop-cloud version tag (leading numeric semver, optional +app part)."""
    return bool(_VER_RE.match(tag.strip()))


def sort_versions(tags) -> list[str]:
    """Sort coop-cloud version tags ascending by (recipe-semver tuple, app-version tuple).

    Tags that are not version tags (per is_version_tag) are dropped.
    """

    def key(t: str):
        recipe, _, app = t.partition("+")
        return (_numtuple(recipe), _numtuple(app))

    return sorted([t for t in tags if is_version_tag(t)], key=key)


def _numtuple(s: str) -> tuple:
    """Turn a dotted version string into a tuple of ints.

    Each dot-separated part contributes its leading digits; a part with no leading digits
    (including the empty string) contributes 0.
    """
    out = []
    for part in s.split("."):
        m = re.match(r"^\d+", part)
        out.append(int(m.group()) if m else 0)
    return tuple(out)


def latest_version(tags) -> str | None:
    """Return the highest version tag in `tags`, or None when there is no version tag."""
    s = sort_versions(tags)
    return s[-1] if s else None


def _major(semver: str) -> int:
    """First numeric component of `semver`; 0 for an empty string."""
    return _numtuple(semver)[0] if semver else 0


def is_major_bump(current: str, latest: str) -> bool:
    """True if current→latest bumps the MAJOR of either the recipe-semver (pre-'+') or the
    app-version (post-'+').

    Conservative: an app-major bump (e.g. keycloak 25→26) is exactly when manual DB migrations
    happen, so it must also be held. A genuine patch/minor (neither major moves) is never held by
    this rule.
    """
    cr, _, ca = current.partition("+")
    lr, _, la = latest.partition("+")
    return _major(lr) > _major(cr) or _major(la) > _major(ca)


_MIGRATION_MARKERS = re.compile(
    r"manual migration|manual action|manual step|action required|by hand|manually|breaking change",
    re.IGNORECASE,
)


def notes_flag_manual_migration(text: str) -> bool:
    """True if release-notes text contains a manual-migration marker (heuristic, err toward
    holding)."""
    return bool(_MIGRATION_MARKERS.search(text or ""))


# --------------------------------------------------------------------------- integration helpers


def _run(cmd, timeout=120, check=False):
    """Run `cmd` (argv list) capturing stdout/stderr as text.

    Raises subprocess.TimeoutExpired after `timeout` seconds, and subprocess.CalledProcessError
    on a non-zero exit when `check` is true.
    """
    return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=check)


def _recipe_dir(recipe: str) -> str:
    """Path of the local abra clone of `recipe`."""
    return os.path.expanduser(f"~/.abra/recipes/{recipe}")


def recipe_tags(recipe: str) -> list[str]:
    """All git tags of the local recipe clone (empty list if `git tag` prints nothing)."""
    r = _run(["git", "-C", _recipe_dir(recipe), "tag"], timeout=30)
    return [t for t in r.stdout.split() if t.strip()]


def fetch_recipe(recipe: str) -> None:
    """Refresh the local recipe clone via `abra recipe fetch` (exit status is not checked)."""
    # CCCI_SKIP_FETCH=1 lets a test/Adversary stage a fake "latest" tag (a simulated major bump /
    # manual-migration / broken release) in the local recipe clone without it being clobbered by a
    # re-fetch. Never set in production (the systemd unit does not set it).
    if os.environ.get("CCCI_SKIP_FETCH") == "1":
        print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True)
        return
    _run(["abra", "recipe", "fetch", recipe, "-n"], timeout=300)


def env_file(domain: str) -> str:
    """Path of the abra app .env for `domain` on the `default` server."""
    return os.path.expanduser(f"~/.abra/servers/default/{domain}.env")


def current_version(domain: str) -> str | None:
    """Read the deployed version from the app .env.

    abra records it in `TYPE=<recipe>:<version>` (updated on each `app new`/`app deploy
    <version>`). Returns the `<version>` part, or None when the file, the TYPE line, or the
    version part is missing.
    """
    path = env_file(domain)
    if not os.path.isfile(path):
        return None
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line.startswith("TYPE="):
                val = line.split("=", 1)[1].strip().strip('"').strip("'")
                # "<recipe>:<version>" → version (everything after the first ':')
                if ":" in val:
                    v = val.split(":", 1)[1].strip()
                    return v or None
    return None


def is_deployed(domain: str) -> bool:
    """True when the app's swarm stack currently has at least one service."""
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    return bool(lifecycle._docker_names("service", stack))  # noqa: SLF001


def health_code(spec: dict) -> int:
    """Probe the app over HTTPS on 127.0.0.1 and return the HTTP status code.

    Returns 0 when curl prints no/unparsable output or the probe subprocess times out, so a
    failed probe is simply "not healthy" rather than an exception.
    """
    # health is probed on `health_domain` (defaults to the app domain). For traefik the app domain
    # (traefik.ci…) has no route of its own — health is a ROUTED host (e.g. the dashboard
    # ci.commoninternet.net), so a 200 proves traefik is up + routing + TLS-terminating.
    domain = spec.get("health_domain", spec["domain"])
    try:
        r = _run(
            [
                "curl", "-sk", "-o", "/dev/null", "-w", "%{http_code}",
                "--max-time", "10",
                "--resolve", f"{domain}:443:127.0.0.1",
                f"https://{domain}{spec['health_path']}",
            ],
            timeout=20,
        )
    except subprocess.TimeoutExpired:
        # curl is capped at 10s itself; if the wrapper timeout still fires, count it as a miss.
        return 0
    try:
        return int(r.stdout.strip() or "0")
    except ValueError:
        return 0


def wait_healthy(spec: dict, timeout: int | None = None) -> bool:
    """Poll health_code every 10s until it is one of spec["health_ok"].

    `timeout` (seconds) defaults to spec["health_timeout"] when None or 0. Returns True as soon as
    a probe succeeds, False once the deadline passes.
    """
    deadline = time.time() + (timeout or spec["health_timeout"])
    while time.time() < deadline:
        if health_code(spec) in tuple(spec["health_ok"]):
            return True
        time.sleep(10)
    return False


def release_notes(recipe: str, version: str) -> str:
    """Release-notes text for `version` from the recipe's `releaseNotes` directory.

    Tries `<version>.md` then `<version>` and returns the first file that exists; returns "" when
    neither exists or the file cannot be read.
    """
    base = os.path.join(_recipe_dir(recipe), "releaseNotes")
    for name in (f"{version}.md", version):
        p = os.path.join(base, name)
        if os.path.isfile(p):
            try:
                # Decode leniently: a stray non-UTF-8 byte must not abort the reconcile, and the
                # remaining text still has to reach the manual-migration check.
                with open(p, encoding="utf-8", errors="replace") as f:
                    return f.read()
            except OSError:
                return ""
    return ""


def deploy_version(recipe: str, domain: str, version: str, timeout: int) -> None:
    """Deploy a specific published version.

    Checks out the tag (so the on-disk tree matches) then does a pinned non-chaos redeploy with
    the version positional (so abra records TYPE=<recipe>:<version>). `-f` makes it idempotent
    against an already-deployed app. abra writes FATA to stdout, so both streams are included in
    the error.

    Raises RuntimeError when `abra app deploy` exits non-zero.
    """
    abra.recipe_checkout(recipe, version)
    r = _run(["abra", "app", "deploy", domain, version, "-o", "-n", "-f"], timeout=timeout)
    if r.returncode != 0:
        msg = (r.stderr.strip() + " " + r.stdout.strip()).strip()[:400]
        raise RuntimeError(f"deploy {domain} {version} failed: {msg}")


def wait_undeployed(domain: str, timeout: int = 120) -> None:
    """Block until the app's swarm stack is fully removed after an undeploy.

    abra's undeploy may return before swarm finishes tearing down tasks; snapshot/restore (which
    require undeployed) and an immediate redeploy of the same stack name otherwise race a
    half-removed stack.

    Raises RuntimeError if services are still listed after `timeout` seconds.
    """
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not lifecycle._docker_names("service", stack):  # noqa: SLF001
            return
        time.sleep(2)
    raise RuntimeError(f"{domain} stack not fully undeployed after {timeout}s")


# --------------------------------------------------------------------------- last-good + alerts


def last_good_path(recipe: str) -> str:
    """Path of the file holding the last known-good version for `recipe`."""
    return os.path.join(warmsnap.app_dir(recipe), "last_good")


def read_last_good(recipe: str) -> str | None:
    """Return the recorded last-good version, or None if the file is missing/unreadable/empty."""
    try:
        with open(last_good_path(recipe)) as f:
            return f.read().strip() or None
    except OSError:
        return None


def write_last_good(recipe: str, version: str) -> None:
    """Record `version` as last-good (written to a temp file, then moved into place)."""
    os.makedirs(warmsnap.app_dir(recipe), exist_ok=True)
    tmp = last_good_path(recipe) + ".tmp"
    with open(tmp, "w") as f:
        f.write(version)
    os.replace(tmp, last_good_path(recipe))


def write_alert(app: str, reason: str, **fields) -> str:
    """Write a sentinel JSON alert under /var/lib/ci-warm/alerts/ for the Builder loop to relay.

    The record holds `app`, `reason`, a UTC timestamp `ts` and any extra `fields`. The file name
    is `<ts>-<app>-<reason>.json`. Returns the path written.
    """
    os.makedirs(ALERTS_DIR, exist_ok=True)
    ts = time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())
    rec = {"app": app, "reason": reason, "ts": ts, **fields}
    path = os.path.join(ALERTS_DIR, f"{ts}-{app}-{reason}.json")
    tmp = path + ".tmp"
    with open(tmp, "w") as f:
        json.dump(rec, f, indent=2)
    os.replace(tmp, path)
    print(f"ALERT[{reason}] {app}: {fields}", flush=True)
    return path


# --------------------------------------------------------------------------- reconcile


def ensure_server() -> None:
    """Add the local abra server when `abra server ls` fails."""
    if _run(["abra", "server", "ls", "-m", "-n"], timeout=30).returncode != 0:
        _run(["abra", "server", "add", "--local", "-n"], timeout=60)


def ensure_app_config(recipe: str, domain: str, version: str) -> None:
    """Default app config: create the app .env if absent, then set DOMAIN / LETS_ENCRYPT_ENV.

    Raises subprocess.CalledProcessError if `abra app new` fails.
    """
    if not os.path.isfile(env_file(domain)):
        _run(
            ["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
            timeout=120,
            check=True,
        )
    # NOTE(review): applied on every run, not only right after `app new` — confirm this matches
    # the intended nesting.
    abra.env_set(domain, "DOMAIN", domain)
    abra.env_set(domain, "LETS_ENCRYPT_ENV", "")


def ensure_secrets(domain: str) -> None:
    """Generate the app's secrets unless an `_admin_password_v1` secret already exists."""
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    have = set(lifecycle._docker_names("secret", stack))  # noqa: SLF001
    if not any(n.endswith("_admin_password_v1") for n in have):
        abra.secret_generate(domain)


def reconcile(app: str) -> str:
    """Converge `app` (a SPECS key) to the latest published recipe version.

    Returns a short result string: `deployed-fresh:…`, `noop-healthy:…`, `redeployed-healthy:…`,
    `unhealthy:…`, `held-major:…`, `held-manual-migration:…`, `upgraded:…` or `rolled-back:…`.

    Raises RuntimeError when the recipe has no version tags, a fresh deploy never becomes
    healthy, or a rollback does not recover; exceptions from the underlying helpers propagate.
    """
    spec = SPECS[app]
    recipe, domain = spec["recipe"], spec["domain"]
    dt, stateful = spec["deploy_timeout"], spec["stateful"]

    ensure_server()
    fetch_recipe(recipe)
    tags = recipe_tags(recipe)
    latest = latest_version(tags)
    if not latest:
        raise RuntimeError(f"no version tags for {recipe}")

    # Per-app config/secrets: a spec may provide its own `setup` (traefik's cert/file-provider
    # wiring); otherwise the default keycloak-shaped path (app new + DOMAIN/LETS_ENCRYPT +
    # generate secrets).
    setup = spec.get("setup")
    if setup:
        setup(recipe, domain, latest)
    else:
        ensure_app_config(recipe, domain, latest)
        ensure_secrets(domain)

    current = current_version(domain)
    deployed = is_deployed(domain)

    # Fresh deploy (nothing running) — deploy current-pinned (or latest if never deployed).
    if not deployed:
        target = current or latest
        print(f"[{app}] not deployed → fresh deploy {target}", flush=True)
        deploy_version(recipe, domain, target, dt)
        if not wait_healthy(spec):
            raise RuntimeError(f"{app} fresh deploy {target} did not become healthy")
        write_last_good(recipe, target)
        return f"deployed-fresh:{target}"

    # Deployed & already on latest → converge to a no-op (commit last-good if healthy).
    if current == latest:
        if wait_healthy(spec, timeout=60):
            write_last_good(recipe, latest)
            print(f"[{app}] already on latest {latest} and healthy — no-op", flush=True)
            return f"noop-healthy:{latest}"
        # On latest but unhealthy: try a redeploy; if still bad, alert (rollback target
        # unknown/same).
        print(f"[{app}] on latest {latest} but UNHEALTHY → redeploy", flush=True)
        deploy_version(recipe, domain, latest, dt)
        if wait_healthy(spec):
            write_last_good(recipe, latest)
            return f"redeployed-healthy:{latest}"
        write_alert(app, "unhealthy-on-latest", version=latest)
        return f"unhealthy:{latest}"

    # --- An upgrade current→latest is available ---
    # WC1.2 pre-deploy SAFETY gate (runs BEFORE any snapshot/deploy).
    notes = release_notes(recipe, latest)
    if is_major_bump(current or "0", latest):
        write_alert(app, "held-major", current=current, latest=latest, release_notes=notes[:4000])
        return f"held-major:{current}->{latest}"
    if notes_flag_manual_migration(notes):
        write_alert(
            app, "held-manual-migration", current=current, latest=latest, release_notes=notes[:4000]
        )
        return f"held-manual-migration:{current}->{latest}"

    # WC1.1 health-gated upgrade with rollback.
    last_good = current
    print(f"[{app}] auto-upgrade {last_good} → {latest} (health-gated)", flush=True)
    if stateful:
        abra.undeploy(domain)
        wait_undeployed(domain)
        warmsnap.snapshot(recipe, domain, version=last_good)
    # snapshot requires undeployed; now bring up latest.

    # A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a
    # crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a
    # deploy exception the same as an unhealthy result.
    upgrade_ok = False
    try:
        deploy_version(recipe, domain, latest, dt)
        upgrade_ok = wait_healthy(spec)
    except Exception as e:  # noqa: BLE001 — a broken release must trigger rollback, not crash the unit
        print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True)
        upgrade_ok = False

    if upgrade_ok:
        write_last_good(recipe, latest)
        print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True)
        return f"upgraded:{last_good}->{latest}"

    # Unhealthy → roll back.
    print(f"[{app}] latest {latest} UNHEALTHY → rolling back to {last_good}", flush=True)
    try:
        if stateful:
            abra.undeploy(domain)
            wait_undeployed(domain)
            warmsnap.restore(recipe, domain)
        deploy_version(recipe, domain, last_good, dt)
        recovered = wait_healthy(spec)
    except Exception as e:  # noqa: BLE001 — alert first, then let the failure propagate
        # A rollback that itself blows up is the case the operator most needs to hear about, so
        # the sentinel is written before re-raising.
        write_alert(
            app,
            "rollback",
            last_good=last_good,
            attempted=latest,
            recovered=False,
            error=str(e)[:400],
            release_notes=notes[:2000],
        )
        raise
    write_alert(
        app,
        "rollback",
        last_good=last_good,
        attempted=latest,
        recovered=recovered,
        release_notes=notes[:2000],
    )
    if not recovered:
        raise RuntimeError(f"{app} rollback to {last_good} did not become healthy")
    return f"rolled-back:{latest}->{last_good}"


def main(argv) -> int:
    """CLI entry point: `argv[1]` must be a SPECS key. Returns 0 on success, 2 on bad usage."""
    if len(argv) != 2 or argv[1] not in SPECS:
        print(f"usage: warm_reconcile.py <{'|'.join(SPECS)}>", file=sys.stderr)
        return 2
    result = reconcile(argv[1])
    print(f"RECONCILE RESULT: {result}", flush=True)
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv))