From 1b8d26b504ef751c35e8e2569e340e96403559d4 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Thu, 28 May 2026 23:26:02 +0100 Subject: [PATCH] feat(2w): W0.2 live-warm keycloak dep mode in orchestrator (WC1) - runner/harness/warm.py: stable-domain scheme (warm-<recipe>), is_warm_up probe, live_app_hexes scan, per-run realm_for naming, reap_orphan_realms. - run_recipe_ci.py: split declared deps into live-warm (shared provider + per-run realm, no deploy, realm deleted at teardown) vs cold (co-deploy). Warm path used only when provider is up; cold fallback otherwise. Reap orphan realms at run start (concurrency-safe). deploy-count excludes warm deps. Realm naming now per-run namespaced (<parent>-<6hex>). - dependent tests assert the namespaced realm pattern (stronger than ==parent). Live proof on warm keycloak: realm create -> password-grant JWT -> discovery issuer -> delete(idempotent) -> reap(keeps live hex, deletes orphan): PASS. 43 unit pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- runner/harness/warm.py | 109 ++++++++++++++++++ runner/run_recipe_ci.py | 86 ++++++++++---- .../functional/test_oidc_with_keycloak.py | 6 +- .../functional/test_oidc_with_keycloak.py | 7 +- 4 files changed, 184 insertions(+), 24 deletions(-) create mode 100644 runner/harness/warm.py diff --git a/runner/harness/warm.py b/runner/harness/warm.py new file mode 100644 index 0000000..ef9935b --- /dev/null +++ b/runner/harness/warm.py @@ -0,0 +1,109 @@ +"""Warm-infrastructure harness primitive (Phase 2w / WC1+). + +Phase 2w keeps a small set of apps "warm" at STABLE domains (distinct from the cold per-run +`<prefix>-<6hex>` scheme — see DECISIONS.md Phase-2w): + +- **live-warm** — actually deployed and running (keycloak today): a shared SSO provider that + dependent runs use instead of co-deploying a fresh provider. The per-run *realm* (not the app) is + the isolation unit — created at run start, deleted at run end (see harness.sso WC1 helpers).
+- **data-warm** (W1+) — undeployed-when-idle canonicals whose data volume is retained. + +This module owns the stable-domain scheme + the "is the warm provider actually usable right now?" +probe + the live-app-hex scan used to reap orphan realms concurrency-safely. It deliberately does NOT +deploy the warm provider — that's the declarative Nix reconciler's job (nix/modules/warm-keycloak.nix). +The harness only *uses* a warm provider when one is up, and falls back to cold co-deploy otherwise. +""" + +from __future__ import annotations + +import re +import ssl +import subprocess +import urllib.error +import urllib.request + +# Recipes that, when declared as a dep, are served from a shared live-warm instance at a stable +# domain instead of being co-deployed per run. Maps dep-recipe -> stable domain. +WARM_DOMAINS = { + "keycloak": "warm-keycloak.ci.commoninternet.net", +} + +# Health probe per warm provider: (path, ok-codes). Mirrors the recipe_meta health contract. +_WARM_HEALTH = { + "keycloak": ("/realms/master", (200,)), +} + +_CTX = ssl.create_default_context() +_CTX.check_hostname = False +_CTX.verify_mode = ssl.CERT_NONE + +# A cold per-run stack name looks like "<prefix>-<6hex>_ci_commoninternet_net_<service>"; extract the hex. +_STACK_HEX_RE = re.compile(r"^[a-z0-9]{1,4}-([0-9a-f]{6})_ci_commoninternet_net_") + + +def warm_domain(recipe: str) -> str | None: + """The stable warm domain for a dep recipe, or None if this recipe is not served warm.""" + return WARM_DOMAINS.get(recipe) + + +def is_warm_up(recipe: str, domain: str | None = None, timeout: int = 10) -> bool: + """True iff the warm provider for `recipe` answers its health endpoint right now. Used to decide + whether to use the warm path or fall back to cold co-deploy.
Conservative: any error → False.""" + domain = domain or warm_domain(recipe) + if not domain: + return False + path, ok = _WARM_HEALTH.get(recipe, ("/", (200, 301, 302))) + req = urllib.request.Request(f"https://{domain}{path}", method="GET") + try: + with urllib.request.urlopen(req, timeout=timeout, context=_CTX) as r: + return r.status in ok + except urllib.error.HTTPError as e: + return e.code in ok + except Exception: # noqa: BLE001 — down / unreachable / TLS / DNS → not usable + return False + + +def live_app_hexes() -> set[str]: + """The set of 6hex suffixes of currently-deployed cold per-run app stacks. Used to reap orphan + realms safely: a realm whose hex maps to a live stack belongs to an in-flight run and is kept. + Reads docker service names directly so it works even when an app's .env was already removed.""" + out: set[str] = set() + try: + res = subprocess.run( + ["docker", "service", "ls", "--format", "{{.Name}}"], + capture_output=True, + text=True, + timeout=30, + ) + except Exception: # noqa: BLE001 + return out + for name in res.stdout.splitlines(): + m = _STACK_HEX_RE.match(name.strip()) + if m: + out.add(m.group(1)) + return out + + +def reap_orphan_realms(recipe: str, domain: str | None = None) -> list[str]: + """Reap per-run realms on the warm provider left behind by crashed/killed dependent runs. Safe + under concurrency: realms whose hex maps to a currently-live app stack are kept. Returns the + realms actually deleted; [] on any error (best-effort run-start cleanup, never fatal).""" + domain = domain or warm_domain(recipe) + if recipe != "keycloak" or not domain: + return [] + from . 
import sso # local import avoids import cycle at module load + + try: + return sso.reap_orphaned_realms(domain, live_app_hexes()) + except Exception: # noqa: BLE001 — reaping is hygiene, not correctness-critical + return [] + + +def realm_for(parent_recipe: str, parent_domain: str) -> str: + """The per-run realm name for a dependent run: "<parent_recipe>-<6hex>" where the 6hex is the + parent's per-run domain label suffix. Unique per (parent, pr, ref) so concurrent dependents never + collide on a shared keycloak, and traceable back to the app stack for reaping/debugging.""" + label = parent_domain.split(".", 1)[0] # "lasu-0a6fb2" + m = re.search(r"-([0-9a-f]{6})$", label) + suffix = m.group(1) if m else label + return f"{parent_recipe}-{suffix}" diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index c4689df..4125915 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -40,7 +40,7 @@ import tempfile ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(ROOT, "runner")) -from harness import deps as deps_mod, discovery, generic, lifecycle, naming # noqa: E402 +from harness import deps as deps_mod, discovery, generic, lifecycle, naming, warm # noqa: E402 ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom") @@ -298,7 +298,7 @@ def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) -> Provider routing: today only `keycloak` is supported. authentik will need a parallel `setup_authentik_realm` when an authentik-dep recipe enrolls (DEFERRED.md #9). """ - from harness import sso # local import — sso may not be needed for dep-less runs + from harness import sso, warm # local import — sso may not be needed for dep-less runs out: dict[str, dict] = {} for entry in deps_list or []: @@ -311,9 +311,11 @@ def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) -> # raise if they need realm/client info they don't see.
out[dep_recipe] = entry continue - # The realm/client name uses the parent recipe name so collisions across parents are - # impossible on a shared keycloak (and the values are predictable for debugging). - realm = parent_recipe + # The realm is the per-run isolation unit on a (possibly shared live-warm) keycloak: name it + # "<parent_recipe>-<6hex>" so concurrent dependents — even two PRs of the SAME recipe — never + # collide on a realm (WC1). client_id stays the parent recipe name (isolated within the + # unique realm; predictable for debugging). + realm = warm.realm_for(parent_recipe, parent_domain) client_id = parent_recipe creds = sso.setup_keycloak_realm( dep_domain, @@ -325,6 +327,7 @@ def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) -> out[dep_recipe] = { "recipe": dep_recipe, "domain": dep_domain, + "warm": bool(entry.get("warm")), "realm": creds["realm"], "client_id": creds["client_id"], "client_secret": creds["client_secret"], @@ -519,12 +522,39 @@ def main() -> int: if declared: print("\n===== setup_custom_tests: deps + OIDC wiring =====", flush=True) try: - dep_metas = {d: _load_meta(d) for d in declared} - deps_list = deps_mod.deploy_deps( - recipe, os.environ.get("PR", "0"), ref, declared, meta_for=dep_metas + # WC1: split deps into live-warm (shared provider at a stable domain + per-run + # realm) vs cold (co-deploy per run). A warm dep is used ONLY if its provider is + # actually up right now; otherwise it falls back to cold so a from-scratch host + # (before the warm reconciler has run) still works. + warm_deps, cold_deps = [], [] + for d in declared: + wd = warm.warm_domain(d) + if wd and warm.is_warm_up(d, wd): + warm_deps.append(d) + else: + if wd: + print(f" dep: {d} warm provider {wd} not up — cold fallback", flush=True) + cold_deps.append(d) + # Cold deps: co-deploy per run (existing path).
+ dep_metas = {d: _load_meta(d) for d in cold_deps} + deps_list = ( + deps_mod.deploy_deps( + recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=dep_metas + ) + if cold_deps + else [] ) - # Enrich each dep entry with SSO creds (realm/client/secret) by setting up a - # keycloak realm per dep. The dict form is what setup_custom_tests.sh reads. + # Warm deps: no deploy. Reap orphan realms first (concurrency-safe), then point + # at the stable domain; _enrich creates the per-run realm on it. + for d in warm_deps: + wd = warm.warm_domain(d) + reaped = warm.reap_orphan_realms(d, wd) + if reaped: + print(f" dep: reaped {len(reaped)} orphan realm(s) on warm {d}: {reaped}", flush=True) + deps_list.append({"recipe": d, "domain": wd, "warm": True}) + print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True) + # Enrich each dep entry with SSO creds (realm/client/secret). The dict form is + # what setup_custom_tests.sh reads. deps_state = _enrich_deps_with_sso(recipe, domain, deps_list) deps_mod.write_run_state(deps_state) # Run the per-recipe post-deps hook (jq-driven OIDC wiring + in-place redeploy) @@ -558,14 +588,26 @@ def main() -> int: lifecycle.teardown_app(domain, verify=False) if deps_state: print("\n===== DEPS teardown =====", flush=True) + # Flatten the dict-shape state in declaration order; teardown_deps reverses for cold. + if isinstance(deps_state, dict): + ordered = [deps_state[d] for d in declared if d in deps_state] + else: + ordered = deps_state + # WC1: warm deps are NOT undeployed — we only delete the per-run realm on the shared + # live-warm provider (the app stays up for the next run). Cold deps undeploy as before. 
+ warm_entries = [e for e in ordered if e.get("warm")] + cold_entries = [e for e in ordered if not e.get("warm")] + for e in warm_entries: + try: + from harness import sso + + sso.delete_keycloak_realm(e["domain"], e["realm"]) + print(f" dep: deleted per-run realm {e['realm']} on warm {e['recipe']}", flush=True) + except Exception as ex: # noqa: BLE001 — a leaked realm is a teardown failure (§9) + dep_teardown_error = f"warm realm delete failed for {e.get('realm')}: {ex}" + print(f"!! {dep_teardown_error}", flush=True) try: - # teardown_deps accepts a list of entries; flatten the dict-shape state in - # declaration-reverse order so teardown sequencing matches §1's contract. - if isinstance(deps_state, dict): - list_for_teardown = [deps_state[d] for d in declared if d in deps_state] - else: - list_for_teardown = deps_state - deps_mod.teardown_deps(list_for_teardown) + deps_mod.teardown_deps(cold_entries) except lifecycle.TeardownError as e: dep_teardown_error = str(e) print(f"!! {dep_teardown_error}", flush=True) @@ -590,10 +632,12 @@ def main() -> int: # ---- per-op summary (DG6 feed) ---- # SSO-dep plan §1: DG4.1 generalised — one `abra app new` per app in the run (recipe + each - # dep). In-place reconfigure-and-redeploy (the setup_custom_tests step's - # `abra app deploy --force --chaos`) is NOT a fresh `app_new` and does NOT increment the - # count. So expected = 1 + (number of deps that actually got deployed). - deps_deployed_count = len(deps_state) if isinstance(deps_state, dict) else len(deps_state or []) + # COLD dep). In-place reconfigure-and-redeploy (the setup_custom_tests step's + # `abra app deploy --force --chaos`) is NOT a fresh `app_new` and does NOT increment the count. + # WC1: a live-warm dep (keycloak) is NOT deployed by the run — it only gets a per-run realm — so + # warm deps contribute 0. So expected = 1 + (number of COLD deps that actually got deployed). 
+ _dep_entries = deps_state.values() if isinstance(deps_state, dict) else (deps_state or []) + deps_deployed_count = sum(1 for e in _dep_entries if not (isinstance(e, dict) and e.get("warm"))) expected_deploy_count = 1 + deps_deployed_count print("\n===== RUN SUMMARY =====", flush=True) print(f"deploy-count = {deploy_count} (expect {expected_deploy_count})") diff --git a/tests/lasuite-docs/functional/test_oidc_with_keycloak.py b/tests/lasuite-docs/functional/test_oidc_with_keycloak.py index 9bfa731..c5fb875 100644 --- a/tests/lasuite-docs/functional/test_oidc_with_keycloak.py +++ b/tests/lasuite-docs/functional/test_oidc_with_keycloak.py @@ -15,6 +15,7 @@ from __future__ import annotations import base64 import json import os +import re import sys import time @@ -40,7 +41,10 @@ def test_oidc_password_grant_against_dep_keycloak(live_app, deps_creds): # Sanity-check the creds shape — orchestrator-written assert kc["domain"] - assert kc["realm"] == "lasuite-docs" # orchestrator names the realm after the parent recipe + # WC1: realm is per-run namespaced "<parent>-<6hex>" so concurrent dependents never collide.
+ assert re.fullmatch(r"lasuite-docs-[0-9a-f]{6}", kc["realm"]), ( + f"realm {kc['realm']!r} not the per-run namespaced form lasuite-docs-<6hex>" + ) assert kc["client_id"] == "lasuite-docs" assert isinstance(kc["client_secret"], str) and len(kc["client_secret"]) >= 16 assert isinstance(kc["password"], str) and len(kc["password"]) >= 16 diff --git a/tests/lasuite-drive/functional/test_oidc_with_keycloak.py b/tests/lasuite-drive/functional/test_oidc_with_keycloak.py index ce3069d..eb2820f 100644 --- a/tests/lasuite-drive/functional/test_oidc_with_keycloak.py +++ b/tests/lasuite-drive/functional/test_oidc_with_keycloak.py @@ -20,6 +20,7 @@ from __future__ import annotations import base64 import json import os +import re import sys import time @@ -43,9 +44,11 @@ def test_oidc_password_grant_against_dep_keycloak(live_app, deps_creds): ) kc = deps_creds["keycloak"] - # Creds shape — orchestrator names the realm + client after the parent recipe. + # Creds shape. WC1: realm is per-run namespaced "<parent>-<6hex>"; client_id stays the parent. assert kc["domain"] - assert kc["realm"] == "lasuite-drive" + assert re.fullmatch(r"lasuite-drive-[0-9a-f]{6}", kc["realm"]), ( + f"realm {kc['realm']!r} not the per-run namespaced form lasuite-drive-<6hex>" + ) assert kc["client_id"] == "lasuite-drive" assert isinstance(kc["client_secret"], str) and len(kc["client_secret"]) >= 16 assert isinstance(kc["password"], str) and len(kc["password"]) >= 16