feat(2w): W0.2 live-warm keycloak dep mode in orchestrator (WC1)
- runner/harness/warm.py: stable-domain scheme (warm-<recipe>), is_warm_up probe, live_app_hexes scan, per-run realm_for naming, reap_orphan_realms. - run_recipe_ci.py: split declared deps into live-warm (shared provider + per-run realm, no deploy, realm deleted at teardown) vs cold (co-deploy). Warm path used only when provider is up; cold fallback otherwise. Reap orphan realms at run start (concurrency-safe). deploy-count excludes warm deps. Realm naming now per-run namespaced (<parent>-<6hex>). - dependent tests assert the namespaced realm pattern (stronger than ==parent). Live proof on warm keycloak: realm create -> password-grant JWT -> discovery issuer -> delete(idempotent) -> reap(keeps live hex, deletes orphan): PASS. 43 unit pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
109
runner/harness/warm.py
Normal file
109
runner/harness/warm.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
"""Warm-infrastructure harness primitive (Phase 2w / WC1+).
|
||||||
|
|
||||||
|
Phase 2w keeps a small set of apps "warm" at STABLE domains (distinct from the cold per-run
|
||||||
|
`<recipe[:4]>-<6hex>` scheme — see DECISIONS.md Phase-2w):
|
||||||
|
|
||||||
|
- **live-warm** — actually deployed and running (keycloak today): a shared SSO provider that
|
||||||
|
dependent runs use instead of co-deploying a fresh provider. The per-run *realm* (not the app) is
|
||||||
|
the isolation unit — created at run start, deleted at run end (see harness.sso WC1 helpers).
|
||||||
|
- **data-warm** (W1+) — undeployed-when-idle canonicals whose data volume is retained.
|
||||||
|
|
||||||
|
This module owns the stable-domain scheme + the "is the warm provider actually usable right now?"
|
||||||
|
probe + the live-app-hex scan used to reap orphan realms concurrency-safely. It deliberately does NOT
|
||||||
|
deploy the warm provider — that's the declarative Nix reconciler's job (nix/modules/warm-keycloak.nix).
|
||||||
|
The harness only *uses* a warm provider when one is up, and falls back to cold co-deploy otherwise.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import ssl
|
||||||
|
import subprocess
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
# Recipes that, when declared as a dep, are served from a shared live-warm instance at a stable
|
||||||
|
# domain instead of being co-deployed per run. Maps dep-recipe -> stable domain.
|
||||||
|
WARM_DOMAINS = {
|
||||||
|
"keycloak": "warm-keycloak.ci.commoninternet.net",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Health probe per warm provider: (path, ok-codes). Mirrors the recipe_meta health contract.
|
||||||
|
_WARM_HEALTH = {
|
||||||
|
"keycloak": ("/realms/master", (200,)),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Shared TLS context handed to urlopen by the warm health probe. Hostname checking and
# certificate verification are both switched off, so the probe accepts whatever certificate the
# warm domain presents.
# NOTE(review): presumably acceptable because the probe only tests liveness — confirm, and avoid
# reusing this context for requests that carry credentials.
_CTX = ssl.create_default_context()
|
||||||
|
_CTX.check_hostname = False
|
||||||
|
_CTX.verify_mode = ssl.CERT_NONE
|
||||||
|
|
||||||
|
# A cold per-run stack name looks like "<tag>-<6hex>_ci_commoninternet_net_<svc>"; extract the hex.
|
||||||
|
_STACK_HEX_RE = re.compile(r"^[a-z0-9]{1,4}-([0-9a-f]{6})_ci_commoninternet_net_")
|
||||||
|
|
||||||
|
|
||||||
|
def warm_domain(recipe: str) -> str | None:
    """Look up the stable warm domain registered for a dep recipe.

    Returns the entry from ``WARM_DOMAINS`` for ``recipe``, or ``None`` when the recipe has no
    entry there (i.e. it is not served warm).
    """
    stable_domain = WARM_DOMAINS.get(recipe, None)
    return stable_domain
|
||||||
|
|
||||||
|
|
||||||
|
def is_warm_up(recipe: str, domain: str | None = None, timeout: int = 10) -> bool:
    """Report whether the warm provider for ``recipe`` answers its health endpoint right now.

    The caller uses the answer to choose between the warm path and a cold co-deploy.

    Args:
        recipe: Dep recipe name; selects the health path and accepted status codes from
            ``_WARM_HEALTH`` (falling back to ``/`` with 200/301/302).
        domain: Domain to probe; when falsy, the recipe's registered warm domain is used.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        True when the HTTPS GET yields an accepted status code (including one delivered via
        ``HTTPError``); False when there is no domain to probe or on any other failure.
    """
    host = domain if domain else warm_domain(recipe)
    if not host:
        return False

    health_path, accepted = _WARM_HEALTH.get(recipe, ("/", (200, 301, 302)))
    probe = urllib.request.Request(f"https://{host}{health_path}", method="GET")

    try:
        with urllib.request.urlopen(probe, timeout=timeout, context=_CTX) as response:
            status = response.status
    except urllib.error.HTTPError as http_err:
        # urlopen raises for error statuses; the code may still be one we accept.
        status = http_err.code
    except Exception:  # noqa: BLE001 — down / unreachable / TLS / DNS → not usable
        return False
    return status in accepted
|
||||||
|
|
||||||
|
|
||||||
|
def live_app_hexes() -> set[str]:
    """Collect the 6-hex suffixes of service names reported by ``docker service ls``.

    Each reported name is stripped and matched against ``_STACK_HEX_RE``; the captured hex of
    every match goes into the result. The orphan-realm reaper uses this set to decide which
    per-run realms still belong to a deployed stack.

    Returns:
        The set of captured hex strings. An empty set is returned when running docker raises
        (missing binary, timeout, ...). The command's exit status is not inspected, so a failed
        listing with empty output also yields an empty set.
    """
    try:
        listing = subprocess.run(
            ["docker", "service", "ls", "--format", "{{.Name}}"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except Exception:  # noqa: BLE001
        return set()

    candidates = (_STACK_HEX_RE.match(line.strip()) for line in listing.stdout.splitlines())
    return {hit.group(1) for hit in candidates if hit}
|
||||||
|
|
||||||
|
|
||||||
|
def reap_orphan_realms(recipe: str, domain: str | None = None) -> list[str]:
    """Reap per-run realms left on the warm provider by crashed/killed dependent runs.

    Best-effort run-start hygiene: never fatal, and deliberately fail-safe. Realms are only
    reaped when the live-stack scan produced at least one hex; an empty scan is treated as
    "unknown" rather than "nothing is live".

    Args:
        recipe: Dep recipe name. Only ``"keycloak"`` is handled; anything else is a no-op.
        domain: Warm provider domain; when falsy, the recipe's registered warm domain is used.

    Returns:
        Whatever ``sso.reap_orphaned_realms`` returns (presumably the names of the realms it
        deleted — confirm against harness.sso); ``[]`` when the recipe is unsupported, no domain
        is known, the live-stack scan is empty, or any error occurs while scanning or reaping.
    """
    domain = domain or warm_domain(recipe)
    if recipe != "keycloak" or not domain:
        return []

    from . import sso  # local import avoids import cycle at module load

    try:
        live = live_app_hexes()
        if not live:
            # live_app_hexes() returns an empty set both when nothing is deployed and when the
            # docker scan failed or timed out. Handing an empty keep-set to the reaper would let
            # it treat every per-run realm as an orphan, including those of in-flight runs.
            # NOTE(review): assumes a dependent's own stack is already deployed by the time it
            # reaps (the keep-by-live-hex scheme relies on that), so a healthy scan is never
            # empty here — confirm against the orchestrator's stage order.
            return []
        return sso.reap_orphaned_realms(domain, live)
    except Exception:  # noqa: BLE001 — reaping is hygiene, not correctness-critical
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def realm_for(parent_recipe: str, parent_domain: str) -> str:
    """Build the per-run realm name for a dependent run.

    The first DNS label of ``parent_domain`` (e.g. ``"lasu-0a6fb2"``) is inspected for a trailing
    ``-<6 lowercase hex>`` suffix. When present, the realm is ``"<parent_recipe>-<6hex>"``;
    otherwise the whole label is used as the suffix.

    Args:
        parent_recipe: Recipe name of the dependent (parent) app; becomes the realm prefix.
        parent_domain: The parent's per-run domain.

    Returns:
        The realm name string.
    """
    first_label, _, _ = parent_domain.partition(".")
    hex_match = re.search(r"-([0-9a-f]{6})$", first_label)
    if hex_match is None:
        # No per-run hex in the label: fall back to the full label so the name is still derived
        # from the parent's domain.
        return f"{parent_recipe}-{first_label}"
    return f"{parent_recipe}-{hex_match.group(1)}"
|
||||||
@ -40,7 +40,7 @@ import tempfile
|
|||||||
|
|
||||||
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
sys.path.insert(0, os.path.join(ROOT, "runner"))
|
sys.path.insert(0, os.path.join(ROOT, "runner"))
|
||||||
from harness import deps as deps_mod, discovery, generic, lifecycle, naming # noqa: E402
|
from harness import deps as deps_mod, discovery, generic, lifecycle, naming, warm # noqa: E402
|
||||||
|
|
||||||
ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom")
|
ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom")
|
||||||
|
|
||||||
@ -298,7 +298,7 @@ def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) ->
|
|||||||
Provider routing: today only `keycloak` is supported. authentik will need a parallel
|
Provider routing: today only `keycloak` is supported. authentik will need a parallel
|
||||||
`setup_authentik_realm` when an authentik-dep recipe enrolls (DEFERRED.md #9).
|
`setup_authentik_realm` when an authentik-dep recipe enrolls (DEFERRED.md #9).
|
||||||
"""
|
"""
|
||||||
from harness import sso # local import — sso may not be needed for dep-less runs
|
from harness import sso, warm # local import — sso may not be needed for dep-less runs
|
||||||
|
|
||||||
out: dict[str, dict] = {}
|
out: dict[str, dict] = {}
|
||||||
for entry in deps_list or []:
|
for entry in deps_list or []:
|
||||||
@ -311,9 +311,11 @@ def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) ->
|
|||||||
# raise if they need realm/client info they don't see.
|
# raise if they need realm/client info they don't see.
|
||||||
out[dep_recipe] = entry
|
out[dep_recipe] = entry
|
||||||
continue
|
continue
|
||||||
# The realm/client name uses the parent recipe name so collisions across parents are
|
# The realm is the per-run isolation unit on a (possibly shared live-warm) keycloak: name it
|
||||||
# impossible on a shared keycloak (and the values are predictable for debugging).
|
# "<parent>-<6hex>" so concurrent dependents — even two PRs of the SAME recipe — never
|
||||||
realm = parent_recipe
|
# collide on a realm (WC1). client_id stays the parent recipe name (isolated within the
|
||||||
|
# unique realm; predictable for debugging).
|
||||||
|
realm = warm.realm_for(parent_recipe, parent_domain)
|
||||||
client_id = parent_recipe
|
client_id = parent_recipe
|
||||||
creds = sso.setup_keycloak_realm(
|
creds = sso.setup_keycloak_realm(
|
||||||
dep_domain,
|
dep_domain,
|
||||||
@ -325,6 +327,7 @@ def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) ->
|
|||||||
out[dep_recipe] = {
|
out[dep_recipe] = {
|
||||||
"recipe": dep_recipe,
|
"recipe": dep_recipe,
|
||||||
"domain": dep_domain,
|
"domain": dep_domain,
|
||||||
|
"warm": bool(entry.get("warm")),
|
||||||
"realm": creds["realm"],
|
"realm": creds["realm"],
|
||||||
"client_id": creds["client_id"],
|
"client_id": creds["client_id"],
|
||||||
"client_secret": creds["client_secret"],
|
"client_secret": creds["client_secret"],
|
||||||
@ -519,12 +522,39 @@ def main() -> int:
|
|||||||
if declared:
|
if declared:
|
||||||
print("\n===== setup_custom_tests: deps + OIDC wiring =====", flush=True)
|
print("\n===== setup_custom_tests: deps + OIDC wiring =====", flush=True)
|
||||||
try:
|
try:
|
||||||
dep_metas = {d: _load_meta(d) for d in declared}
|
# WC1: split deps into live-warm (shared provider at a stable domain + per-run
|
||||||
deps_list = deps_mod.deploy_deps(
|
# realm) vs cold (co-deploy per run). A warm dep is used ONLY if its provider is
|
||||||
recipe, os.environ.get("PR", "0"), ref, declared, meta_for=dep_metas
|
# actually up right now; otherwise it falls back to cold so a from-scratch host
|
||||||
|
# (before the warm reconciler has run) still works.
|
||||||
|
warm_deps, cold_deps = [], []
|
||||||
|
for d in declared:
|
||||||
|
wd = warm.warm_domain(d)
|
||||||
|
if wd and warm.is_warm_up(d, wd):
|
||||||
|
warm_deps.append(d)
|
||||||
|
else:
|
||||||
|
if wd:
|
||||||
|
print(f" dep: {d} warm provider {wd} not up — cold fallback", flush=True)
|
||||||
|
cold_deps.append(d)
|
||||||
|
# Cold deps: co-deploy per run (existing path).
|
||||||
|
dep_metas = {d: _load_meta(d) for d in cold_deps}
|
||||||
|
deps_list = (
|
||||||
|
deps_mod.deploy_deps(
|
||||||
|
recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=dep_metas
|
||||||
|
)
|
||||||
|
if cold_deps
|
||||||
|
else []
|
||||||
)
|
)
|
||||||
# Enrich each dep entry with SSO creds (realm/client/secret) by setting up a
|
# Warm deps: no deploy. Reap orphan realms first (concurrency-safe), then point
|
||||||
# keycloak realm per dep. The dict form is what setup_custom_tests.sh reads.
|
# at the stable domain; _enrich creates the per-run realm on it.
|
||||||
|
for d in warm_deps:
|
||||||
|
wd = warm.warm_domain(d)
|
||||||
|
reaped = warm.reap_orphan_realms(d, wd)
|
||||||
|
if reaped:
|
||||||
|
print(f" dep: reaped {len(reaped)} orphan realm(s) on warm {d}: {reaped}", flush=True)
|
||||||
|
deps_list.append({"recipe": d, "domain": wd, "warm": True})
|
||||||
|
print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True)
|
||||||
|
# Enrich each dep entry with SSO creds (realm/client/secret). The dict form is
|
||||||
|
# what setup_custom_tests.sh reads.
|
||||||
deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
|
deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
|
||||||
deps_mod.write_run_state(deps_state)
|
deps_mod.write_run_state(deps_state)
|
||||||
# Run the per-recipe post-deps hook (jq-driven OIDC wiring + in-place redeploy)
|
# Run the per-recipe post-deps hook (jq-driven OIDC wiring + in-place redeploy)
|
||||||
@ -558,14 +588,26 @@ def main() -> int:
|
|||||||
lifecycle.teardown_app(domain, verify=False)
|
lifecycle.teardown_app(domain, verify=False)
|
||||||
if deps_state:
|
if deps_state:
|
||||||
print("\n===== DEPS teardown =====", flush=True)
|
print("\n===== DEPS teardown =====", flush=True)
|
||||||
|
# Flatten the dict-shape state in declaration order; teardown_deps reverses for cold.
|
||||||
|
if isinstance(deps_state, dict):
|
||||||
|
ordered = [deps_state[d] for d in declared if d in deps_state]
|
||||||
|
else:
|
||||||
|
ordered = deps_state
|
||||||
|
# WC1: warm deps are NOT undeployed — we only delete the per-run realm on the shared
|
||||||
|
# live-warm provider (the app stays up for the next run). Cold deps undeploy as before.
|
||||||
|
warm_entries = [e for e in ordered if e.get("warm")]
|
||||||
|
cold_entries = [e for e in ordered if not e.get("warm")]
|
||||||
|
for e in warm_entries:
|
||||||
|
try:
|
||||||
|
from harness import sso
|
||||||
|
|
||||||
|
sso.delete_keycloak_realm(e["domain"], e["realm"])
|
||||||
|
print(f" dep: deleted per-run realm {e['realm']} on warm {e['recipe']}", flush=True)
|
||||||
|
except Exception as ex: # noqa: BLE001 — a leaked realm is a teardown failure (§9)
|
||||||
|
dep_teardown_error = f"warm realm delete failed for {e.get('realm')}: {ex}"
|
||||||
|
print(f"!! {dep_teardown_error}", flush=True)
|
||||||
try:
|
try:
|
||||||
# teardown_deps accepts a list of entries; flatten the dict-shape state in
|
deps_mod.teardown_deps(cold_entries)
|
||||||
# declaration-reverse order so teardown sequencing matches §1's contract.
|
|
||||||
if isinstance(deps_state, dict):
|
|
||||||
list_for_teardown = [deps_state[d] for d in declared if d in deps_state]
|
|
||||||
else:
|
|
||||||
list_for_teardown = deps_state
|
|
||||||
deps_mod.teardown_deps(list_for_teardown)
|
|
||||||
except lifecycle.TeardownError as e:
|
except lifecycle.TeardownError as e:
|
||||||
dep_teardown_error = str(e)
|
dep_teardown_error = str(e)
|
||||||
print(f"!! {dep_teardown_error}", flush=True)
|
print(f"!! {dep_teardown_error}", flush=True)
|
||||||
@ -590,10 +632,12 @@ def main() -> int:
|
|||||||
|
|
||||||
# ---- per-op summary (DG6 feed) ----
|
# ---- per-op summary (DG6 feed) ----
|
||||||
# SSO-dep plan §1: DG4.1 generalised — one `abra app new` per app in the run (recipe + each
|
# SSO-dep plan §1: DG4.1 generalised — one `abra app new` per app in the run (recipe + each
|
||||||
# dep). In-place reconfigure-and-redeploy (the setup_custom_tests step's
|
# COLD dep). In-place reconfigure-and-redeploy (the setup_custom_tests step's
|
||||||
# `abra app deploy --force --chaos`) is NOT a fresh `app_new` and does NOT increment the
|
# `abra app deploy --force --chaos`) is NOT a fresh `app_new` and does NOT increment the count.
|
||||||
# count. So expected = 1 + (number of deps that actually got deployed).
|
# WC1: a live-warm dep (keycloak) is NOT deployed by the run — it only gets a per-run realm — so
|
||||||
deps_deployed_count = len(deps_state) if isinstance(deps_state, dict) else len(deps_state or [])
|
# warm deps contribute 0. So expected = 1 + (number of COLD deps that actually got deployed).
|
||||||
|
_dep_entries = deps_state.values() if isinstance(deps_state, dict) else (deps_state or [])
|
||||||
|
deps_deployed_count = sum(1 for e in _dep_entries if not (isinstance(e, dict) and e.get("warm")))
|
||||||
expected_deploy_count = 1 + deps_deployed_count
|
expected_deploy_count = 1 + deps_deployed_count
|
||||||
print("\n===== RUN SUMMARY =====", flush=True)
|
print("\n===== RUN SUMMARY =====", flush=True)
|
||||||
print(f"deploy-count = {deploy_count} (expect {expected_deploy_count})")
|
print(f"deploy-count = {deploy_count} (expect {expected_deploy_count})")
|
||||||
|
|||||||
@ -15,6 +15,7 @@ from __future__ import annotations
|
|||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -40,7 +41,10 @@ def test_oidc_password_grant_against_dep_keycloak(live_app, deps_creds):
|
|||||||
|
|
||||||
# Sanity-check the creds shape — orchestrator-written
|
# Sanity-check the creds shape — orchestrator-written
|
||||||
assert kc["domain"]
|
assert kc["domain"]
|
||||||
assert kc["realm"] == "lasuite-docs" # orchestrator names the realm after the parent recipe
|
# WC1: realm is per-run namespaced "<parent>-<6hex>" so concurrent dependents never collide.
|
||||||
|
assert re.fullmatch(r"lasuite-docs-[0-9a-f]{6}", kc["realm"]), (
|
||||||
|
f"realm {kc['realm']!r} not the per-run namespaced form lasuite-docs-<6hex>"
|
||||||
|
)
|
||||||
assert kc["client_id"] == "lasuite-docs"
|
assert kc["client_id"] == "lasuite-docs"
|
||||||
assert isinstance(kc["client_secret"], str) and len(kc["client_secret"]) >= 16
|
assert isinstance(kc["client_secret"], str) and len(kc["client_secret"]) >= 16
|
||||||
assert isinstance(kc["password"], str) and len(kc["password"]) >= 16
|
assert isinstance(kc["password"], str) and len(kc["password"]) >= 16
|
||||||
|
|||||||
@ -20,6 +20,7 @@ from __future__ import annotations
|
|||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -43,9 +44,11 @@ def test_oidc_password_grant_against_dep_keycloak(live_app, deps_creds):
|
|||||||
)
|
)
|
||||||
kc = deps_creds["keycloak"]
|
kc = deps_creds["keycloak"]
|
||||||
|
|
||||||
# Creds shape — orchestrator names the realm + client after the parent recipe.
|
# Creds shape. WC1: realm is per-run namespaced "<parent>-<6hex>"; client_id stays the parent.
|
||||||
assert kc["domain"]
|
assert kc["domain"]
|
||||||
assert kc["realm"] == "lasuite-drive"
|
assert re.fullmatch(r"lasuite-drive-[0-9a-f]{6}", kc["realm"]), (
|
||||||
|
f"realm {kc['realm']!r} not the per-run namespaced form lasuite-drive-<6hex>"
|
||||||
|
)
|
||||||
assert kc["client_id"] == "lasuite-drive"
|
assert kc["client_id"] == "lasuite-drive"
|
||||||
assert isinstance(kc["client_secret"], str) and len(kc["client_secret"]) >= 16
|
assert isinstance(kc["client_secret"], str) and len(kc["client_secret"]) >= 16
|
||||||
assert isinstance(kc["password"], str) and len(kc["password"]) >= 16
|
assert isinstance(kc["password"], str) and len(kc["password"]) >= 16
|
||||||
|
|||||||
Reference in New Issue
Block a user