runner/harness/canonical.py: data-warm canonical registry + lifecycle — is_enrolled (recipe_meta.WARM_CANONICAL), canonical_domain (warm.stable_domain warm-<recipe>), registry read/write (/var/lib/ci-warm/<recipe>/canonical.json), has_canonical (record + retained volume), deploy_canonical (reattach volume at known-good version), undeploy_keep_volume (idle data-warm), seed_canonical (record + warmsnap snapshot). warm.stable_domain helper added (keycloak path unchanged). +4 unit tests (61 unit pass). Also archived the Adversary's verification alert sentinels to alerts/seen/ (simulated rollback + 2 holds — evidentiary, gate PASSED; dir clean for real alerts). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
117 lines
5.1 KiB
Python
117 lines
5.1 KiB
Python
"""Warm-infrastructure harness primitive (Phase 2w / WC1+).
|
|
|
|
Phase 2w keeps a small set of apps "warm" at STABLE domains (distinct from the cold per-run
`<recipe[:4]>-<6hex>` scheme — see DECISIONS.md Phase-2w):

- **live-warm** — actually deployed and running (keycloak today): a shared SSO provider that
  dependent runs use instead of co-deploying a fresh provider. The per-run *realm* (not the app)
  is the isolation unit — created at run start, deleted at run end (see harness.sso WC1 helpers).
- **data-warm** (W1+) — undeployed-when-idle canonicals whose data volume is retained.

This module owns the stable-domain scheme, the "is the warm provider actually usable right now?"
probe, and the live-app-hex scan used to reap orphan realms concurrency-safely. It deliberately
does NOT deploy the warm provider — that's the declarative Nix reconciler's job
(nix/modules/warm-keycloak.nix). The harness only *uses* a warm provider when one is up, and
falls back to cold co-deploy otherwise.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import ssl
|
|
import subprocess
|
|
import urllib.error
|
|
import urllib.request
|
|
|
|
# Dependency recipes that are NOT co-deployed per run: when one of these is declared as a dep it
# is served from a shared live-warm instance instead. Maps dep-recipe -> its stable domain.
WARM_DOMAINS = {
    "keycloak": "warm-keycloak.ci.commoninternet.net",
}

# Health probe for each warm provider: recipe -> (URL path, acceptable HTTP status codes).
# Mirrors the recipe_meta health contract. Recipes missing here fall back to the default
# chosen inside is_warm_up().
_WARM_HEALTH = {
    "keycloak": ("/realms/master", (200,)),
}

# TLS context used by the health probe. Hostname checking and certificate verification are
# both switched off, so the probe only establishes that something answers on the domain.
# NOTE(review): presumably because warm CI domains may not carry publicly trusted certs — confirm.
_CTX = ssl.create_default_context()
_CTX.check_hostname = False  # must be disabled before verify_mode can be set to CERT_NONE
_CTX.verify_mode = ssl.CERT_NONE

# Cold per-run stack/service names look like "<tag>-<6hex>_ci_commoninternet_net_<svc>", where
# <tag> is 1-4 lowercase alphanumerics. Group 1 captures the 6-hex run suffix.
_STACK_HEX_RE = re.compile(r"^[a-z0-9]{1,4}-([0-9a-f]{6})_ci_commoninternet_net_")
|
|
|
|
|
|
def stable_domain(recipe: str) -> str:
    """Return the stable warm domain for `recipe`: `warm-<recipe>.ci.commoninternet.net`.

    This is the canonical naming scheme shared by the live-warm keycloak and the data-warm
    canonicals (WC2). It is deliberately different from the cold per-run `<recipe[:4]>-<6hex>`
    labels, and `WARM_DOMAINS['keycloak']` equals `stable_domain('keycloak')`.
    """
    domain = f"warm-{recipe}.ci.commoninternet.net"
    return domain
|
|
|
|
|
|
def warm_domain(recipe: str) -> str | None:
    """Look up the stable warm domain registered for a dep recipe.

    Returns the `WARM_DOMAINS` entry for `recipe`, or None when the recipe is not served warm.
    """
    registered = WARM_DOMAINS.get(recipe)
    return registered
|
|
|
|
|
|
def is_warm_up(recipe: str, domain: str | None = None, timeout: int = 10) -> bool:
    """Report whether the warm provider for `recipe` answers its health endpoint right now.

    Callers use this to choose between the warm path and the cold co-deploy fallback.

    Args:
        recipe: dep recipe name; selects the health path and accepted status codes.
        domain: domain to probe; defaults to the recipe's registered warm domain.
        timeout: seconds passed to `urlopen` as its timeout.

    Returns:
        True only when an HTTP status in the accepted set was observed. No domain, or any
        failure (down / unreachable / TLS / DNS), yields False — the probe is conservative.
    """
    domain = domain or warm_domain(recipe)
    if not domain:
        return False

    # Recipes without a dedicated probe accept a plain or redirecting root page.
    path, accepted = _WARM_HEALTH.get(recipe, ("/", (200, 301, 302)))
    probe = urllib.request.Request(f"https://{domain}{path}", method="GET")

    try:
        with urllib.request.urlopen(probe, timeout=timeout, context=_CTX) as resp:
            status = resp.status
    except urllib.error.HTTPError as err:
        # urllib raises for error statuses; the code may still be one we accept.
        status = err.code
    except Exception:  # noqa: BLE001 — down / unreachable / TLS / DNS → not usable
        return False
    return status in accepted
|
|
|
|
|
|
def live_app_hexes() -> set[str]:
|
|
"""The set of 6hex suffixes of currently-deployed cold per-run app stacks. Used to reap orphan
|
|
realms safely: a realm whose hex maps to a live stack belongs to an in-flight run and is kept.
|
|
Reads docker service names directly so it works even when an app's .env was already removed."""
|
|
out: set[str] = set()
|
|
try:
|
|
res = subprocess.run(
|
|
["docker", "service", "ls", "--format", "{{.Name}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=30,
|
|
)
|
|
except Exception: # noqa: BLE001
|
|
return out
|
|
for name in res.stdout.splitlines():
|
|
m = _STACK_HEX_RE.match(name.strip())
|
|
if m:
|
|
out.add(m.group(1))
|
|
return out
|
|
|
|
|
|
def reap_orphan_realms(recipe: str, domain: str | None = None) -> list[str]:
|
|
"""Reap per-run realms on the warm provider left behind by crashed/killed dependent runs. Safe
|
|
under concurrency: realms whose hex maps to a currently-live app stack are kept. Returns the
|
|
realms actually deleted; [] on any error (best-effort run-start cleanup, never fatal)."""
|
|
domain = domain or warm_domain(recipe)
|
|
if recipe != "keycloak" or not domain:
|
|
return []
|
|
from . import sso # local import avoids import cycle at module load
|
|
|
|
try:
|
|
return sso.reap_orphaned_realms(domain, live_app_hexes())
|
|
except Exception: # noqa: BLE001 — reaping is hygiene, not correctness-critical
|
|
return []
|
|
|
|
|
|
def realm_for(parent_recipe: str, parent_domain: str) -> str:
    """Build the per-run realm name for a dependent run: "<parent_recipe>-<6hex>".

    The 6hex is taken from the end of the parent's per-run domain label (the part before the
    first dot, e.g. "lasu-0a6fb2"). When the label does not end in "-<6hex>", the whole label
    is used as the suffix instead. Reusing the run's hex keeps concurrent dependents from
    colliding on a shared keycloak and makes a realm traceable back to its app stack for
    reaping/debugging.
    """
    label, _, _ = parent_domain.partition(".")  # "lasu-0a6fb2"
    found = re.search(r"-([0-9a-f]{6})$", label)
    suffix = label if found is None else found.group(1)
    return f"{parent_recipe}-{suffix}"
|