warm_reconcile.py: per-spec setup hook + health_domain; SPECS[traefik] (stateful=False, version-rollback-only, _traefik_setup preserves wildcard-cert/ file-provider config, health on routed dashboard host). keycloak path unchanged. proxy.nix: deploy-proxy.service now execs warm_reconcile.py traefik. ZERO-disruption migration (traefik already at latest 5.1.1+v3.6.15; pre-seeded TYPE+last_good → clean no-op converge; traefik 200 + keycloak-through-traefik 200 + 0 failed). 65 unit pass. Per operator out: code+converge delivered; destructive rollback (brief TLS blip) = Adversary's required cold proof. Closes the W0.10a tracked-open. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
449 lines
18 KiB
Python
449 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""Warm/infra auto-update reconciler (Phase 2w / WC1 + WC1.1 + WC1.2).
|
|
|
|
Invoked by the per-app systemd reconcile unit (nix/modules/warm-keycloak.nix) at every
|
|
activation/boot (and nightly via WC6). For one warm/infra app it converges to the LATEST published
|
|
recipe version, gated TWICE:
|
|
|
|
WC1.2 (pre-deploy SAFETY gate, runs FIRST): only auto-apply non-major (patch/minor) recipe bumps
|
|
with no manual-migration release notes. A MAJOR recipe/app version bump, or a target whose
|
|
releaseNotes flag a manual migration → DO NOT deploy: stay on current + write an alert sentinel
|
|
carrying the notes (operator upgrades manually). No snapshot/deploy/rollback churn on a hold.
|
|
|
|
WC1.1 (post-deploy HEALTH gate, for upgrades we DO apply): record running version = last-good →
|
|
[stateful: undeploy → snapshot data volume] → deploy latest → health-check →
|
|
healthy: commit last-good := latest;
|
|
unhealthy: [stateful: restore snapshot] → redeploy last-good → health-check → ALERT.
|
|
|
|
The reconciler is UNPINNED (keycloak floats to latest like traefik); the nix closure stays
|
|
byte-identical because the recipe is fetched at runtime. Alerts are sentinel JSON files under
|
|
/var/lib/ci-warm/alerts/ that the Builder loop relays via PushNotification (see DECISIONS Phase-2w).
|
|
|
|
Run as root on cc-ci (direct docker/volume access). CLI: `warm_reconcile.py <app>` (app = keycloak | traefik).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
from harness import abra, lifecycle, warmsnap # noqa: E402
|
|
|
|
# --------------------------------------------------------------------------- specs
|
|
|
|
|
|
def _traefik_setup(recipe: str, domain: str, version: str) -> None:
|
|
"""Per-app config for the traefik reverse-proxy reconcile — preserves EXACTLY what the prior
|
|
proxy.nix bash reconcile did (wildcard/file-provider mode serving the pre-issued cert as
|
|
ssl_cert/ssl_key swarm secrets; NO ACME). Uses the proven abra.env_set (newline-safe, unlike the
|
|
bash set_env that bit keycloak)."""
|
|
cert_dir = "/var/lib/ci-certs/live"
|
|
if not (os.path.isfile(f"{cert_dir}/fullchain.pem") and os.path.isfile(f"{cert_dir}/privkey.pem")):
|
|
raise RuntimeError(f"FATAL: wildcard cert missing at {cert_dir} (sops decrypt broken?)")
|
|
if not os.path.isfile(env_file(domain)):
|
|
_run(["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
|
|
timeout=120, check=True)
|
|
abra.env_set(domain, "DOMAIN", domain)
|
|
abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
|
|
abra.env_set(domain, "WILDCARDS_ENABLED", "1")
|
|
abra.env_set(domain, "SECRET_WILDCARD_CERT_VERSION", "v1")
|
|
abra.env_set(domain, "SECRET_WILDCARD_KEY_VERSION", "v1")
|
|
abra.env_set(domain, "COMPOSE_FILE", '"compose.yml:compose.wildcard.yml"')
|
|
stack = lifecycle._stack_name(domain) # noqa: SLF001
|
|
have = set(lifecycle._docker_names("secret", stack)) # noqa: SLF001
|
|
|
|
def _has(name):
|
|
return any(s.endswith(f"_{name}_v1") for s in have)
|
|
|
|
if not _has("ssl_cert"):
|
|
_run(["abra", "app", "secret", "insert", domain, "ssl_cert", "v1",
|
|
f"{cert_dir}/fullchain.pem", "-f", "-n"], timeout=120, check=True)
|
|
if not _has("ssl_key"):
|
|
_run(["abra", "app", "secret", "insert", domain, "ssl_key", "v1",
|
|
f"{cert_dir}/privkey.pem", "-f", "-n"], timeout=120, check=True)
|
|
|
|
|
|
# keycloak: data volume is snapshotted around upgrades (stateful), health probed on its own domain.
_KEYCLOAK_SPEC: dict = {
    "recipe": "keycloak",
    "domain": "warm-keycloak.ci.commoninternet.net",
    "health_path": "/realms/master",
    "health_ok": (200,),
    "stateful": True,
    "deploy_timeout": 900,
    "health_timeout": 900,
}

# traefik = the reverse proxy: STATELESS (version-rollback-only, NO snapshot). Health is probed on a
# ROUTED host (the dashboard) since traefik's own domain has no route. `setup` preserves the
# wildcard cert / file-provider config.
_TRAEFIK_SPEC: dict = {
    "recipe": "traefik",
    "domain": "traefik.ci.commoninternet.net",
    "health_domain": "ci.commoninternet.net",
    "health_path": "/",
    "health_ok": (200,),
    "stateful": False,
    "deploy_timeout": 600,
    "health_timeout": 300,
    "setup": _traefik_setup,
}

# app name (the CLI argument) -> reconcile spec
SPECS: dict[str, dict] = {
    "keycloak": _KEYCLOAK_SPEC,
    "traefik": _TRAEFIK_SPEC,
}

# Directory that receives the alert sentinel JSON files.
ALERTS_DIR = os.path.join(warmsnap.DEFAULT_WARM_ROOT, "alerts")
|
|
|
|
|
|
# --------------------------------------------------------------------------- pure version helpers
|
|
|
|
|
|
# A coop-cloud version tag is "<recipe-semver>+<app-version>" (observed: keycloak 10.7.1+26.6.2 ->
|
|
# image :26.6.2; n8n 3.2.0+2.20.6 -> image :2.20.6). The RECIPE semver is the part BEFORE '+'.
|
|
_VER_RE = re.compile(r"^\d+(\.\d+)*(\+.+)?$")


def is_version_tag(tag: str) -> bool:
    """Report whether `tag` looks like a coop-cloud version tag.

    Surrounding whitespace is ignored; the tag must start with a dotted numeric version and may
    carry a non-empty ``+<app-version>`` suffix.
    """
    candidate = tag.strip()
    return _VER_RE.match(candidate) is not None
|
|
|
|
|
|
def sort_versions(tags) -> list[str]:
    """Return the valid version tags from `tags`, ascending.

    Tags rejected by ``is_version_tag`` are dropped. Ordering compares the numeric tuple of the
    recipe part (before '+') first, then the numeric tuple of the app part (after '+').
    """

    def _rank(tag: str):
        recipe_part, _, app_part = tag.partition("+")
        return _numtuple(recipe_part), _numtuple(app_part)

    valid = [tag for tag in tags if is_version_tag(tag)]
    valid.sort(key=_rank)
    return valid
|
|
|
|
|
|
def _numtuple(s: str) -> tuple:
|
|
out = []
|
|
for part in s.split("."):
|
|
m = re.match(r"^\d+", part)
|
|
out.append(int(m.group()) if m else 0)
|
|
return tuple(out)
|
|
|
|
|
|
def latest_version(tags) -> str | None:
    """Return the highest tag according to ``sort_versions``, or None when no valid tag exists."""
    ordered = sort_versions(tags)
    if not ordered:
        return None
    return ordered[-1]
|
|
|
|
|
|
def _major(semver: str) -> int:
    """Return the first numeric component of `semver`; an empty/falsy value counts as major 0."""
    if not semver:
        return 0
    return _numtuple(semver)[0]
|
|
|
|
|
|
def is_major_bump(current: str, latest: str) -> bool:
    """Report whether going from `current` to `latest` raises a MAJOR version.

    Both tags are split at the first '+' into recipe-semver and app-version; the bump counts as
    major when the major of EITHER half increases. Holding on an app-major bump (e.g. keycloak
    25→26) is deliberately conservative, since that is when manual DB migrations tend to happen.
    A pure patch/minor move on both halves is never reported.
    """
    cur_recipe, _, cur_app = current.partition("+")
    new_recipe, _, new_app = latest.partition("+")
    if _major(new_recipe) > _major(cur_recipe):
        return True
    return _major(new_app) > _major(cur_app)
|
|
|
|
|
|
_MIGRATION_MARKERS = re.compile(
    r"manual migration|manual action|manual step|action required|by hand|manually|breaking change",
    re.IGNORECASE,
)


def notes_flag_manual_migration(text: str) -> bool:
    """Heuristically detect release notes that call for operator intervention.

    Returns True when any marker phrase occurs anywhere in `text` (case-insensitive); empty or
    missing notes never flag. The phrase list is intentionally broad so that doubt leads to a hold.
    """
    if not text:
        return False
    return _MIGRATION_MARKERS.search(text) is not None
|
|
|
|
|
|
# --------------------------------------------------------------------------- integration helpers
|
|
|
|
|
|
def _run(cmd, timeout=120, check=False):
|
|
return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=check)
|
|
|
|
|
|
def _recipe_dir(recipe: str) -> str:
|
|
return os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
|
|
|
|
|
def recipe_tags(recipe: str) -> list[str]:
    """List the git tags of the local recipe clone (whitespace-separated `git tag` output).

    The git exit status is not inspected, so a failing command simply yields whatever (possibly
    nothing) was printed on stdout.
    """
    listing = _run(["git", "-C", _recipe_dir(recipe), "tag"], timeout=30)
    # str.split() with no separator never produces blank tokens, so no extra filtering is needed.
    return listing.stdout.split()
|
|
|
|
|
|
def fetch_recipe(recipe: str) -> None:
    """Refresh the local clone of `recipe` via ``abra recipe fetch`` (best effort).

    CCCI_SKIP_FETCH=1 lets a test/Adversary stage a fake "latest" tag (a simulated major bump /
    manual-migration / broken release) in the local recipe clone without it being clobbered by a
    re-fetch. Never set in production (the systemd unit does not set it).

    A failed fetch does not abort the reconcile: the run continues against the existing local
    clone. The failure is logged so that stale tags are visible rather than silent.
    """
    if os.environ.get("CCCI_SKIP_FETCH") == "1":
        print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True)
        return
    r = _run(["abra", "recipe", "fetch", recipe, "-n"], timeout=300)
    if r.returncode != 0:
        # abra may report errors on either stream, so include both in the log line.
        detail = (r.stderr.strip() + " " + r.stdout.strip()).strip()[:400]
        print(f"[fetch] WARNING: abra recipe fetch {recipe} failed (rc={r.returncode}); "
              f"continuing with the local clone: {detail}", flush=True)
|
|
|
|
|
|
def env_file(domain: str) -> str:
    """Return the path of the abra app .env for `domain` on the `default` server."""
    unexpanded = f"~/.abra/servers/default/{domain}.env"
    return os.path.expanduser(unexpanded)
|
|
|
|
|
|
def current_version(domain: str) -> str | None:
    """Return the version recorded in the app .env, or None when it cannot be determined.

    abra records the deployed version as ``TYPE=<recipe>:<version>`` (updated on each
    `app new`/`app deploy <version>`). The first TYPE line that contains a ':' decides the result:
    everything after the first ':' is the version (None if that part is empty). A missing .env
    file, or no such line, yields None.
    """
    path = env_file(domain)
    if not os.path.isfile(path):
        return None
    with open(path) as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry.startswith("TYPE="):
                continue
            _, _, value = entry.partition("=")
            # drop surrounding whitespace and optional quoting around the value
            value = value.strip().strip('"').strip("'")
            _, colon, version = value.partition(":")
            if colon:
                return version.strip() or None
    return None
|
|
|
|
|
|
def is_deployed(domain: str) -> bool:
    """Report whether the app's stack currently has any docker services."""
    services = lifecycle._docker_names("service", lifecycle._stack_name(domain))  # noqa: SLF001
    if services:
        return True
    return False
|
|
|
|
|
|
def health_code(spec: dict) -> int:
    """Probe the app over HTTPS on localhost and return the HTTP status code (0 on no answer).

    Health is probed on `health_domain` (defaults to the app domain). For traefik the app domain
    (traefik.ci…) has no route of its own — health is a ROUTED host (e.g. the dashboard
    ci.commoninternet.net), so a 200 proves traefik is up + routing + TLS-terminating.
    The request is pinned to 127.0.0.1:443 via curl's --resolve and skips certificate checks (-k).
    """
    host = spec.get("health_domain", spec["domain"])
    pin = f"{host}:443:127.0.0.1"
    url = f"https://{host}{spec['health_path']}"
    curl_cmd = [
        "curl", "-sk", "-o", "/dev/null", "-w", "%{http_code}", "--max-time", "10",
        "--resolve", pin, url,
    ]
    probe = _run(curl_cmd, timeout=20)
    raw = probe.stdout.strip()
    if not raw:
        return 0
    try:
        return int(raw)
    except ValueError:
        return 0
|
|
|
|
|
|
def wait_healthy(spec: dict, timeout: int | None = None) -> bool:
|
|
domain = spec["domain"]
|
|
deadline = time.time() + (timeout or spec["health_timeout"])
|
|
while time.time() < deadline:
|
|
if health_code(spec) in tuple(spec["health_ok"]):
|
|
return True
|
|
time.sleep(10)
|
|
return False
|
|
|
|
|
|
def release_notes(recipe: str, version: str) -> str:
    """Return the release notes for `version`, or "" when none are available.

    Looks in the recipe clone's ``releaseNotes`` directory for ``<version>.md`` and then
    ``<version>``; the first existing file decides the result (its content, or "" if reading it
    fails).
    """
    notes_dir = os.path.join(_recipe_dir(recipe), "releaseNotes")
    candidates = [os.path.join(notes_dir, name) for name in (f"{version}.md", version)]
    found = next((path for path in candidates if os.path.isfile(path)), None)
    if found is None:
        return ""
    try:
        with open(found) as handle:
            return handle.read()
    except OSError:
        return ""
|
|
|
|
|
|
def deploy_version(recipe: str, domain: str, version: str, timeout: int) -> None:
    """Deploy one specific published version of the app.

    The recipe tag is checked out first (so the on-disk tree matches), then a pinned non-chaos
    deploy is run with the version positional (so abra records TYPE=<recipe>:<version>). `-f`
    keeps it idempotent against an already-deployed app.

    Raises:
        RuntimeError: when abra exits non-zero. abra writes FATA to stdout, so the message carries
            both streams (truncated to 400 characters).
    """
    abra.recipe_checkout(recipe, version)
    deploy_cmd = ["abra", "app", "deploy", domain, version, "-o", "-n", "-f"]
    result = _run(deploy_cmd, timeout=timeout)
    if result.returncode == 0:
        return
    combined = " ".join((result.stderr.strip(), result.stdout.strip())).strip()[:400]
    raise RuntimeError(f"deploy {domain} {version} failed: {combined}")
|
|
|
|
|
|
def wait_undeployed(domain: str, timeout: int = 120) -> None:
    """Block until the app's swarm stack has no services left after an undeploy.

    abra's undeploy may return before swarm finishes tearing down tasks; snapshot/restore (which
    require undeployed) and an immediate redeploy of the same stack name otherwise race a
    half-removed stack.

    Raises:
        RuntimeError: if services are still listed once `timeout` seconds have elapsed.
    """
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    # Monotonic clock: the deadline must not move when the wall clock is stepped.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not lifecycle._docker_names("service", stack):  # noqa: SLF001
            return
        time.sleep(2)
    raise RuntimeError(f"{domain} stack not fully undeployed after {timeout}s")
|
|
|
|
|
|
# --------------------------------------------------------------------------- last-good + alerts
|
|
|
|
|
|
def last_good_path(recipe: str) -> str:
    """Return the path of the `last_good` marker file inside the recipe's warm app directory."""
    app_root = warmsnap.app_dir(recipe)
    return os.path.join(app_root, "last_good")
|
|
|
|
|
|
def read_last_good(recipe: str) -> str | None:
    """Return the recorded last-good version, or None if the marker is unreadable or blank."""
    try:
        with open(last_good_path(recipe)) as handle:
            content = handle.read()
    except OSError:
        return None
    recorded = content.strip()
    return recorded if recorded else None
|
|
|
|
|
|
def write_last_good(recipe: str, version: str) -> None:
    """Record `version` as the last-good version for `recipe`.

    The value is written to a ``.tmp`` sibling and then moved over the marker with
    ``os.replace``, so a reader never sees a partially written file.
    """
    os.makedirs(warmsnap.app_dir(recipe), exist_ok=True)
    marker = last_good_path(recipe)
    staging = marker + ".tmp"
    with open(staging, "w") as handle:
        handle.write(version)
    os.replace(staging, marker)
|
|
|
|
|
|
def write_alert(app: str, reason: str, **fields) -> str:
    """Write a sentinel JSON alert into ALERTS_DIR for the Builder loop to relay.

    The record holds `app`, `reason`, a UTC timestamp `ts` and every extra keyword in `fields`
    (a keyword with the same name as a base key overrides it). The file is named
    ``<ts>-<app>-<reason>.json`` and is put in place via a ``.tmp`` file plus ``os.replace``.

    Returns:
        The path of the alert file.
    """
    os.makedirs(ALERTS_DIR, exist_ok=True)
    ts = time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())
    record = {"app": app, "reason": reason, "ts": ts}
    record.update(fields)
    final_path = os.path.join(ALERTS_DIR, f"{ts}-{app}-{reason}.json")
    staging = final_path + ".tmp"
    with open(staging, "w") as handle:
        handle.write(json.dumps(record, indent=2))
    os.replace(staging, final_path)
    print(f"ALERT[{reason}] {app}: {fields}", flush=True)
    return final_path
|
|
|
|
|
|
# --------------------------------------------------------------------------- reconcile
|
|
|
|
|
|
def ensure_server() -> None:
    """Register the local abra server when `abra server ls` does not succeed."""
    listing = _run(["abra", "server", "ls", "-m", "-n"], timeout=30)
    if listing.returncode == 0:
        return
    _run(["abra", "server", "add", "--local", "-n"], timeout=60)
|
|
|
|
|
|
def ensure_app_config(recipe: str, domain: str, version: str) -> None:
    """Make sure the app .env exists and carries the base settings.

    When no .env exists yet, `abra app new` creates it at `version` (raising
    ``subprocess.CalledProcessError`` on failure). DOMAIN is then set to `domain` and
    LETS_ENCRYPT_ENV is blanked, on every call.
    """
    if not os.path.isfile(env_file(domain)):
        new_cmd = ["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"]
        _run(new_cmd, timeout=120, check=True)
    for key, value in (("DOMAIN", domain), ("LETS_ENCRYPT_ENV", "")):
        abra.env_set(domain, key, value)
|
|
|
|
|
|
def ensure_secrets(domain: str) -> None:
    """Generate the app's secrets unless an `_admin_password_v1` secret already exists."""
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    existing = set(lifecycle._docker_names("secret", stack))  # noqa: SLF001
    for secret_name in existing:
        if secret_name.endswith("_admin_password_v1"):
            return
    abra.secret_generate(domain)
|
|
|
|
|
|
def reconcile(app: str) -> str:
    """Converge one warm/infra app to the latest published recipe version.

    Flow:
      1. Ensure the abra server, refresh the recipe and pick the highest version tag.
      2. Ensure app config/secrets (spec-provided `setup`, else the default keycloak-shaped path).
      3. Not deployed → fresh deploy of the recorded version (or latest).
      4. Already on latest → no-op if healthy, else one redeploy, else alert.
      5. Upgrade available → WC1.2 safety gate (major bump / manual-migration notes hold with an
         alert), then the WC1.1 health-gated upgrade with rollback.

    Returns:
        A short result token such as ``noop-healthy:<v>``, ``upgraded:<a>-><b>``,
        ``held-major:<a>-><b>`` or ``rolled-back:<b>-><a>``.

    Raises:
        RuntimeError: no version tags exist, a fresh deploy never becomes healthy, or the
            rollback does not recover. Exceptions raised while rolling back propagate unchanged,
            after the "rollback" alert has been written.
    """
    spec = SPECS[app]
    recipe, domain = spec["recipe"], spec["domain"]
    dt, stateful = spec["deploy_timeout"], spec["stateful"]

    ensure_server()
    fetch_recipe(recipe)
    tags = recipe_tags(recipe)
    latest = latest_version(tags)
    if not latest:
        raise RuntimeError(f"no version tags for {recipe}")
    # Per-app config/secrets: a spec may provide its own `setup` (traefik's cert/file-provider wiring);
    # otherwise the default keycloak-shaped path (app new + DOMAIN/LETS_ENCRYPT + generate secrets).
    setup = spec.get("setup")
    if setup:
        setup(recipe, domain, latest)
    else:
        ensure_app_config(recipe, domain, latest)
        ensure_secrets(domain)

    current = current_version(domain)
    deployed = is_deployed(domain)

    # Fresh deploy (nothing running) — deploy current-pinned (or latest if never deployed).
    if not deployed:
        target = current or latest
        print(f"[{app}] not deployed → fresh deploy {target}", flush=True)
        deploy_version(recipe, domain, target, dt)
        if not wait_healthy(spec):
            raise RuntimeError(f"{app} fresh deploy {target} did not become healthy")
        write_last_good(recipe, target)
        return f"deployed-fresh:{target}"

    # Deployed & already on latest → converge to a no-op (commit last-good if healthy).
    if current == latest:
        if wait_healthy(spec, timeout=60):
            write_last_good(recipe, latest)
            print(f"[{app}] already on latest {latest} and healthy — no-op", flush=True)
            return f"noop-healthy:{latest}"
        # On latest but unhealthy: try a redeploy; if still bad, alert (rollback target unknown/same).
        print(f"[{app}] on latest {latest} but UNHEALTHY → redeploy", flush=True)
        deploy_version(recipe, domain, latest, dt)
        if wait_healthy(spec):
            write_last_good(recipe, latest)
            return f"redeployed-healthy:{latest}"
        write_alert(app, "unhealthy-on-latest", version=latest)
        return f"unhealthy:{latest}"

    # --- An upgrade current→latest is available ---
    # WC1.2 pre-deploy SAFETY gate (runs BEFORE any snapshot/deploy).
    notes = release_notes(recipe, latest)
    if is_major_bump(current or "0", latest):
        write_alert(app, "held-major", current=current, latest=latest, release_notes=notes[:4000])
        return f"held-major:{current}->{latest}"
    if notes_flag_manual_migration(notes):
        write_alert(app, "held-manual-migration", current=current, latest=latest,
                    release_notes=notes[:4000])
        return f"held-manual-migration:{current}->{latest}"

    # WC1.1 health-gated upgrade with rollback.
    last_good = current
    print(f"[{app}] auto-upgrade {last_good} → {latest} (health-gated)", flush=True)
    if stateful:
        abra.undeploy(domain)
        wait_undeployed(domain)
        warmsnap.snapshot(recipe, domain, version=last_good)
        # snapshot requires undeployed; now bring up latest.
    # A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a
    # crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a
    # deploy exception the same as an unhealthy result.
    upgrade_ok = False
    try:
        deploy_version(recipe, domain, latest, dt)
        upgrade_ok = wait_healthy(spec)
    except Exception as e:  # noqa: BLE001 — a broken release must trigger rollback, not crash the unit
        print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True)
    if upgrade_ok:
        write_last_good(recipe, latest)
        print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True)
        return f"upgraded:{last_good}->{latest}"

    # Unhealthy → roll back.
    print(f"[{app}] latest {latest} UNHEALTHY → rolling back to {last_good}", flush=True)
    recovered = False
    try:
        if stateful:
            abra.undeploy(domain)
            wait_undeployed(domain)
            warmsnap.restore(recipe, domain)
        deploy_version(recipe, domain, last_good, dt)
        recovered = wait_healthy(spec)
    finally:
        # The alert must be written even when the rollback itself raises (undeploy/restore/deploy
        # failure): that is the worst outcome and would otherwise leave no sentinel behind.
        write_alert(app, "rollback", last_good=last_good, attempted=latest, recovered=recovered,
                    release_notes=notes[:2000])
    if not recovered:
        raise RuntimeError(f"{app} rollback to {last_good} did not become healthy")
    return f"rolled-back:{latest}->{last_good}"
|
|
|
|
|
|
def main(argv) -> int:
    """CLI entry point: `warm_reconcile.py <app>`.

    Returns 2 (after printing usage to stderr) unless exactly one argument naming a known spec is
    given; otherwise runs the reconcile, prints its result and returns 0.
    """
    app = argv[1] if len(argv) == 2 else None
    if app not in SPECS:
        print(f"usage: warm_reconcile.py <{'|'.join(SPECS)}>", file=sys.stderr)
        return 2
    outcome = reconcile(app)
    print(f"RECONCILE RESULT: {outcome}", flush=True)
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
|