"""App lifecycle for the CI harness: deploy, wait-healthy, teardown, janitor (plan §4.3). The teardown guarantee is sacred: a failed test must never leak an app/volume/secret into the next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer). """ from __future__ import annotations import datetime import os import re import ssl import subprocess import time import urllib.request from . import abra GATEWAY_IP = "143.244.213.108" # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci) # A run app domain is "-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the # janitor to recognise orphaned run apps (infra apps like traefik/drone/backups don't match). RUN_APP_RE = re.compile(r"^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$") class TeardownError(RuntimeError): pass def _docker_names(kind: str, stack: str) -> list[str]: """docker ls names filtered to a stack (kind: service|volume|secret).""" proc = subprocess.run( ["docker", kind, "ls", "--filter", f"name={stack}", "--format", "{{.Name}}"], capture_output=True, text=True, ) return [n for n in proc.stdout.split("\n") if n.strip()] def _residual(domain: str) -> dict: stack = _stack_name(domain) return { "services": _docker_names("service", stack), "volumes": _docker_names("volume", stack), "secrets": _docker_names("secret", stack), } def _stack_age_seconds(stack: str) -> float | None: """Age of the stack's oldest service, or None if not present.""" svcs = _docker_names("service", stack) if not svcs: return None oldest = None for s in svcs: p = subprocess.run(["docker", "service", "inspect", s, "--format", "{{.CreatedAt}}"], capture_output=True, text=True) ts = p.stdout.strip() try: # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> take the leading 19 chars dt = datetime.datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace( tzinfo=datetime.timezone.utc) except ValueError: continue age = (datetime.datetime.now(datetime.timezone.utc) - dt).total_seconds() oldest = age if oldest is None else max(oldest, age) return oldest def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]: """Per-recipe extra .env keys, applied at every deploy (install + upgrade's old_app) so a recipe with multi-domain / config needs is enrolled with NO shared-harness change (D5/M6.5). A recipe declares `EXTRA_ENV` in tests//recipe_meta.py as either a dict or a callable `EXTRA_ENV(domain) -> dict` (callable form lets it derive values from the per-run domain, e.g. cryptpad's SANDBOX_DOMAIN). Returns {} if none.""" path = os.path.join(os.path.dirname(__file__), "..", "..", "tests", recipe, "recipe_meta.py") if not os.path.exists(path): return {} ns: dict = {} with open(path) as fh: exec(compile(fh.read(), path, "exec"), ns) # noqa: S102 (trusted, in-repo) ee = ns.get("EXTRA_ENV") if callable(ee): ee = ee(domain) return {str(k): str(v) for k, v in (ee or {}).items()} def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None: """Create + configure + deploy an app. Forces LETS_ENCRYPT_ENV='' so traefik serves the wildcard cert via the file provider and NEVER attempts ACME (adversary finding A1). Applies any per-recipe EXTRA_ENV (recipe_meta.py) before deploy.""" abra.app_config_remove(domain) # clear any stale .env from a prior crashed run abra.app_new(recipe, domain, version=version, secrets=secrets) abra.env_set(domain, "LETS_ENCRYPT_ENV", "") for k, v in _recipe_extra_env(recipe, domain).items(): abra.env_set(domain, k, v) if secrets: abra.secret_generate(domain) abra.deploy(domain) def _stack_name(domain: str) -> str: # abra derives the swarm stack name from the domain by replacing dots with underscores # and KEEPING hyphens (e.g. custom-html-x.ci.commoninternet.net -> custom-html-x_ci_...). return domain.replace(".", "_") def services_converged(domain: str) -> bool: """True when every service in the stack reports replicas N/N (N>0).""" stack = _stack_name(domain) proc = subprocess.run( ["docker", "stack", "services", stack, "--format", "{{.Replicas}}"], capture_output=True, text=True, ) rows = [r for r in proc.stdout.split("\n") if r.strip()] if not rows: return False for r in rows: cur, _, want = r.partition("/") if not want or cur != want or want == "0": return False return True def http_get(domain: str, path: str = "/", timeout: int = 15) -> int: """HTTPS GET the app by its real hostname. On cc-ci the *.ci.commoninternet.net wildcard resolves (public DNS) to the gateway, which SNI-passthroughs to cc-ci's traefik — so using the real URL keeps SNI correct (connecting to the bare IP would drop SNI and fail to route).""" ctx = ssl.create_default_context() ctx.check_hostname = False ctx.verify_mode = ssl.CERT_NONE req = urllib.request.Request(f"https://{domain}{path}", method="GET") try: with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp: return resp.status except urllib.error.HTTPError as e: return e.code except Exception: return 0 def wait_healthy(domain: str, ok_codes=(200, 301, 302), path: str = "/", deploy_timeout: int = 600, http_timeout: int = 300) -> None: """Wait for stack services converged, then for the app to answer ok over HTTPS at `path`. `path` is per-recipe (recipe_meta.HEALTH_PATH), e.g. keycloak uses /realms/master.""" deadline = time.time() + deploy_timeout while time.time() < deadline: if services_converged(domain): break time.sleep(5) else: raise TimeoutError(f"{domain}: services did not converge in {deploy_timeout}s") deadline = time.time() + http_timeout last = 0 while time.time() < deadline: last = http_get(domain, path) if last in ok_codes: return time.sleep(5) raise TimeoutError(f"{domain}: not healthy over HTTPS {path} (last status {last})") def upgrade_app(domain: str, version: str | None = None) -> None: abra.upgrade(domain, version=version) def backup_app(domain: str) -> None: abra.backup_create(domain) def restore_app(domain: str) -> None: abra.restore(domain) def previous_version(recipe: str) -> str | None: """The second-newest published version (to deploy before upgrading to latest).""" vers = abra.recipe_versions(recipe) return vers[-2] if len(vers) >= 2 else None def _app_container(domain: str, service: str = "app") -> str: """The running container id for _.""" name = f"{_stack_name(domain)}_{service}" proc = subprocess.run( ["docker", "ps", "--filter", f"name={name}", "--format", "{{.ID}}"], capture_output=True, text=True, ) cid = proc.stdout.strip().split("\n")[0] if not cid: raise RuntimeError(f"no running container for {name}") return cid def exec_in_app(domain: str, cmd: list[str], service: str = "app") -> str: cid = _app_container(domain, service) proc = subprocess.run(["docker", "exec", cid, *cmd], capture_output=True, text=True) return proc.stdout def http_body(domain: str, path: str = "/", timeout: int = 15) -> str: ctx = ssl.create_default_context() ctx.check_hostname = False ctx.verify_mode = ssl.CERT_NONE req = urllib.request.Request(f"https://{domain}{path}", method="GET") with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp: return resp.read().decode(errors="replace") def _force_stack_rm(stack: str, timeout: int = 120) -> None: """Remove a stack's services directly (no .env needed) and wait for them to disappear.""" subprocess.run(["docker", "stack", "rm", stack], capture_output=True, text=True) deadline = time.time() + timeout while time.time() < deadline and _docker_names("service", stack): time.sleep(2) def teardown_app(domain: str, verify: bool = True) -> None: """Full teardown with a docker fallback, then VERIFY nothing is left (raise otherwise). Order matters (A3): undeploy, then remove volumes/secrets *while the .env still exists* (abra needs it), then drop the .env LAST — and only after the stack is confirmed gone. If abra undeploy fails, fall back to `docker stack rm` (which needs no .env).""" stack = _stack_name(domain) abra.undeploy(domain) if _docker_names("service", stack): _force_stack_rm(stack) # fallback: abra undeploy didn't clear it abra.volume_remove(domain) # needs the .env -> before removing it abra.secret_remove_all(domain) # belt-and-suspenders: drop any volumes/secrets abra missed, by stack name. A volume can be # briefly held by a just-stopped task after `stack rm`, so retry the volume removal. deadline = time.time() + 60 while time.time() < deadline: vols = _docker_names("volume", stack) if not vols: break for v in vols: subprocess.run(["docker", "volume", "rm", v], capture_output=True, text=True) if not _docker_names("volume", stack): break time.sleep(3) for s in _docker_names("secret", stack): subprocess.run(["docker", "secret", "rm", s], capture_output=True, text=True) abra.app_config_remove(domain) # only now (stack gone) drop the .env if verify: residual = _residual(domain) if any(residual.values()): raise TeardownError(f"teardown left residual for {domain}: {residual}") def janitor(max_age_seconds: int | None = None) -> None: """Reap orphaned run apps from crashed/rebooted runs. Matches the real naming scheme and only reaps apps older than max_age_seconds (so concurrent in-flight runs are never killed). Reaps via docker primitives so it works even when the .env is gone (A2/A3). Default 2h, env-overridable via CCCI_JANITOR_MAX_AGE (e.g. 0 to reap all matching orphans immediately).""" import os if max_age_seconds is None: max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200")) seen = set() for app in abra.app_ls(): name = app.get("appName") or app.get("domain") or "" if RUN_APP_RE.match(name): seen.add(name) # also catch stacks whose .env was already deleted (abra ls won't list them) for svc in _docker_names("service", ""): # svc like cust-c95a69_ci_commoninternet_net_app -> reconstruct domain m = re.match(r"^([a-z0-9]{1,4}-[0-9a-f]{6})_ci_commoninternet_net_", svc) if m: seen.add(f"{m.group(1)}.ci.commoninternet.net") for name in seen: stack = _stack_name(name) age = _stack_age_seconds(stack) if age is not None and age < max_age_seconds: continue # likely a concurrent in-flight run; leave it try: teardown_app(name, verify=False) except Exception: pass