Mechanical, semantics-preserving cleanup so the codebase passes the new lint stage:
- ruff format: all 32 Python files (wraps long signatures, normalizes quotes/blank lines).
- nixpkgs-fmt: modules/drone-runner.nix.
- shfmt (-i 2 -ci): scripts/*.sh.
Lint fixes (reviewed, behavior-preserving — no test weakened):
- ruff SIM105: try/except-pass -> contextlib.suppress (abra.py app_config rm; lifecycle.py janitor).
- ruff SIM115: open().read() -> with open() (run_recipe_ci.py redaction-values + gitea-token).
- statix: merge repeated sops `secrets.*` keys into one `secrets = { ... }` (comments kept);
empty fn pattern `{ ... }:` -> `_:` (packages.nix).
- deadnix: drop unused lambda args (flake `self`; configuration.nix `lib`; overlay `final` -> `_`).
Verified on cc-ci: `scripts/lint.sh` -> lint: PASS; nixosConfigurations.cc-ci evaluates;
all Python byte-compiles. The reformat changes the source hash of the deployed
bridge/dashboard/runner, so cc-ci will be rebuilt to the new closure in W2 before the cold
D1-D10 re-verification.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
291 lines
11 KiB
Python
291 lines
11 KiB
Python
"""App lifecycle for the CI harness: deploy, wait-healthy, teardown, janitor (plan §4.3).
|
|
|
|
The teardown guarantee is sacred: a failed test must never leak an app/volume/secret into the
|
|
next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import contextlib
|
|
import datetime
|
|
import os
|
|
import re
|
|
import ssl
|
|
import subprocess
|
|
import time
|
|
import urllib.request
|
|
|
|
from . import abra
|
|
|
|
# Address of the gateway that the *.ci.commoninternet.net wildcard points at.
GATEWAY_IP = "143.244.213.108"  # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)

# A run app domain is "<recipe[:4]>-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the
# janitor to recognise orphaned run apps (infra apps like traefik/drone/backups don't match).
# The pattern accepts 1-4 lowercase alphanumerics, a hyphen, then exactly 6 lowercase hex digits,
# and is anchored at both ends so only the full domain matches.
RUN_APP_RE = re.compile(r"^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$")
|
|
|
|
|
|
class TeardownError(RuntimeError):
    """Raised by teardown_app when its final verification still finds stack resources."""
|
|
|
|
|
|
def _docker_names(kind: str, stack: str) -> list[str]:
    """Names printed by `docker <kind> ls` under a `name=<stack>` filter.

    `kind` is handed to docker as-is (service|volume|secret). Blank output lines are dropped.
    The command's exit status is not inspected, so a failing docker call yields whatever names
    (possibly none) were written to stdout.
    """
    command = ["docker", kind, "ls", "--filter", f"name={stack}", "--format", "{{.Name}}"]
    listing = subprocess.run(command, capture_output=True, text=True)
    return [line for line in listing.stdout.split("\n") if line.strip()]
|
|
|
|
|
|
def _residual(domain: str) -> dict:
    """Map "services"/"volumes"/"secrets" to the names docker still lists for the app's stack."""
    stack = _stack_name(domain)
    # Queried in the fixed order service, volume, secret; keys are the pluralised kinds.
    return {f"{kind}s": _docker_names(kind, stack) for kind in ("service", "volume", "secret")}
|
|
|
|
|
|
def _stack_age_seconds(stack: str) -> float | None:
    """Age in seconds of the stack's oldest service (the largest age among its services).

    Returns None when the stack has no services, and also when none of the services'
    creation timestamps could be parsed.
    """
    services = _docker_names("service", stack)
    if not services:
        return None
    ages = []
    for service in services:
        inspected = subprocess.run(
            ["docker", "service", "inspect", service, "--format", "{{.CreatedAt}}"],
            capture_output=True,
            text=True,
        )
        created = inspected.stdout.strip()
        try:
            # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> only the leading 19 chars
            # (date + whole seconds) are parsed; the result is then treated as UTC.
            started = datetime.datetime.strptime(created[:19], "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # unparseable (or empty) timestamp: this service does not contribute an age
            continue
        started = started.replace(tzinfo=datetime.UTC)
        ages.append((datetime.datetime.now(datetime.UTC) - started).total_seconds())
    return max(ages) if ages else None
|
|
|
|
|
|
def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]:
|
|
"""Per-recipe extra .env keys, applied at every deploy (install + upgrade's old_app) so a recipe
|
|
with multi-domain / config needs is enrolled with NO shared-harness change (D5/M6.5). A recipe
|
|
declares `EXTRA_ENV` in tests/<recipe>/recipe_meta.py as either a dict or a callable
|
|
`EXTRA_ENV(domain) -> dict` (callable form lets it derive values from the per-run domain, e.g.
|
|
cryptpad's SANDBOX_DOMAIN). Returns {} if none."""
|
|
path = os.path.join(os.path.dirname(__file__), "..", "..", "tests", recipe, "recipe_meta.py")
|
|
if not os.path.exists(path):
|
|
return {}
|
|
ns: dict = {}
|
|
with open(path) as fh:
|
|
exec(compile(fh.read(), path, "exec"), ns) # noqa: S102 (trusted, in-repo)
|
|
ee = ns.get("EXTRA_ENV")
|
|
if callable(ee):
|
|
ee = ee(domain)
|
|
return {str(k): str(v) for k, v in (ee or {}).items()}
|
|
|
|
|
|
def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
    """Create, configure and deploy an app through abra.

    LETS_ENCRYPT_ENV is forced to '' so traefik serves the wildcard cert via the file provider
    and NEVER attempts ACME (adversary finding A1). Any per-recipe EXTRA_ENV (recipe_meta.py)
    is written to the .env afterwards, before the deploy. Secrets are generated only when
    `secrets` is true.
    """
    # clear any stale .env from a prior crashed run
    abra.app_config_remove(domain)
    abra.app_new(recipe, domain, version=version, secrets=secrets)

    abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
    extra_env = _recipe_extra_env(recipe, domain)
    for key in extra_env:
        abra.env_set(domain, key, extra_env[key])

    if secrets:
        abra.secret_generate(domain)
    abra.deploy(domain)
|
|
|
|
|
|
def _stack_name(domain: str) -> str:
|
|
# abra derives the swarm stack name from the domain by replacing dots with underscores
|
|
# and KEEPING hyphens (e.g. custom-html-x.ci.commoninternet.net -> custom-html-x_ci_...).
|
|
return domain.replace(".", "_")
|
|
|
|
|
|
def services_converged(domain: str) -> bool:
    """True when every service in the stack reports replicas N/N (N>0).

    Reads the Replicas column of `docker stack services <stack>`. Returns False when docker
    lists no services for the stack (including when the command prints nothing), or when any
    service is below its desired count or has a desired count of 0.
    """
    stack = _stack_name(domain)
    proc = subprocess.run(
        ["docker", "stack", "services", stack, "--format", "{{.Replicas}}"],
        capture_output=True,
        text=True,
    )
    rows = [r for r in proc.stdout.split("\n") if r.strip()]
    if not rows:
        return False
    for row in rows:
        # Compare only the leading "running/desired" token. docker can append an annotation
        # to this column (e.g. "1/1 (max 1 per node)"); comparing the raw remainder would make
        # such a service look permanently unconverged. Plain "N/N" rows are unaffected.
        counts = row.split()[0]
        cur, _, want = counts.partition("/")
        if not want or cur != want or want == "0":
            return False
    return True
|
|
|
|
|
|
def http_get(domain: str, path: str = "/", timeout: int = 15) -> int:
    """HTTPS GET `https://<domain><path>` and return the HTTP status code.

    The app is addressed by its real hostname: on cc-ci the *.ci.commoninternet.net wildcard
    resolves (public DNS) to the gateway, which SNI-passthroughs to cc-ci's traefik, so using
    the real URL keeps SNI correct (connecting to the bare IP would drop SNI and fail to route).

    Certificate and hostname verification are disabled. An HTTP error response yields its
    status code; any other failure (DNS, connect, TLS, timeout, ...) yields 0.
    """
    tls = ssl.create_default_context()
    tls.check_hostname = False
    tls.verify_mode = ssl.CERT_NONE
    request = urllib.request.Request(f"https://{domain}{path}", method="GET")
    try:
        with urllib.request.urlopen(request, timeout=timeout, context=tls) as response:
            status = response.status
    except urllib.error.HTTPError as err:
        status = err.code
    except Exception:
        # deliberate best-effort: callers poll and treat 0 as "not answering yet"
        status = 0
    return status
|
|
|
|
|
|
def wait_healthy(
    domain: str,
    ok_codes=(200, 301, 302),
    path: str = "/",
    deploy_timeout: int = 600,
    http_timeout: int = 300,
) -> None:
    """Block until the app is up: first services converged, then an ok answer over HTTPS.

    Phase 1 polls services_converged() every 5s for up to `deploy_timeout` seconds.
    Phase 2 polls http_get(domain, path) every 5s for up to `http_timeout` seconds until the
    status is one of `ok_codes`. `path` is per-recipe (recipe_meta.HEALTH_PATH), e.g. keycloak
    uses /realms/master.

    Raises TimeoutError if either phase runs out of time.
    """
    converge_by = time.time() + deploy_timeout
    converged = False
    while time.time() < converge_by:
        if services_converged(domain):
            converged = True
            break
        time.sleep(5)
    if not converged:
        raise TimeoutError(f"{domain}: services did not converge in {deploy_timeout}s")

    answer_by = time.time() + http_timeout
    last = 0  # most recent HTTP status seen; reported in the timeout message
    while time.time() < answer_by:
        last = http_get(domain, path)
        if last in ok_codes:
            return
        time.sleep(5)
    raise TimeoutError(f"{domain}: not healthy over HTTPS {path} (last status {last})")
|
|
|
|
|
|
def upgrade_app(domain: str, version: str | None = None) -> None:
    """Upgrade the app via abra.upgrade, forwarding `version` unchanged."""
    abra.upgrade(domain, version=version)
|
|
|
|
|
|
def backup_app(domain: str) -> None:
    """Create a backup of the app via abra.backup_create."""
    abra.backup_create(domain)
|
|
|
|
|
|
def restore_app(domain: str) -> None:
    """Restore the app via abra.restore."""
    abra.restore(domain)
|
|
|
|
|
|
def previous_version(recipe: str) -> str | None:
    """The second-newest published version (to deploy before upgrading to latest).

    Takes the second-to-last entry of abra.recipe_versions(recipe); returns None when fewer
    than two versions are listed.
    """
    versions = abra.recipe_versions(recipe)
    if len(versions) < 2:
        return None
    return versions[-2]
|
|
|
|
|
|
def _app_container(domain: str, service: str = "app") -> str:
    """Id of a running container for <stack>_<service>.

    Uses `docker ps` with a name filter and takes the first id printed.
    Raises RuntimeError when docker lists no such container.
    """
    name = f"{_stack_name(domain)}_{service}"
    listing = subprocess.run(
        ["docker", "ps", "--filter", f"name={name}", "--format", "{{.ID}}"],
        capture_output=True,
        text=True,
    )
    # first line of the (stripped) output is the container id we use
    first_id = listing.stdout.strip().partition("\n")[0]
    if first_id:
        return first_id
    raise RuntimeError(f"no running container for {name}")
|
|
|
|
|
|
def exec_in_app(domain: str, cmd: list[str], service: str = "app") -> str:
    """Run `cmd` with `docker exec` inside the service's container and return its stdout.

    stderr and the exit status are captured but not inspected, so a failing command simply
    returns whatever it wrote to stdout.
    """
    container = _app_container(domain, service)
    argv = ["docker", "exec", container]
    argv.extend(cmd)
    return subprocess.run(argv, capture_output=True, text=True).stdout
|
|
|
|
|
|
def http_body(domain: str, path: str = "/", timeout: int = 15) -> str:
    """HTTPS GET `https://<domain><path>` and return the response body as text.

    Certificate and hostname verification are disabled. Undecodable bytes are replaced rather
    than raising. Unlike http_get, errors from the request are NOT swallowed: they propagate
    to the caller.
    """
    tls = ssl.create_default_context()
    tls.check_hostname = False
    tls.verify_mode = ssl.CERT_NONE
    request = urllib.request.Request(f"https://{domain}{path}", method="GET")
    with urllib.request.urlopen(request, timeout=timeout, context=tls) as response:
        raw = response.read()
    return raw.decode(errors="replace")
|
|
|
|
|
|
def _force_stack_rm(stack: str, timeout: int = 120) -> None:
    """`docker stack rm` the stack (no .env needed), then wait for its services to disappear.

    Polls every 2s for at most `timeout` seconds. Returns silently either way; it does not
    raise if services are still listed when the time runs out.
    """
    subprocess.run(["docker", "stack", "rm", stack], capture_output=True, text=True)
    give_up_at = time.time() + timeout
    while True:
        if time.time() >= give_up_at:
            break
        if not _docker_names("service", stack):
            break
        time.sleep(2)
|
|
|
|
|
|
def teardown_app(domain: str, verify: bool = True) -> None:
    """Tear the app down completely, with docker fallbacks, then VERIFY nothing is left.

    Order matters (A3):
      1. undeploy via abra; if docker still lists services for the stack, `docker stack rm`
         it (which needs no .env);
      2. remove volumes/secrets via abra *while the .env still exists* (abra needs it);
      3. sweep any volumes/secrets docker still lists for the stack name;
      4. drop the .env LAST.

    With `verify` true, raises TeardownError if docker still lists any service, volume or
    secret for the stack afterwards.
    """
    stack = _stack_name(domain)

    # 1. stop the services
    abra.undeploy(domain)
    if _docker_names("service", stack):
        # fallback: abra undeploy didn't clear it
        _force_stack_rm(stack)

    # 2. abra-level cleanup: needs the .env, so it happens before the .env is removed
    abra.volume_remove(domain)
    abra.secret_remove_all(domain)

    # 3. belt-and-suspenders: drop any volumes/secrets abra missed, by stack name. A volume can
    #    be briefly held by a just-stopped task after `stack rm`, so retry for up to 60s.
    give_up_at = time.time() + 60
    while time.time() < give_up_at:
        leftover = _docker_names("volume", stack)
        if not leftover:
            break
        for volume in leftover:
            subprocess.run(["docker", "volume", "rm", volume], capture_output=True, text=True)
        if not _docker_names("volume", stack):
            break
        time.sleep(3)
    for secret in _docker_names("secret", stack):
        subprocess.run(["docker", "secret", "rm", secret], capture_output=True, text=True)

    # 4. only now drop the .env
    abra.app_config_remove(domain)

    if not verify:
        return
    residual = _residual(domain)
    if any(residual.values()):
        raise TeardownError(f"teardown left residual for {domain}: {residual}")
|
|
|
|
|
|
def janitor(max_age_seconds: int | None = None) -> None:
    """Reap orphaned run apps from crashed/rebooted runs.

    Candidates are collected from two places, both matched against the real naming scheme:
    apps listed by abra whose name matches RUN_APP_RE, and docker services whose name looks
    like a run-app stack (this catches stacks whose .env is already gone, A2/A3).

    A candidate whose stack is younger than `max_age_seconds` is skipped, so concurrent
    in-flight runs are never killed. A candidate with no measurable stack age (e.g. a leftover
    .env with no running services) is reaped regardless of age. Reaping goes through
    teardown_app(verify=False) and is best-effort: a failure on one app does not stop the rest.

    `max_age_seconds` defaults to 2h, env-overridable via CCCI_JANITOR_MAX_AGE (e.g. 0 to reap
    all matching orphans immediately). A non-integer value in that variable raises ValueError.
    """
    if max_age_seconds is None:
        max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200"))

    seen = set()
    for app in abra.app_ls():
        name = app.get("appName") or app.get("domain") or ""
        if RUN_APP_RE.match(name):
            seen.add(name)

    # also catch stacks whose .env was already deleted (abra ls won't list them)
    # svc like cust-c95a69_ci_commoninternet_net_app -> reconstruct domain
    run_service_re = re.compile(r"^([a-z0-9]{1,4}-[0-9a-f]{6})_ci_commoninternet_net_")
    for svc in _docker_names("service", ""):
        m = run_service_re.match(svc)
        if m:
            seen.add(f"{m.group(1)}.ci.commoninternet.net")

    for name in seen:
        stack = _stack_name(name)
        age = _stack_age_seconds(stack)
        if age is not None and age < max_age_seconds:
            continue  # likely a concurrent in-flight run; leave it
        # best-effort by design: one failing teardown must not abort the sweep
        with contextlib.suppress(Exception):
            teardown_app(name, verify=False)
|