"""Generic, recipe-agnostic lifecycle assertions + op helpers (Phase 1d, plan §2.1). These are THE default for each lifecycle op: when a recipe ships no `test_.py` overlay, the generic tier (tests/_generic/test_.py) runs these against the single shared deployment the orchestrator brought up. The lifecycle OPERATIONS (upgrade/backup/restore) live here too — owned by the shared harness, not copy-pasted per recipe (DG7 DRY) — so overlays are assertions-only and may reuse these by composition (`from harness import generic; generic.assert_serving(...)`). Design + precedence: machine-docs/DECISIONS.md (Phase 1d). """ from __future__ import annotations import glob import os import re import socket import ssl import time from . import abra, lifecycle # A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label. _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE) def _recipe_dir(recipe: str) -> str: return os.path.expanduser(f"~/.abra/recipes/{recipe}") def backup_capable(recipe: str, meta: dict | None = None) -> bool: """Whether the harness should run the backup/restore tiers (else they are a clean N/A skip, DG3). `recipe_meta.BACKUP_CAPABLE` (bool) overrides; otherwise auto-detect by scanning the recipe's compose*.yml for a truthy `backupbot.backup` label (the Co-op Cloud backup convention).""" if meta and "BACKUP_CAPABLE" in meta: return bool(meta["BACKUP_CAPABLE"]) for path in glob.glob(os.path.join(_recipe_dir(recipe), "compose*.yml")): try: with open(path) as fh: if _BACKUPBOT_RE.search(fh.read()): return True except OSError: continue return False def served_cert(domain: str, port: int = 443) -> tuple[bool, str]: """CA-verified TLS handshake to `domain` (via the gateway passthrough to cc-ci's Traefik). Returns (verified, detail) with CN+SAN on success, or the failure reason. Scope (per Adversary finding F1d-1): this is an INFRA TLS sanity check — it proves the served wildcard cert is publicly trusted, unexpired, and hostname-valid (so it would fail if the operator's LE wildcard lapsed/was mis-rotated — a real concern, plan §4.0 renewal). It does NOT distinguish a routed app from an un-routed host: Traefik's file provider serves the wildcard for the WHOLE `*.ci.commoninternet.net` zone, so any in-zone subdomain verifies whether or not an app is deployed. The app-vs-Traefik-fallback proof is `services_converged` + a non-404 status in `assert_serving`, not this.""" ctx = ssl.create_default_context() # verifies chain against system CAs + checks hostname try: with ( socket.create_connection((domain, port), timeout=20) as sock, ctx.wrap_socket(sock, server_hostname=domain) as ssock, ): cert = ssock.getpeercert() except ssl.SSLCertVerificationError as e: return (False, f"cert did not verify (Traefik default/self-signed?): {e}") except (OSError, ssl.SSLError) as e: return (False, f"TLS handshake error: {e}") cn = next( (v for rdn in cert.get("subject", ()) for k, v in rdn if k == "commonName"), "", ) sans = [v for typ, v in cert.get("subjectAltName", ()) if typ == "DNS"] return (True, f"CN={cn} SAN={sans}") def assert_serving(domain: str, meta: dict) -> None: """The single generic "is the app really serving?" assertion (DG1). The app-vs-Traefik-fallback proof is steps 1+2 (both load-bearing, verified by the Adversary): 1. every service in the stack converged (the app's OWN containers are N/N — an un-routed host has no app service, so this is False for a non-deployment); 2. a real HTTP(S) response with a status in HEALTH_OK — which EXCLUDES 404, so a Traefik unmatched-router fallback (404) fails, as does a routed-but-dead backend (502/503); 3. the body (from the SAME request as the status — no race) is not Traefik's default 404 page; 4. an INFRA TLS sanity check (served_cert): the served wildcard cert is trusted+unexpired. This does NOT distinguish the app from an un-routed host (Traefik serves the wildcard zone-wide, F1d-1) — it only catches a lapsed/mis-rotated cert. Steps 1–2 are BOUNDED POLLS (no bare sleep), so a state-mutating op (upgrade/restore) that leaves the app briefly reconverging settles, while a persistent failure still fails within the timeout.""" deadline = time.time() + meta["DEPLOY_TIMEOUT"] while time.time() < deadline and not lifecycle.services_converged(domain): time.sleep(5) assert lifecycle.services_converged(domain), f"{domain}: services did not converge" path = meta["HEALTH_PATH"] ok = tuple(meta["HEALTH_OK"]) deadline = time.time() + meta["HTTP_TIMEOUT"] served = False status, body = 0, "" while time.time() < deadline: status, body = lifecycle.http_fetch(domain, path) if status in ok and not (status == 200 and "404 page not found" in body): served = True break time.sleep(5) assert served, ( f"{domain}{path}: not serving — last HTTP {status} (Traefik 404 fallback, " "unhealthy backend, or default-404 body)" ) # Infra TLS sanity only (F1d-1): catches a lapsed/mis-rotated wildcard cert; does NOT prove the # app is routed (Traefik serves the wildcard zone-wide). The serving proof is steps 1–2 above. verified, detail = served_cert(domain) assert verified, f"{domain}: served wildcard cert is not trusted/valid — {detail}" assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}" # ---- Op/assertion split (Phase 1e HC3) ------------------------------------------------------- # The orchestrator performs each mutating op ONCE (the harness owns the op), records what an # assertion needs (pre-upgrade identity, backup snapshot_id) into a run-scoped JSON state file at # $CCCI_OP_STATE_FILE, then runs the generic assertion file (unless opted out) AND the overlay # assertion file against the shared post-op state. The assertion functions below read that state via # `op_state()`. They NEVER perform the op — that keeps the op single + lets generic+overlay coexist. import json as _json # noqa: E402 def op_state() -> dict: """The run-scoped op state the orchestrator wrote between op and assertions (or {} if unset). Carries e.g. {"upgrade": {"before": {...}}, "backup": {"snapshot_id": "..."}}.""" path = os.environ.get("CCCI_OP_STATE_FILE") if not path or not os.path.exists(path): return {} try: with open(path) as f: return _json.load(f) except (OSError, ValueError): return {} def assert_upgraded(domain: str, meta: dict) -> None: """Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once via `abra app deploy --chaos` of the PR-head checkout. Assert it reconverged + still serves AND that the deployment is genuinely the PR-head code under test (HC1) — non-vacuously (guarding F1d-2). The chaos deploy stamps `coop-cloud..chaos-version` = the deployed recipe commit. When the intended PR-head commit is known (head_ref), require the deployed chaos commit to MATCH it — THE proof the code under test was deployed, and non-vacuous: a stale prev-checkout chaos redeploy would stamp prev's commit, not head_ref, and fail here. When head_ref is unknown, fall back to requiring a move vs the pre-upgrade state (version/image/chaos changed).""" st = op_state().get("upgrade", {}) before = st.get("before") or {} head_ref = st.get("head_ref") assert_serving(domain, meta) after = lifecycle.deployed_identity(domain) chaos = after.get("chaos") if head_ref: assert chaos, ( f"{domain}: upgrade left no chaos label — `abra app deploy --chaos` did not deploy the " "PR-head checkout (the code under test was not exercised by the upgrade)" ) # chaos-version is an abbreviated commit (e.g. '8a026066'); head_ref may be full or short. # abra appends a working-tree-state marker (e.g. '+U' = untracked file present) to the # chaos-version when a cc-ci DEPLOY OVERLAY sits in the recipe checkout as an untracked file # (e.g. ghost's compose.ccci-health.yml, provided by install_steps). That marker is NOT part # of the commit identity — strip it before the HC1 commit match. HC1 is preserved: the # underlying COMMIT must still equal head_ref; a stale prev-checkout chaos redeploy would # stamp prev's commit (also '+U' if overlaid) and STILL not match head_ref after stripping. chaos_commit = chaos.split("+", 1)[0] assert head_ref.startswith(chaos_commit) or chaos_commit.startswith(head_ref), ( f"{domain}: upgrade deployed chaos commit {chaos!r}, not the intended PR-head " f"{head_ref[:12]!r} — the re-checkout to the code under test failed, so the upgrade is " "not exercising the PR's changes (HC1)" ) return moved = ( (before.get("version") and after.get("version") and before["version"] != after["version"]) or (before.get("image") and after.get("image") and before["image"] != after["image"]) or (chaos and chaos != before.get("chaos")) ) assert moved, ( f"{domain}: upgrade did not move the deployment " f"(version {before.get('version')}->{after.get('version')}, " f"image {before.get('image')}->{after.get('image')}, " f"chaos {before.get('chaos')}->{chaos}) — " "not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)" ) _SNAPSHOT_ID_RE = re.compile(r'"snapshot_id"\s*:\s*"([0-9a-f]{8,})"') def parse_snapshot_id(backup_output: str) -> str | None: """The snapshot id from `abra app backup create` output (restic JSON summary line). This IS the backup artifact identity (DG3) — read from the create output because `abra app backup snapshots` requires a TTY and is awkward to script.""" m = _SNAPSHOT_ID_RE.search(backup_output) return m.group(1) if m else None def assert_backup_artifact(domain: str) -> str: """Generic BACKUP assertion (post-op): the orchestrator already ran the backup once. Assert a snapshot artifact was produced (its id recorded in op state). Returns the id.""" snap_id = op_state().get("backup", {}).get("snapshot_id") assert snap_id, ( f"{domain}: backup produced no snapshot artifact " "(no snapshot_id in `abra app backup create` output)" ) return snap_id def assert_restore_healthy(domain: str, meta: dict) -> None: """Generic RESTORE assertion (post-op): the orchestrator already restored. Assert the app is healthy + serving again (assert_serving polls, so the post-restore reconverge settles).""" assert_serving(domain, meta) # ---- Op primitives (orchestrator-only; perform the op once, never assert) -------------------- def perform_upgrade( domain: str, recipe: str, head_ref: str | None, deploy_timeout: int = 900, meta: dict | None = None ) -> dict[str, str | None]: """Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1): re-checkout the PR head (the prev-tag base deploy reset the recipe working tree), then `abra app deploy --chaos` to redeploy the running app at that checkout. This is the real upgrade the PR's changes are exercised by (vs the old 'upgrade to newest published tag', which never deployed PR-head code). Returns the pre-upgrade identity so the orchestrator records it for `assert_upgraded`'s move check — after the chaos deploy the `chaos`(-version) label carries the PR-head commit, proving it. `deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the chaos redeploy so a heavy stack's reconverge isn't SIGKILLed by abra.deploy's 900s default mid-wait. F2-12: the chaos redeploy runs with `--no-converge-checks` (abra's own convergence monitor FATAs on the heavy lasuite-drive prev→PR-head crossover while the NEW collabora's healthcheck is still in its start_period, even though it converges given swarm's healthcheck retries). We then own a STRICTER convergence+health wait here: services N/N (wait_healthy) + app HEALTH_PATH healthy + any recipe READY_PROBE (collabora WOPI discovery 200). This bounds readiness by OUR generous deadline, not abra's impatient one — and is stronger evidence than abra's monitor.""" meta = meta or {} before = lifecycle.deployed_identity(domain) if head_ref: lifecycle.recipe_checkout_ref(recipe, head_ref) # UPGRADE_EXTRA_ENV (F2-14c): a recipe may need different app .env for the upgrade-TARGET deploy # than for the base — e.g. mumble's `compose.host-ports.yml` overlay exists ONLY in the newer # (target) version, so the base deploys minimally WITHOUT it and the upgrade adds it to COMPOSE_FILE # here, after the PR-head checkout (which ships the overlay) and before the chaos redeploy that # picks up the new .env. Dict or callable(domain)->dict. No-op for recipes without it. upgrade_env = meta.get("UPGRADE_EXTRA_ENV") or {} if callable(upgrade_env): upgrade_env = upgrade_env(domain) or {} for k, v in upgrade_env.items(): print(f" upgrade-env: {k}={v}", flush=True) abra.env_set(domain, k, v) # HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound. lifecycle.prepull_images(recipe, domain) lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True) # Own the convergence verification (abra's monitor was skipped via -c). lifecycle.wait_healthy( domain, ok_codes=tuple(meta.get("HEALTH_OK", (200, 301, 302))), path=meta.get("HEALTH_PATH", "/"), deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout)), http_timeout=int(meta.get("HTTP_TIMEOUT", 300)), ) lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout))) after = lifecycle.deployed_identity(domain) # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the # PR-head we checked out — proving the upgrade deployed the code under test, not a published tag. print( f" upgrade→PR-head: head_ref={(head_ref or '')[:8] or None} " f"chaos-version={after.get('chaos')} version={before.get('version')}→{after.get('version')}", flush=True, ) return before def perform_backup(domain: str) -> str | None: """Perform the BACKUP op once. Returns the produced snapshot_id (or None) for the assertion.""" return parse_snapshot_id(lifecycle.backup_app(domain)) def perform_restore(domain: str) -> None: """Perform the RESTORE op once (restore the latest snapshot).""" lifecycle.restore_app(domain)