"""Generic, recipe-agnostic lifecycle assertions + op helpers (Phase 1d, plan §2.1). These are THE default for each lifecycle op: when a recipe ships no `test_.py` overlay, the generic tier (tests/_generic/test_.py) runs these against the single shared deployment the orchestrator brought up. The lifecycle OPERATIONS (upgrade/backup/restore) live here too — owned by the shared harness, not copy-pasted per recipe (DG7 DRY) — so overlays are assertions-only and may reuse these by composition (`from harness import generic; generic.assert_serving(...)`). Design + precedence: machine-docs/DECISIONS.md (Phase 1d). """ from __future__ import annotations import glob import os import re import socket import ssl import time from . import lifecycle # A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label. _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE) def _recipe_dir(recipe: str) -> str: return os.path.expanduser(f"~/.abra/recipes/{recipe}") def backup_capable(recipe: str, meta: dict | None = None) -> bool: """Whether the harness should run the backup/restore tiers (else they are a clean N/A skip, DG3). `recipe_meta.BACKUP_CAPABLE` (bool) overrides; otherwise auto-detect by scanning the recipe's compose*.yml for a truthy `backupbot.backup` label (the Co-op Cloud backup convention).""" if meta and "BACKUP_CAPABLE" in meta: return bool(meta["BACKUP_CAPABLE"]) for path in glob.glob(os.path.join(_recipe_dir(recipe), "compose*.yml")): try: with open(path) as fh: if _BACKUPBOT_RE.search(fh.read()): return True except OSError: continue return False def served_cert(domain: str, port: int = 443) -> tuple[bool, str]: """CA-verified TLS handshake to `domain` (via the gateway passthrough to cc-ci's Traefik). Returns (verified, detail) with CN+SAN on success, or the failure reason. Scope (per Adversary finding F1d-1): this is an INFRA TLS sanity check — it proves the served wildcard cert is publicly trusted, unexpired, and hostname-valid (so it would fail if the operator's LE wildcard lapsed/was mis-rotated — a real concern, plan §4.0 renewal). It does NOT distinguish a routed app from an un-routed host: Traefik's file provider serves the wildcard for the WHOLE `*.ci.commoninternet.net` zone, so any in-zone subdomain verifies whether or not an app is deployed. The app-vs-Traefik-fallback proof is `services_converged` + a non-404 status in `assert_serving`, not this.""" ctx = ssl.create_default_context() # verifies chain against system CAs + checks hostname try: with ( socket.create_connection((domain, port), timeout=20) as sock, ctx.wrap_socket(sock, server_hostname=domain) as ssock, ): cert = ssock.getpeercert() except ssl.SSLCertVerificationError as e: return (False, f"cert did not verify (Traefik default/self-signed?): {e}") except (OSError, ssl.SSLError) as e: return (False, f"TLS handshake error: {e}") cn = next( (v for rdn in cert.get("subject", ()) for k, v in rdn if k == "commonName"), "", ) sans = [v for typ, v in cert.get("subjectAltName", ()) if typ == "DNS"] return (True, f"CN={cn} SAN={sans}") def assert_serving(domain: str, meta: dict) -> None: """The single generic "is the app really serving?" assertion (DG1). The app-vs-Traefik-fallback proof is steps 1+2 (both load-bearing, verified by the Adversary): 1. every service in the stack converged (the app's OWN containers are N/N — an un-routed host has no app service, so this is False for a non-deployment); 2. a real HTTP(S) response with a status in HEALTH_OK — which EXCLUDES 404, so a Traefik unmatched-router fallback (404) fails, as does a routed-but-dead backend (502/503); 3. the body (from the SAME request as the status — no race) is not Traefik's default 404 page; 4. an INFRA TLS sanity check (served_cert): the served wildcard cert is trusted+unexpired. This does NOT distinguish the app from an un-routed host (Traefik serves the wildcard zone-wide, F1d-1) — it only catches a lapsed/mis-rotated cert. Steps 1–2 are BOUNDED POLLS (no bare sleep), so a state-mutating op (upgrade/restore) that leaves the app briefly reconverging settles, while a persistent failure still fails within the timeout.""" deadline = time.time() + meta["DEPLOY_TIMEOUT"] while time.time() < deadline and not lifecycle.services_converged(domain): time.sleep(5) assert lifecycle.services_converged(domain), f"{domain}: services did not converge" path = meta["HEALTH_PATH"] ok = tuple(meta["HEALTH_OK"]) deadline = time.time() + meta["HTTP_TIMEOUT"] served = False status, body = 0, "" while time.time() < deadline: status, body = lifecycle.http_fetch(domain, path) if status in ok and not (status == 200 and "404 page not found" in body): served = True break time.sleep(5) assert served, ( f"{domain}{path}: not serving — last HTTP {status} (Traefik 404 fallback, " "unhealthy backend, or default-404 body)" ) # Infra TLS sanity only (F1d-1): catches a lapsed/mis-rotated wildcard cert; does NOT prove the # app is routed (Traefik serves the wildcard zone-wide). The serving proof is steps 1–2 above. verified, detail = served_cert(domain) assert verified, f"{domain}: served wildcard cert is not trusted/valid — {detail}" assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}" # ---- Op/assertion split (Phase 1e HC3) ------------------------------------------------------- # The orchestrator performs each mutating op ONCE (the harness owns the op), records what an # assertion needs (pre-upgrade identity, backup snapshot_id) into a run-scoped JSON state file at # $CCCI_OP_STATE_FILE, then runs the generic assertion file (unless opted out) AND the overlay # assertion file against the shared post-op state. The assertion functions below read that state via # `op_state()`. They NEVER perform the op — that keeps the op single + lets generic+overlay coexist. import json as _json # noqa: E402 def op_state() -> dict: """The run-scoped op state the orchestrator wrote between op and assertions (or {} if unset). Carries e.g. {"upgrade": {"before": {...}}, "backup": {"snapshot_id": "..."}}.""" path = os.environ.get("CCCI_OP_STATE_FILE") if not path or not os.path.exists(path): return {} try: with open(path) as f: return _json.load(f) except (OSError, ValueError): return {} def assert_upgraded(domain: str, meta: dict) -> None: """Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once via `abra app deploy --chaos` of the PR-head checkout. Assert it reconverged + still serves AND that the deployment is genuinely the PR-head code under test (HC1) — non-vacuously (guarding F1d-2). The chaos deploy stamps `coop-cloud..chaos-version` = the deployed recipe commit. When the intended PR-head commit is known (head_ref), require the deployed chaos commit to MATCH it — THE proof the code under test was deployed, and non-vacuous: a stale prev-checkout chaos redeploy would stamp prev's commit, not head_ref, and fail here. When head_ref is unknown, fall back to requiring a move vs the pre-upgrade state (version/image/chaos changed).""" st = op_state().get("upgrade", {}) before = st.get("before") or {} head_ref = st.get("head_ref") assert_serving(domain, meta) after = lifecycle.deployed_identity(domain) chaos = after.get("chaos") if head_ref: assert chaos, ( f"{domain}: upgrade left no chaos label — `abra app deploy --chaos` did not deploy the " "PR-head checkout (the code under test was not exercised by the upgrade)" ) # chaos-version is an abbreviated commit (e.g. '8a026066'); head_ref may be full or short. assert head_ref.startswith(chaos) or chaos.startswith(head_ref), ( f"{domain}: upgrade deployed chaos commit {chaos!r}, not the intended PR-head " f"{head_ref[:12]!r} — the re-checkout to the code under test failed, so the upgrade is " "not exercising the PR's changes (HC1)" ) return moved = ( (before.get("version") and after.get("version") and before["version"] != after["version"]) or (before.get("image") and after.get("image") and before["image"] != after["image"]) or (chaos and chaos != before.get("chaos")) ) assert moved, ( f"{domain}: upgrade did not move the deployment " f"(version {before.get('version')}->{after.get('version')}, " f"image {before.get('image')}->{after.get('image')}, " f"chaos {before.get('chaos')}->{chaos}) — " "not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)" ) _SNAPSHOT_ID_RE = re.compile(r'"snapshot_id"\s*:\s*"([0-9a-f]{8,})"') def parse_snapshot_id(backup_output: str) -> str | None: """The snapshot id from `abra app backup create` output (restic JSON summary line). This IS the backup artifact identity (DG3) — read from the create output because `abra app backup snapshots` requires a TTY and is awkward to script.""" m = _SNAPSHOT_ID_RE.search(backup_output) return m.group(1) if m else None def assert_backup_artifact(domain: str) -> str: """Generic BACKUP assertion (post-op): the orchestrator already ran the backup once. Assert a snapshot artifact was produced (its id recorded in op state). Returns the id.""" snap_id = op_state().get("backup", {}).get("snapshot_id") assert snap_id, ( f"{domain}: backup produced no snapshot artifact " "(no snapshot_id in `abra app backup create` output)" ) return snap_id def assert_restore_healthy(domain: str, meta: dict) -> None: """Generic RESTORE assertion (post-op): the orchestrator already restored. Assert the app is healthy + serving again (assert_serving polls, so the post-restore reconverge settles).""" assert_serving(domain, meta) # ---- Op primitives (orchestrator-only; perform the op once, never assert) -------------------- def perform_upgrade(domain: str, recipe: str, head_ref: str | None) -> dict[str, str | None]: """Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1): re-checkout the PR head (the prev-tag base deploy reset the recipe working tree), then `abra app deploy --chaos` to redeploy the running app at that checkout. This is the real upgrade the PR's changes are exercised by (vs the old 'upgrade to newest published tag', which never deployed PR-head code). Returns the pre-upgrade identity so the orchestrator records it for `assert_upgraded`'s move check — after the chaos deploy the `chaos`(-version) label carries the PR-head commit, proving it.""" before = lifecycle.deployed_identity(domain) if head_ref: lifecycle.recipe_checkout_ref(recipe, head_ref) lifecycle.chaos_redeploy(domain) after = lifecycle.deployed_identity(domain) # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the # PR-head we checked out — proving the upgrade deployed the code under test, not a published tag. print( f" upgrade→PR-head: head_ref={(head_ref or '')[:8] or None} " f"chaos-version={after.get('chaos')} version={before.get('version')}→{after.get('version')}", flush=True, ) return before def perform_backup(domain: str) -> str | None: """Perform the BACKUP op once. Returns the produced snapshot_id (or None) for the assertion.""" return parse_snapshot_id(lifecycle.backup_app(domain)) def perform_restore(domain: str) -> None: """Perform the RESTORE op once (restore the latest snapshot).""" lifecycle.restore_app(domain)