# cc-ci/runner/harness/generic.py

"""Generic, recipe-agnostic lifecycle assertions + op helpers (Phase 1d, plan §2.1).

These are THE default for each lifecycle op: when a recipe ships no `test_<op>.py` overlay, the
generic tier (tests/_generic/test_<op>.py) runs these against the single shared deployment the
orchestrator brought up. The lifecycle OPERATIONS (upgrade/backup/restore) live here too — owned by
the shared harness, not copy-pasted per recipe (DG7 DRY) — so overlays are assertions-only and may
reuse these by composition (`from harness import generic; generic.assert_serving(...)`).

Design + precedence: machine-docs/DECISIONS.md (Phase 1d).
"""

from __future__ import annotations

import glob
import os
import re
import socket
import ssl
import time

from . import lifecycle

# A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label.
_BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)


def _recipe_dir(recipe: str) -> str:
    return os.path.expanduser(f"~/.abra/recipes/{recipe}")


def backup_capable(recipe: str, meta: dict | None = None) -> bool:
    """Whether the harness should run the backup/restore tiers (else they are a clean N/A skip, DG3).

    `recipe_meta.BACKUP_CAPABLE` (bool) overrides; otherwise auto-detect by scanning the recipe's
    compose*.yml for a truthy `backupbot.backup` label (the Co-op Cloud backup convention)."""
    if meta and "BACKUP_CAPABLE" in meta:
        return bool(meta["BACKUP_CAPABLE"])
    for path in glob.glob(os.path.join(_recipe_dir(recipe), "compose*.yml")):
        try:
            with open(path) as fh:
                if _BACKUPBOT_RE.search(fh.read()):
                    return True
        except OSError:
            continue
    return False


def served_cert(domain: str, port: int = 443) -> tuple[bool, str]:
    """Do a CA-verified TLS handshake with `domain` (via the gateway passthrough to cc-ci's Traefik).

    Returns `(True, "CN=<cn> SAN=<dns names>")` when the handshake verifies, else
    `(False, <reason>)` — a verification failure and any other connection/TLS error are reported
    with distinct messages.

    Scope (per Adversary finding F1d-1): this is an INFRA TLS sanity check only. It shows the
    served wildcard cert is publicly trusted, unexpired, and hostname-valid, so it fails if the
    operator's LE wildcard lapsed or was mis-rotated (plan §4.0 renewal). It does NOT tell a routed
    app apart from an un-routed host: Traefik's file provider serves the wildcard for the WHOLE
    `*.ci.commoninternet.net` zone, so any in-zone subdomain verifies whether or not an app is
    deployed. The app-vs-Traefik-fallback proof is `services_converged` + a non-404 status in
    `assert_serving`, not this."""
    # Default context: chain verified against the system CAs, hostname checking enabled.
    tls = ssl.create_default_context()
    try:
        with socket.create_connection((domain, port), timeout=20) as raw:
            with tls.wrap_socket(raw, server_hostname=domain) as secured:
                peer = secured.getpeercert()
    except ssl.SSLCertVerificationError as e:
        # Must be handled before the broader clause below (it is itself an SSLError/OSError).
        return (False, f"cert did not verify (Traefik default/self-signed?): {e}")
    except (OSError, ssl.SSLError) as e:
        return (False, f"TLS handshake error: {e}")

    # First commonName found in the subject, or "" when the subject carries none.
    cn = ""
    for rdn in peer.get("subject", ()):
        names = [value for key, value in rdn if key == "commonName"]
        if names:
            cn = names[0]
            break

    sans = []
    for kind, value in peer.get("subjectAltName", ()):
        if kind == "DNS":
            sans.append(value)
    return (True, f"CN={cn} SAN={sans}")


def assert_serving(domain: str, meta: dict) -> None:
    """The single generic "is the app really serving?" assertion (DG1).

    Reads `DEPLOY_TIMEOUT`, `HEALTH_PATH`, `HEALTH_OK` and `HTTP_TIMEOUT` from `meta` and raises
    AssertionError on the first check that fails:
      1. every service in the stack converged (the app's OWN containers are N/N — an un-routed
         host has no app service, so this is False for a non-deployment);
      2. a real HTTP(S) response whose status is in HEALTH_OK — which EXCLUDES 404, so a Traefik
         unmatched-router fallback (404) fails, as does a routed-but-dead backend (502/503);
      3. the body (taken from the SAME request as the status — no race) is not Traefik's default
         404 page;
      4. an INFRA TLS sanity check (served_cert): the served wildcard cert is trusted+unexpired.
         This does NOT separate the app from an un-routed host (Traefik serves the wildcard
         zone-wide, F1d-1) — it only catches a lapsed/mis-rotated cert.

    Checks 1+2 are the load-bearing app-vs-Traefik-fallback proof. Both are BOUNDED POLLS (5 s
    between attempts, no bare sleep), so an app briefly reconverging after a state-mutating op
    (upgrade/restore) settles, while a persistent failure still fails within the timeout."""
    # -- 1. service convergence, polled until DEPLOY_TIMEOUT elapses ---------------------------
    converge_by = time.time() + meta["DEPLOY_TIMEOUT"]
    while time.time() < converge_by:
        if lifecycle.services_converged(domain):
            break
        time.sleep(5)
    # Re-checked after the loop so a timeout (loop ended without converging) fails here.
    assert lifecycle.services_converged(domain), f"{domain}: services did not converge"

    # -- 2+3. healthy HTTP status and a non-default-404 body, polled until HTTP_TIMEOUT --------
    path = meta["HEALTH_PATH"]
    ok = tuple(meta["HEALTH_OK"])
    respond_by = time.time() + meta["HTTP_TIMEOUT"]
    status, body = 0, ""
    served = False
    while time.time() < respond_by:
        status, body = lifecycle.http_fetch(domain, path)
        if status in ok:
            # The body is only inspected once the status is acceptable.
            traefik_default_404 = status == 200 and "404 page not found" in body
            if not traefik_default_404:
                served = True
                break
        time.sleep(5)
    assert served, (
        f"{domain}{path}: not serving — last HTTP {status} (Traefik 404 fallback, "
        "unhealthy backend, or default-404 body)"
    )

    # -- 4. infra TLS sanity only (F1d-1): catches a lapsed/mis-rotated wildcard cert; it does
    # NOT prove the app is routed (Traefik serves the wildcard zone-wide).
    verified, detail = served_cert(domain)
    assert verified, f"{domain}: served wildcard cert is not trusted/valid — {detail}"
    assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}"


# ---- Op/assertion split (Phase 1e HC3) -------------------------------------------------------
# The orchestrator performs each mutating op ONCE (the harness owns the op), records what an
# assertion needs (pre-upgrade identity, backup snapshot_id) into a run-scoped JSON state file at
# $CCCI_OP_STATE_FILE, then runs the generic assertion file (unless opted out) AND the overlay
# assertion file against the shared post-op state. The assertion functions below read that state via
# `op_state()`. They NEVER perform the op — that keeps the op single + lets generic+overlay coexist.

import json as _json  # noqa: E402


def op_state() -> dict:
    """The run-scoped op state the orchestrator wrote between op and assertions (or {} if unset).
    Carries e.g. {"upgrade": {"before": {...}}, "backup": {"snapshot_id": "..."}}.

    The file path comes from $CCCI_OP_STATE_FILE. Always returns a dict: {} is returned when the
    variable is unset/empty, the file is missing or unreadable, the content is not valid JSON, or
    the top-level JSON value is not an object — so callers can chain `.get(...)` unconditionally."""
    path = os.environ.get("CCCI_OP_STATE_FILE")
    if not path:
        return {}
    try:
        # No exists() pre-check: a missing file (or a directory) surfaces as OSError below, which
        # avoids a check-then-open race.
        with open(path, encoding="utf-8") as f:
            state = _json.load(f)
    except (OSError, ValueError):
        # ValueError covers both malformed JSON and undecodable bytes.
        return {}
    # Valid JSON that is not an object (e.g. `[]`, `null`) is treated as "no state".
    return state if isinstance(state, dict) else {}


def assert_upgraded(domain: str, meta: dict) -> None:
    """Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once via
    `abra app deploy --chaos` of the PR-head checkout. Assert it reconverged + still serves AND that
    the deployment is genuinely the PR-head code under test (HC1) — non-vacuously (guarding F1d-2).

    Reads the "upgrade" entry of `op_state()` (`before` = pre-upgrade identity, `head_ref` =
    intended PR-head commit) and compares it with `lifecycle.deployed_identity(domain)` after
    `assert_serving` has passed. Raises AssertionError when any check fails.

    The chaos deploy stamps `coop-cloud.<stack>.chaos-version` = the deployed recipe commit. When the
    intended PR-head commit is known (head_ref), require the deployed chaos commit to MATCH it — THE
    proof the code under test was deployed, and non-vacuous: a stale prev-checkout chaos redeploy would
    stamp prev's commit, not head_ref, and fail here. A chaos label that carries no commit at all
    (only a working-tree marker) is rejected too, because an empty prefix would match any head_ref.
    When head_ref is unknown, fall back to requiring a move vs the pre-upgrade state
    (version/image/chaos changed)."""
    st = op_state().get("upgrade", {})
    before = st.get("before") or {}
    head_ref = st.get("head_ref")
    assert_serving(domain, meta)
    after = lifecycle.deployed_identity(domain)
    chaos = after.get("chaos")
    if head_ref:
        assert chaos, (
            f"{domain}: upgrade left no chaos label — `abra app deploy --chaos` did not deploy the "
            "PR-head checkout (the code under test was not exercised by the upgrade)"
        )
        # chaos-version is an abbreviated commit (e.g. '8a026066'); head_ref may be full or short.
        # abra appends a working-tree-state marker (e.g. '+U' = untracked file present) to the
        # chaos-version when a cc-ci DEPLOY OVERLAY sits in the recipe checkout as an untracked file
        # (e.g. ghost's compose.ccci-health.yml, provided by install_steps). That marker is NOT part
        # of the commit identity — strip it before the HC1 commit match. HC1 is preserved: the
        # underlying COMMIT must still equal head_ref; a stale prev-checkout chaos redeploy would
        # stamp prev's commit (also '+U' if overlaid) and STILL not match head_ref after stripping.
        chaos_commit = chaos.split("+", 1)[0]
        # Keep the prefix match non-vacuous: `head_ref.startswith("")` is always True, so a label
        # that is ONLY a marker (e.g. '+U') must fail explicitly instead of "matching".
        assert chaos_commit, (
            f"{domain}: upgrade chaos label {chaos!r} carries no commit — cannot prove the "
            "PR-head code under test was deployed (HC1)"
        )
        # Prefix match in either direction, since either side may be the abbreviated form.
        assert head_ref.startswith(chaos_commit) or chaos_commit.startswith(head_ref), (
            f"{domain}: upgrade deployed chaos commit {chaos!r}, not the intended PR-head "
            f"{head_ref[:12]!r} — the re-checkout to the code under test failed, so the upgrade is "
            "not exercising the PR's changes (HC1)"
        )
        return
    # head_ref unknown: accept any observable move. version/image count only when known on BOTH
    # sides and different; chaos counts when a label is now present and differs from before.
    moved = (
        (before.get("version") and after.get("version") and before["version"] != after["version"])
        or (before.get("image") and after.get("image") and before["image"] != after["image"])
        or (chaos and chaos != before.get("chaos"))
    )
    assert moved, (
        f"{domain}: upgrade did not move the deployment "
        f"(version {before.get('version')}->{after.get('version')}, "
        f"image {before.get('image')}->{after.get('image')}, "
        f"chaos {before.get('chaos')}->{chaos}) — "
        "not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)"
    )


_SNAPSHOT_ID_RE = re.compile(r'"snapshot_id"\s*:\s*"([0-9a-f]{8,})"')


def parse_snapshot_id(backup_output: str) -> str | None:
    """The snapshot id from `abra app backup create` output (restic JSON summary line). This IS the
    backup artifact identity (DG3) — read from the create output because `abra app backup snapshots`
    requires a TTY and is awkward to script."""
    m = _SNAPSHOT_ID_RE.search(backup_output)
    return m.group(1) if m else None


def assert_backup_artifact(domain: str) -> str:
    """Generic BACKUP assertion (post-op): the orchestrator already ran the backup once.

    Looks up `snapshot_id` under the "backup" entry of `op_state()`, asserts it is present and
    non-empty (i.e. a snapshot artifact was produced), and returns it. `domain` is used only in
    the failure message."""
    backup_state = op_state().get("backup", {})
    snapshot = backup_state.get("snapshot_id")
    assert snapshot, (
        f"{domain}: backup produced no snapshot artifact "
        "(no snapshot_id in `abra app backup create` output)"
    )
    return snapshot


def assert_restore_healthy(domain: str, meta: dict) -> None:
    """Generic RESTORE assertion (post-op): the orchestrator already restored.

    Delegates entirely to `assert_serving`, i.e. the app must be healthy + serving again; because
    that check polls, the post-restore reconverge is given time to settle."""
    assert_serving(domain, meta)


# ---- Op primitives (orchestrator-only; perform the op once, never assert) --------------------


def perform_upgrade(
    domain: str,
    recipe: str,
    head_ref: str | None,
    deploy_timeout: int = 900,
    meta: dict | None = None,
) -> dict[str, str | None]:
    """Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1).

    Sequence: capture the pre-upgrade identity; re-checkout the PR head when `head_ref` is given
    (the prev-tag base deploy reset the recipe working tree); pre-pull images; `abra app deploy
    --chaos` to redeploy the running app at that checkout; wait for health and any ready probes;
    print an evidence line. This is the real upgrade the PR's changes are exercised by (vs the old
    'upgrade to newest published tag', which never deployed PR-head code).

    Returns the PRE-upgrade identity so the orchestrator can record it for `assert_upgraded`'s
    move check — after the chaos deploy the `chaos`(-version) label carries the PR-head commit.

    `deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the chaos redeploy so a heavy stack's
    reconverge isn't SIGKILLed by abra.deploy's 900s default mid-wait. `meta` may supply
    HEALTH_OK / HEALTH_PATH / DEPLOY_TIMEOUT / HTTP_TIMEOUT for the readiness wait; missing keys
    fall back to (200, 301, 302), "/", `deploy_timeout` and 300 respectively.

    F2-12: the chaos redeploy runs with `--no-converge-checks` (abra's own convergence monitor
    FATAs on the heavy lasuite-drive prev→PR-head crossover while the NEW collabora's healthcheck
    is still in its start_period, even though it converges given swarm's healthcheck retries). A
    STRICTER convergence+health wait is owned here instead: services N/N (wait_healthy) + app
    HEALTH_PATH healthy + any recipe READY_PROBE (collabora WOPI discovery 200). Readiness is thus
    bounded by OUR generous deadline, not abra's impatient one — and is stronger evidence than
    abra's monitor."""
    meta = meta or {}
    before = lifecycle.deployed_identity(domain)
    if head_ref:
        lifecycle.recipe_checkout_ref(recipe, head_ref)

    # HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned
    # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
    lifecycle.prepull_images(recipe, domain)
    lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)

    # Own the convergence verification (abra's monitor was skipped via -c). The settings are
    # resolved only now — after the redeploy — and in this order.
    ok_codes = tuple(meta.get("HEALTH_OK", (200, 301, 302)))
    health_path = meta.get("HEALTH_PATH", "/")
    converge_budget = int(meta.get("DEPLOY_TIMEOUT", deploy_timeout))
    http_budget = int(meta.get("HTTP_TIMEOUT", 300))
    lifecycle.wait_healthy(
        domain,
        ok_codes=ok_codes,
        path=health_path,
        deploy_timeout=converge_budget,
        http_timeout=http_budget,
    )
    lifecycle.wait_ready_probes(meta, domain, timeout=converge_budget)

    after = lifecycle.deployed_identity(domain)
    # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
    # PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.
    print(
        f"  upgrade→PR-head: head_ref={(head_ref or '')[:8] or None} "
        f"chaos-version={after.get('chaos')} version={before.get('version')}→{after.get('version')}",
        flush=True,
    )
    return before


def perform_backup(domain: str) -> str | None:
    """Perform the BACKUP op once and hand back the snapshot_id it produced.

    Runs `lifecycle.backup_app(domain)` and parses its output with `parse_snapshot_id`; the
    result is None when no id is found there. The value is recorded for the backup assertion."""
    backup_output = lifecycle.backup_app(domain)
    return parse_snapshot_id(backup_output)


def perform_restore(domain: str) -> None:
    """Perform the RESTORE op once (restore the latest snapshot).

    Thin delegation to `lifecycle.restore_app(domain)`; nothing is asserted or returned here."""
    lifecycle.restore_app(domain)