Push builds have been RED on the lint step since ~build 209 from accumulated formatting drift. This is the mechanical cleanup: ruff format + ruff --fix (UP038 isinstance unions, SIM105 contextlib.suppress, UP031 f-strings, SIM115 tempfile context manager), shfmt -i 2 -ci, nixpkgs-fmt/statix/deadnix (merged attrsets, dropped unused lib args), yamllint, and shell quoting fixes in tests/lasuite-docs/setup_custom_tests.sh. No behaviour changes intended; lint: PASS, unit tests: 138 passed.
296 lines
15 KiB
Python
296 lines
15 KiB
Python
"""Generic, recipe-agnostic lifecycle assertions + op helpers (Phase 1d, plan §2.1).
|
||
|
||
These are THE default for each lifecycle op: when a recipe ships no `test_<op>.py` overlay, the
|
||
generic tier (tests/_generic/test_<op>.py) runs these against the single shared deployment the
|
||
orchestrator brought up. The lifecycle OPERATIONS (upgrade/backup/restore) live here too — owned by
|
||
the shared harness, not copy-pasted per recipe (DG7 DRY) — so overlays are assertions-only and may
|
||
reuse these by composition (`from harness import generic; generic.assert_serving(...)`).
|
||
|
||
Design + precedence: machine-docs/DECISIONS.md (Phase 1d).
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import glob
|
||
import os
|
||
import re
|
||
import socket
|
||
import ssl
|
||
import time
|
||
|
||
from . import abra, lifecycle
|
||
|
||
# A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label.
# The pattern matches `backupbot.backup` followed, on the SAME line, by the whole word `true`
# (case-insensitive).
# NOTE(review): a longer key that merely starts with `backupbot.backup` plus a non-word character
# (e.g. `backupbot.backup.<suffix>`) also matches when `true` appears later on that line — confirm
# this is acceptable for the recipes in use.
_BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)
|
||
|
||
|
||
def _recipe_dir(recipe: str) -> str:
|
||
return os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||
|
||
|
||
def backup_capable(recipe: str, meta: dict | None = None) -> bool:
    """Whether the harness should run the backup/restore tiers (else they are a clean N/A skip, DG3).

    `recipe_meta.BACKUP_CAPABLE` (coerced to bool) overrides; otherwise auto-detect by scanning the
    recipe's compose*.yml for a truthy `backupbot.backup` label (the Co-op Cloud backup convention).

    Args:
        recipe: recipe name; its checkout directory is resolved via `_recipe_dir`.
        meta: optional recipe metadata; only the `BACKUP_CAPABLE` key is consulted here.

    Returns:
        True if the override is truthy or any readable compose file matches `_BACKUPBOT_RE`;
        False otherwise (including when no compose file exists or none can be read).
    """
    if meta and "BACKUP_CAPABLE" in meta:
        return bool(meta["BACKUP_CAPABLE"])
    pattern = os.path.join(_recipe_dir(recipe), "compose*.yml")
    for path in glob.glob(pattern):
        try:
            # Decode explicitly as UTF-8 (not the locale default) and replace undecodable bytes:
            # a stray byte must not raise UnicodeDecodeError — which is NOT an OSError and would
            # escape the best-effort handler below. The label being searched for is pure ASCII.
            with open(path, encoding="utf-8", errors="replace") as fh:
                text = fh.read()
        except OSError:
            # Unreadable file: best-effort scan, move on to the next compose file.
            continue
        if _BACKUPBOT_RE.search(text):
            return True
    return False
|
||
|
||
|
||
def served_cert(domain: str, port: int = 443) -> tuple[bool, str]:
    """Perform a CA-verified TLS handshake with `domain` and describe the certificate it serves.

    Returns `(verified, detail)`: on success `detail` carries the certificate's CN and DNS SANs; on
    failure it carries the reason (verification failure vs. any other connection/TLS error).

    Scope (per Adversary finding F1d-1): an INFRA TLS sanity check only. It shows the served
    wildcard cert is publicly trusted, unexpired and hostname-valid, so it fails if the operator's
    LE wildcard lapsed or was mis-rotated (plan §4.0 renewal). It does NOT tell a routed app from an
    un-routed host: Traefik's file provider serves the wildcard for the WHOLE
    `*.ci.commoninternet.net` zone, so any in-zone subdomain verifies whether or not an app is
    deployed. The app-vs-Traefik-fallback proof is `services_converged` + a non-404 status in
    `assert_serving`, not this function.
    """
    # Default context: chain is verified against the system CAs and the hostname is checked.
    tls = ssl.create_default_context()
    try:
        with socket.create_connection((domain, port), timeout=20) as raw:
            with tls.wrap_socket(raw, server_hostname=domain) as secured:
                peer = secured.getpeercert()
    except ssl.SSLCertVerificationError as err:
        # Must precede the broader handler: this class is a subclass of ssl.SSLError/OSError.
        return (False, f"cert did not verify (Traefik default/self-signed?): {err}")
    except (OSError, ssl.SSLError) as err:
        return (False, f"TLS handshake error: {err}")

    # First commonName found in the subject RDN sequence, or "" when there is none.
    common_names = [
        value for rdn in peer.get("subject", ()) for key, value in rdn if key == "commonName"
    ]
    common_name = common_names[0] if common_names else ""

    dns_names = []
    for kind, value in peer.get("subjectAltName", ()):
        if kind == "DNS":
            dns_names.append(value)
    return (True, f"CN={common_name} SAN={dns_names}")
|
||
|
||
|
||
def assert_serving(domain: str, meta: dict) -> None:
    """The single generic "is the app really serving?" assertion (DG1).

    Checks, in order (1+2 are the load-bearing app-vs-Traefik-fallback proof):
      1. every service in the stack converged — an un-routed host has no app service, so this is
         False for a non-deployment;
      2. a real HTTP(S) response whose status is in `meta["HEALTH_OK"]` — which excludes 404, so a
         Traefik unmatched-router fallback (404) fails, as does a routed-but-dead backend (502/503);
      3. the body of that SAME response (no second request, so no race) is not Traefik's default
         404 page;
      4. an INFRA TLS sanity check via `served_cert`: the served wildcard cert is trusted and
         unexpired. This does NOT separate the app from an un-routed host (F1d-1).

    Steps 1–2 are bounded polls (5s between attempts, limited by `meta["DEPLOY_TIMEOUT"]` and
    `meta["HTTP_TIMEOUT"]` respectively), so an app that is briefly reconverging after an
    upgrade/restore settles, while a persistent failure still fails within the timeout.

    Raises:
        AssertionError: if any of the checks above does not hold.
        KeyError: if `meta` lacks DEPLOY_TIMEOUT, HEALTH_PATH, HEALTH_OK or HTTP_TIMEOUT.
    """
    # Step 1: wait (bounded) for convergence, then assert on a fresh reading.
    converge_by = time.time() + meta["DEPLOY_TIMEOUT"]
    while True:
        if time.time() >= converge_by or lifecycle.services_converged(domain):
            break
        time.sleep(5)
    assert lifecycle.services_converged(domain), f"{domain}: services did not converge"

    # Steps 2+3: poll the health path until an accepted, non-fallback response arrives.
    health_path = meta["HEALTH_PATH"]
    accepted = tuple(meta["HEALTH_OK"])
    respond_by = time.time() + meta["HTTP_TIMEOUT"]
    status, body = 0, ""
    served = False
    while time.time() < respond_by:
        status, body = lifecycle.http_fetch(domain, health_path)
        looks_served = status in accepted
        # The body is only inspected for an accepted 200: a 200 carrying Traefik's default 404 text.
        if looks_served and status == 200 and "404 page not found" in body:
            looks_served = False
        if looks_served:
            served = True
            break
        time.sleep(5)
    assert served, (
        f"{domain}{health_path}: not serving — last HTTP {status} (Traefik 404 fallback, "
        "unhealthy backend, or default-404 body)"
    )

    # Step 4 — infra TLS sanity only (F1d-1): catches a lapsed/mis-rotated wildcard cert; it does
    # NOT prove the app is routed (Traefik serves the wildcard zone-wide).
    verified, detail = served_cert(domain)
    assert verified, f"{domain}: served wildcard cert is not trusted/valid — {detail}"
    assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}"
|
||
|
||
|
||
# ---- Op/assertion split (Phase 1e HC3) -------------------------------------------------------
|
||
# The orchestrator performs each mutating op ONCE (the harness owns the op), records what an
|
||
# assertion needs (pre-upgrade identity, backup snapshot_id) into a run-scoped JSON state file at
|
||
# $CCCI_OP_STATE_FILE, then runs the generic assertion file (unless opted out) AND the overlay
|
||
# assertion file against the shared post-op state. The assertion functions below read that state via
|
||
# `op_state()`. They NEVER perform the op — that keeps the op single + lets generic+overlay coexist.
|
||
|
||
import json as _json # noqa: E402
|
||
|
||
|
||
def op_state() -> dict:
    """The run-scoped op state the orchestrator wrote between op and assertions (or {} if unset).

    Reads the JSON file named by the `CCCI_OP_STATE_FILE` environment variable. Carries e.g.
    {"upgrade": {"before": {...}}, "backup": {"snapshot_id": "..."}}.

    Returns:
        The decoded JSON object. `{}` when the variable is unset/empty, the file is missing or
        unreadable, the content is not valid JSON, or the top-level JSON value is not an object —
        so callers can always chain `.get(...)` on the result.
    """
    path = os.environ.get("CCCI_OP_STATE_FILE")
    if not path:
        return {}
    try:
        # No separate existence check: a missing file raises FileNotFoundError (an OSError), which
        # is handled below without a check-then-open race.
        with open(path, encoding="utf-8") as f:
            state = _json.load(f)
    except (OSError, ValueError):
        # ValueError covers both malformed JSON and undecodable bytes.
        return {}
    # Valid JSON that is not an object (list, null, string, ...) would break callers' `.get`.
    return state if isinstance(state, dict) else {}
|
||
|
||
|
||
def assert_upgraded(domain: str, meta: dict) -> None:
    """Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once via
    `abra app deploy --chaos` of the PR-head checkout. Assert it reconverged + still serves AND that
    the deployment is genuinely the PR-head code under test (HC1) — non-vacuously (guarding F1d-2).

    The chaos deploy stamps `coop-cloud.<stack>.chaos-version` = the deployed recipe commit. When the
    intended PR-head commit is known (`head_ref` in the recorded op state), require the deployed
    chaos commit to MATCH it — THE proof the code under test was deployed, and non-vacuous: a stale
    prev-checkout chaos redeploy would stamp prev's commit, not head_ref, and fail here. When
    head_ref is unknown, fall back to requiring a move vs the pre-upgrade state
    (version/image/chaos changed).

    Args:
        domain: the app domain under test.
        meta: recipe metadata, passed through to `assert_serving`.

    Raises:
        AssertionError: if the app is not serving, the chaos label is missing or carries no commit
            id, the chaos commit does not match head_ref, or (fallback) nothing moved.
    """
    st = op_state().get("upgrade", {})
    before = st.get("before") or {}
    head_ref = st.get("head_ref")
    assert_serving(domain, meta)
    after = lifecycle.deployed_identity(domain)
    chaos = after.get("chaos")
    if head_ref:
        assert chaos, (
            f"{domain}: upgrade left no chaos label — `abra app deploy --chaos` did not deploy the "
            "PR-head checkout (the code under test was not exercised by the upgrade)"
        )
        # chaos-version is an abbreviated commit (e.g. '8a026066'); head_ref may be full or short.
        # abra appends a working-tree-state marker (e.g. '+U' = untracked file present) to the
        # chaos-version when a cc-ci DEPLOY OVERLAY sits in the recipe checkout as an untracked file
        # (e.g. ghost's compose.ccci-health.yml, provided by install_steps). That marker is NOT part
        # of the commit identity — strip it before the HC1 commit match. HC1 is preserved: the
        # underlying COMMIT must still equal head_ref; a stale prev-checkout chaos redeploy would
        # stamp prev's commit (also '+U' if overlaid) and STILL not match head_ref after stripping.
        chaos_commit = chaos.split("+", 1)[0]
        # Guard against a vacuous match: every string starts with "", so a label that is ONLY a
        # marker (e.g. '+U') would otherwise satisfy the prefix check below for any head_ref.
        assert chaos_commit, (
            f"{domain}: chaos label {chaos!r} carries no commit id — cannot prove the intended "
            f"PR-head {head_ref[:12]!r} was deployed (HC1)"
        )
        assert head_ref.startswith(chaos_commit) or chaos_commit.startswith(head_ref), (
            f"{domain}: upgrade deployed chaos commit {chaos!r}, not the intended PR-head "
            f"{head_ref[:12]!r} — the re-checkout to the code under test failed, so the upgrade is "
            "not exercising the PR's changes (HC1)"
        )
        return
    # Fallback (head_ref unknown): something identifying the deployment must have changed.
    moved = (
        (before.get("version") and after.get("version") and before["version"] != after["version"])
        or (before.get("image") and after.get("image") and before["image"] != after["image"])
        or (chaos and chaos != before.get("chaos"))
    )
    assert moved, (
        f"{domain}: upgrade did not move the deployment "
        f"(version {before.get('version')}->{after.get('version')}, "
        f"image {before.get('image')}->{after.get('image')}, "
        f"chaos {before.get('chaos')}->{chaos}) — "
        "not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)"
    )
|
||
|
||
|
||
_SNAPSHOT_ID_RE = re.compile(r'"snapshot_id"\s*:\s*"([0-9a-f]{8,})"')


def parse_snapshot_id(backup_output: str) -> str | None:
    """Extract the snapshot id from `abra app backup create` output (restic JSON summary line).

    This id IS the backup artifact identity (DG3). It is read from the create output because
    `abra app backup snapshots` requires a TTY and is awkward to script.

    Returns the first `"snapshot_id": "<8+ lowercase hex chars>"` value found, or None when the
    output contains no such field.
    """
    found = _SNAPSHOT_ID_RE.search(backup_output)
    if found is None:
        return None
    return found.group(1)
|
||
|
||
|
||
def assert_backup_artifact(domain: str) -> str:
    """Generic BACKUP assertion (post-op): the orchestrator already ran the backup once.

    Reads the `snapshot_id` recorded under `backup` in the op state and asserts it is present
    (i.e. a snapshot artifact was produced).

    Returns:
        The recorded snapshot id.

    Raises:
        AssertionError: if no snapshot id was recorded.
    """
    backup_state = op_state().get("backup", {})
    snapshot = backup_state.get("snapshot_id")
    assert snapshot, (
        f"{domain}: backup produced no snapshot artifact "
        "(no snapshot_id in `abra app backup create` output)"
    )
    return snapshot
|
||
|
||
|
||
def assert_restore_healthy(domain: str, meta: dict) -> None:
    """Generic RESTORE assertion (post-op): the orchestrator already restored.

    Delegates to `assert_serving`, whose bounded polls let the post-restore reconverge settle
    before the app is required to be healthy and serving again.
    """
    assert_serving(domain=domain, meta=meta)
|
||
|
||
|
||
# ---- Op primitives (orchestrator-only; perform the op once, never assert) --------------------
|
||
|
||
|
||
def perform_upgrade(
    domain: str,
    recipe: str,
    head_ref: str | None,
    deploy_timeout: int = 900,
    meta: dict | None = None,
) -> dict[str, str | None]:
    """Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1).

    Sequence: record the pre-upgrade identity; re-checkout the PR head when `head_ref` is given (the
    prev-tag base deploy reset the recipe working tree); apply any `UPGRADE_EXTRA_ENV`; pre-pull
    images; `abra app deploy --chaos` redeploy of the running app at that checkout; then wait for
    health and readiness. This is the real upgrade the PR's changes are exercised by (vs the old
    'upgrade to newest published tag', which never deployed PR-head code).

    Returns the PRE-upgrade identity so the orchestrator can record it for `assert_upgraded`'s move
    check — after the chaos deploy the `chaos`(-version) label carries the PR-head commit.

    `deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the chaos redeploy so a heavy stack's
    reconverge isn't SIGKILLed by abra.deploy's 900s default mid-wait.

    F2-12: the chaos redeploy runs with `no_converge_checks=True` (abra's own convergence monitor
    FATAs on the heavy lasuite-drive prev→PR-head crossover while the NEW collabora's healthcheck is
    still in its start_period, even though it converges given swarm's healthcheck retries). This
    function then owns a STRICTER wait: services N/N (wait_healthy) + app HEALTH_PATH healthy + any
    recipe READY_PROBE (collabora WOPI discovery 200) — bounded by OUR generous deadline rather than
    abra's impatient one.
    """
    recipe_meta = meta or {}
    identity_before = lifecycle.deployed_identity(domain)
    if head_ref:
        lifecycle.recipe_checkout_ref(recipe, head_ref)

    # UPGRADE_EXTRA_ENV (F2-14c): a recipe may need different app .env for the upgrade-TARGET deploy
    # than for the base — e.g. mumble's `compose.host-ports.yml` overlay exists ONLY in the newer
    # (target) version, so the base deploys minimally WITHOUT it and the upgrade adds it to
    # COMPOSE_FILE here: after the PR-head checkout (which ships the overlay) and before the chaos
    # redeploy that picks up the new .env. Dict or callable(domain)->dict; no-op when absent.
    extra_env = recipe_meta.get("UPGRADE_EXTRA_ENV") or {}
    if callable(extra_env):
        extra_env = extra_env(domain) or {}
    for key, value in extra_env.items():
        print(f" upgrade-env: {key}={value}", flush=True)
        abra.env_set(domain, key, value)

    # HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned
    # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
    lifecycle.prepull_images(recipe, domain)
    lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)

    # Own the convergence verification, since abra's monitor was skipped for the redeploy above.
    accepted_codes = tuple(recipe_meta.get("HEALTH_OK", (200, 301, 302)))
    health_path = recipe_meta.get("HEALTH_PATH", "/")
    converge_timeout = int(recipe_meta.get("DEPLOY_TIMEOUT", deploy_timeout))
    http_timeout = int(recipe_meta.get("HTTP_TIMEOUT", 300))
    lifecycle.wait_healthy(
        domain,
        ok_codes=accepted_codes,
        path=health_path,
        deploy_timeout=converge_timeout,
        http_timeout=http_timeout,
    )
    lifecycle.wait_ready_probes(recipe_meta, domain, timeout=converge_timeout)

    identity_after = lifecycle.deployed_identity(domain)
    # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
    # PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.
    short_head = (head_ref or "")[:8] or None
    print(
        f" upgrade→PR-head: head_ref={short_head} "
        f"chaos-version={identity_after.get('chaos')} "
        f"version={identity_before.get('version')}→{identity_after.get('version')}",
        flush=True,
    )
    return identity_before
|
||
|
||
|
||
def perform_backup(domain: str) -> str | None:
    """Perform the BACKUP op once.

    Runs `lifecycle.backup_app` for `domain` and returns the snapshot id parsed from its output
    (None when the output carries no snapshot id), for the backup assertion to check.
    """
    create_output = lifecycle.backup_app(domain)
    return parse_snapshot_id(create_output)
|
||
|
||
|
||
def perform_restore(domain: str) -> None:
    """Perform the RESTORE op once (restore the latest snapshot) via `lifecycle.restore_app`.

    Whatever `lifecycle.restore_app` returns is deliberately discarded; this op returns None.
    """
    lifecycle.restore_app(domain)
    return None
|