Files
cc-ci/runner/harness/generic.py
autonomic-bot a7e2af444a fix(2): assert_upgraded tolerate abra's '+U' working-tree marker on chaos-version
A cc-ci deploy overlay sitting in the recipe checkout as an untracked file (ghost's
compose.ccci-health.yml via install_steps) makes abra stamp chaos-version='<commit>+U' (U=untracked).
The commit still equals head_ref (HC1 satisfied) but the '+U' broke the exact-prefix match → spurious
upgrade-tier FAIL. Strip the working-tree-state marker before the commit match; HC1 preserved (commit
must still equal head_ref — a stale checkout's commit would not match even after stripping). General:
benefits every future cc-ci overlay recipe.
2026-05-30 05:49:27 +01:00

279 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Generic, recipe-agnostic lifecycle assertions + op helpers (Phase 1d, plan §2.1).
These are THE default for each lifecycle op: when a recipe ships no `test_<op>.py` overlay, the
generic tier (tests/_generic/test_<op>.py) runs these against the single shared deployment the
orchestrator brought up. The lifecycle OPERATIONS (upgrade/backup/restore) live here too — owned by
the shared harness, not copy-pasted per recipe (DG7 DRY) — so overlays are assertions-only and may
reuse these by composition (`from harness import generic; generic.assert_serving(...)`).
Design + precedence: machine-docs/DECISIONS.md (Phase 1d).
"""
from __future__ import annotations
import glob
import os
import re
import socket
import ssl
import time
from . import lifecycle
# A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label.
_BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)


def _recipe_dir(recipe: str) -> str:
    """Return the local abra checkout directory of `recipe` (`~/.abra/recipes/<recipe>`, expanded)."""
    return os.path.expanduser(f"~/.abra/recipes/{recipe}")


def backup_capable(recipe: str, meta: dict | None = None) -> bool:
    """Whether the harness should run the backup/restore tiers (else they are a clean N/A skip, DG3).

    `recipe_meta.BACKUP_CAPABLE` (bool) overrides; otherwise auto-detect by scanning the recipe's
    compose*.yml for a truthy `backupbot.backup` label (the Co-op Cloud backup convention).

    Args:
        recipe: recipe name; its checkout is looked up under `~/.abra/recipes/`.
        meta: optional recipe meta; when it contains `BACKUP_CAPABLE`, that value (coerced to bool)
            is returned without touching the filesystem.

    Returns:
        True if the override says so or any compose*.yml matches the label pattern, else False.
        Unreadable compose files are skipped (best-effort detection)."""
    if meta and "BACKUP_CAPABLE" in meta:
        return bool(meta["BACKUP_CAPABLE"])
    pattern = os.path.join(_recipe_dir(recipe), "compose*.yml")
    for path in glob.glob(pattern):
        try:
            # Decode explicitly and leniently: the label pattern is pure ASCII, so a stray
            # non-UTF-8 byte elsewhere in the file must not abort detection (a locale-default
            # decode would raise UnicodeDecodeError, which is NOT an OSError and would escape).
            with open(path, encoding="utf-8", errors="replace") as fh:
                text = fh.read()
        except OSError:
            continue
        if _BACKUPBOT_RE.search(text):
            return True
    return False
def served_cert(domain: str, port: int = 443) -> tuple[bool, str]:
    """Perform a CA-verified TLS handshake to `domain` (via the gateway passthrough to cc-ci's Traefik).

    Returns `(verified, detail)`: on success `detail` carries the certificate's CN and DNS SANs; on
    failure it carries the reason (verification failure vs. any other connection/handshake error).

    Scope (per Adversary finding F1d-1): this is an INFRA TLS sanity check. It proves the served
    wildcard cert is publicly trusted, unexpired and hostname-valid, so it fails if the operator's
    LE wildcard lapsed or was mis-rotated (plan §4.0 renewal). It does NOT tell a routed app from an
    un-routed host: Traefik's file provider serves the wildcard for the WHOLE
    `*.ci.commoninternet.net` zone, so any in-zone subdomain verifies whether or not an app is
    deployed. The app-vs-Traefik-fallback proof is `services_converged` + a non-404 status in
    `assert_serving`, not this."""
    # The default context verifies the chain against the system CAs and checks the hostname.
    tls_context = ssl.create_default_context()
    try:
        with socket.create_connection((domain, port), timeout=20) as raw_sock:
            with tls_context.wrap_socket(raw_sock, server_hostname=domain) as tls_sock:
                peer_cert = tls_sock.getpeercert()
    except ssl.SSLCertVerificationError as exc:
        # Must be caught before the broader handler below: it is itself an SSLError/OSError.
        return (False, f"cert did not verify (Traefik default/self-signed?): {exc}")
    except (OSError, ssl.SSLError) as exc:
        return (False, f"TLS handshake error: {exc}")

    # First commonName found in the subject RDN sequence, or "" when the cert has none.
    common_names = [
        value
        for rdn in peer_cert.get("subject", ())
        for key, value in rdn
        if key == "commonName"
    ]
    cn = common_names[0] if common_names else ""
    sans = [value for kind, value in peer_cert.get("subjectAltName", ()) if kind == "DNS"]
    return (True, f"CN={cn} SAN={sans}")
def assert_serving(domain: str, meta: dict) -> None:
    """The single generic "is the app really serving?" assertion (DG1).

    The app-vs-Traefik-fallback proof is steps 1+2 (both load-bearing, verified by the Adversary):
      1. every service in the stack converged (the app's OWN containers are N/N — an un-routed host
         has no app service, so this is False for a non-deployment);
      2. a real HTTP(S) response with a status in HEALTH_OK — which EXCLUDES 404, so a Traefik
         unmatched-router fallback (404) fails, as does a routed-but-dead backend (502/503);
      3. the body (from the SAME request as the status — no race) is not Traefik's default 404 page;
      4. an INFRA TLS sanity check (served_cert): the served wildcard cert is trusted+unexpired. This
         does NOT distinguish the app from an un-routed host (Traefik serves the wildcard zone-wide,
         F1d-1) — it only catches a lapsed/mis-rotated cert.

    Steps 1-2 are BOUNDED POLLS (no bare sleep), so a state-mutating op (upgrade/restore) that leaves
    the app briefly reconverging settles, while a persistent failure still fails within the timeout.

    Args:
        domain: the deployed app's domain.
        meta: recipe meta; must carry DEPLOY_TIMEOUT and HTTP_TIMEOUT (seconds), HEALTH_PATH, and
            HEALTH_OK (iterable of accepted HTTP status codes).

    Raises:
        AssertionError: if services never converge, no healthy response arrives in time, or the
            served certificate fails the sanity check.
        KeyError: if `meta` lacks one of the required keys."""
    # Deadlines are measured on the monotonic clock: a wall-clock step (NTP sync, manual change)
    # during a long poll must neither stretch nor cut short the bounded wait.
    deadline = time.monotonic() + meta["DEPLOY_TIMEOUT"]
    while time.monotonic() < deadline and not lifecycle.services_converged(domain):
        time.sleep(5)
    # Authoritative final check (also the only check when the timeout is already exhausted).
    assert lifecycle.services_converged(domain), f"{domain}: services did not converge"

    path = meta["HEALTH_PATH"]
    ok = tuple(meta["HEALTH_OK"])
    deadline = time.monotonic() + meta["HTTP_TIMEOUT"]
    served = False
    status, body = 0, ""
    while time.monotonic() < deadline:
        status, body = lifecycle.http_fetch(domain, path)
        # A 200 whose body is Traefik's default 404 page is still a fallback, not the app.
        if status in ok and not (status == 200 and "404 page not found" in body):
            served = True
            break
        time.sleep(5)
    assert served, (
        f"{domain}{path}: not serving — last HTTP {status} (Traefik 404 fallback, "
        "unhealthy backend, or default-404 body)"
    )

    # Infra TLS sanity only (F1d-1): catches a lapsed/mis-rotated wildcard cert; does NOT prove the
    # app is routed (Traefik serves the wildcard zone-wide). The serving proof is steps 1-2 above.
    verified, detail = served_cert(domain)
    assert verified, f"{domain}: served wildcard cert is not trusted/valid — {detail}"
    assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}"
# ---- Op/assertion split (Phase 1e HC3) -------------------------------------------------------
# The orchestrator performs each mutating op ONCE (the harness owns the op), records what an
# assertion needs (pre-upgrade identity, backup snapshot_id) into a run-scoped JSON state file at
# $CCCI_OP_STATE_FILE, then runs the generic assertion file (unless opted out) AND the overlay
# assertion file against the shared post-op state. The assertion functions below read that state via
# `op_state()`. They NEVER perform the op — that keeps the op single + lets generic+overlay coexist.
import json as _json # noqa: E402
def op_state() -> dict:
    """The run-scoped op state the orchestrator wrote between op and assertions (or {} if unset).

    Carries e.g. {"upgrade": {"before": {...}}, "backup": {"snapshot_id": "..."}}.

    Returns:
        The decoded JSON object from the file named by $CCCI_OP_STATE_FILE. Returns {} when the
        variable is unset/empty, the file is missing or unreadable, the content is not valid JSON,
        or the top-level JSON value is not an object (callers rely on dict `.get`)."""
    path = os.environ.get("CCCI_OP_STATE_FILE")
    if not path:
        return {}
    try:
        # A missing file surfaces as FileNotFoundError (an OSError), so no separate exists() probe
        # is needed; malformed JSON and undecodable bytes are both ValueError subclasses.
        with open(path, encoding="utf-8") as f:
            state = _json.load(f)
    except (OSError, ValueError):
        return {}
    # Valid JSON that is not an object (list, string, null, ...) would break every caller's `.get`.
    return state if isinstance(state, dict) else {}
def assert_upgraded(domain: str, meta: dict) -> None:
    """Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once via
    `abra app deploy --chaos` of the PR-head checkout. Assert it reconverged + still serves AND that
    the deployment is genuinely the PR-head code under test (HC1) — non-vacuously (guarding F1d-2).

    The chaos deploy stamps `coop-cloud.<stack>.chaos-version` = the deployed recipe commit. When the
    intended PR-head commit is known (head_ref), require the deployed chaos commit to MATCH it — THE
    proof the code under test was deployed, and non-vacuous: a stale prev-checkout chaos redeploy would
    stamp prev's commit, not head_ref, and fail here. When head_ref is unknown, fall back to requiring
    a move vs the pre-upgrade state (version/image/chaos commit changed).

    Args:
        domain: the deployed app's domain.
        meta: recipe meta, passed through to `assert_serving`.

    Raises:
        AssertionError: if the app is not serving, or the deployment is not provably the intended
            code (commit mismatch / no chaos label / no move)."""
    # `or {}` also tolerates an explicit null for "upgrade" in the state file.
    st = op_state().get("upgrade") or {}
    before = st.get("before") or {}
    head_ref = st.get("head_ref")
    assert_serving(domain, meta)
    after = lifecycle.deployed_identity(domain)
    chaos = after.get("chaos")
    if head_ref:
        assert chaos, (
            f"{domain}: upgrade left no chaos label — `abra app deploy --chaos` did not deploy the "
            "PR-head checkout (the code under test was not exercised by the upgrade)"
        )
        # chaos-version is an abbreviated commit (e.g. '8a026066'); head_ref may be full or short.
        # abra appends a working-tree-state marker (e.g. '+U' = untracked file present) to the
        # chaos-version when a cc-ci DEPLOY OVERLAY sits in the recipe checkout as an untracked file
        # (e.g. ghost's compose.ccci-health.yml, provided by install_steps). That marker is NOT part
        # of the commit identity — strip it before the HC1 commit match. HC1 is preserved: the
        # underlying COMMIT must still equal head_ref; a stale prev-checkout chaos redeploy would
        # stamp prev's commit (also '+U' if overlaid) and STILL not match head_ref after stripping.
        chaos_commit = chaos.split("+", 1)[0]
        # An empty commit part (label was only a marker) must fail: every string starts with '',
        # so the prefix match below would otherwise pass vacuously.
        assert chaos_commit and (
            head_ref.startswith(chaos_commit) or chaos_commit.startswith(head_ref)
        ), (
            f"{domain}: upgrade deployed chaos commit {chaos!r}, not the intended PR-head "
            f"{head_ref[:12]!r} — the re-checkout to the code under test failed, so the upgrade is "
            "not exercising the PR's changes (HC1)"
        )
        return
    # Fallback: compare chaos COMMITS with the working-tree marker stripped, for the same reason as
    # above — 'abc' -> 'abc+U' is the same commit and must not count as a move.
    before_commit = (before.get("chaos") or "").split("+", 1)[0]
    after_commit = (chaos or "").split("+", 1)[0]
    moved = (
        (before.get("version") and after.get("version") and before["version"] != after["version"])
        or (before.get("image") and after.get("image") and before["image"] != after["image"])
        or (after_commit and after_commit != before_commit)
    )
    assert moved, (
        f"{domain}: upgrade did not move the deployment "
        f"(version {before.get('version')}->{after.get('version')}, "
        f"image {before.get('image')}->{after.get('image')}, "
        f"chaos {before.get('chaos')}->{chaos}) — "
        "not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)"
    )
_SNAPSHOT_ID_RE = re.compile(r'"snapshot_id"\s*:\s*"([0-9a-f]{8,})"')


def parse_snapshot_id(backup_output: str) -> str | None:
    """Extract the snapshot id from `abra app backup create` output (restic JSON summary line).

    The id IS the backup artifact identity (DG3). It is taken from the create output because
    `abra app backup snapshots` requires a TTY and is awkward to script.

    Returns the first `"snapshot_id"` value found (lowercase hex, 8+ chars), or None if the output
    contains none."""
    found = _SNAPSHOT_ID_RE.search(backup_output)
    if found is None:
        return None
    return found.group(1)
def assert_backup_artifact(domain: str) -> str:
    """Generic BACKUP assertion (post-op); the backup itself was already run once by the orchestrator.

    Reads the snapshot id the orchestrator recorded under `backup.snapshot_id` in the op state and
    asserts it is present (i.e. a snapshot artifact was produced). Returns that id; `domain` is used
    only to label the failure message."""
    backup_state = op_state().get("backup", {})
    snapshot_id = backup_state.get("snapshot_id")
    assert snapshot_id, (
        f"{domain}: backup produced no snapshot artifact "
        "(no snapshot_id in `abra app backup create` output)"
    )
    return snapshot_id
def assert_restore_healthy(domain: str, meta: dict) -> None:
    """Generic RESTORE assertion (post-op); the restore itself was already done by the orchestrator.

    Delegates to `assert_serving`, whose bounded polls let the post-restore reconverge settle before
    the app is judged healthy + serving again."""
    assert_serving(domain, meta)
# ---- Op primitives (orchestrator-only; perform the op once, never assert) --------------------
def perform_upgrade(
    domain: str, recipe: str, head_ref: str | None, deploy_timeout: int = 900, meta: dict | None = None
) -> dict[str, str | None]:
    """Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1): re-checkout the
    PR head (the prev-tag base deploy reset the recipe working tree), then `abra app deploy --chaos`
    to redeploy the running app at that checkout. This is the real upgrade the PR's changes are
    exercised by (vs the old 'upgrade to newest published tag', which never deployed PR-head code).

    Returns the pre-upgrade identity so the orchestrator records it for `assert_upgraded`'s move check
    — after the chaos deploy the `chaos`(-version) label carries the PR-head commit, proving it.

    `deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the chaos redeploy so a heavy stack's
    reconverge isn't SIGKILLed by abra.deploy's 900s default mid-wait.

    F2-12: the chaos redeploy runs with `--no-converge-checks` (abra's own convergence monitor FATAs
    on the heavy lasuite-drive prev→PR-head crossover while the NEW collabora's healthcheck is still
    in its start_period, even though it converges given swarm's healthcheck retries). We then own a
    STRICTER convergence+health wait here: services N/N (wait_healthy) + app HEALTH_PATH healthy +
    any recipe READY_PROBE (collabora WOPI discovery 200). This bounds readiness by OUR generous
    deadline, not abra's impatient one — and is stronger evidence than abra's monitor.

    Args:
        domain: the deployed app's domain.
        recipe: recipe name (its checkout is switched to `head_ref` when one is given).
        head_ref: PR-head commit to check out, or None to redeploy the current checkout.
        deploy_timeout: timeout (seconds) for the chaos redeploy; also the fallback for the
            convergence/ready-probe waits when `meta` has no DEPLOY_TIMEOUT.
        meta: optional recipe meta (HEALTH_OK, HEALTH_PATH, DEPLOY_TIMEOUT, HTTP_TIMEOUT, probes)."""
    meta = meta or {}
    before = lifecycle.deployed_identity(domain)
    if head_ref:
        lifecycle.recipe_checkout_ref(recipe, head_ref)
    # HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned
    # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
    lifecycle.prepull_images(recipe, domain)
    lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
    # Own the convergence verification (abra's monitor was skipped via -c).
    lifecycle.wait_healthy(
        domain,
        ok_codes=tuple(meta.get("HEALTH_OK", (200, 301, 302))),
        path=meta.get("HEALTH_PATH", "/"),
        deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout)),
        http_timeout=int(meta.get("HTTP_TIMEOUT", 300)),
    )
    lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout)))
    after = lifecycle.deployed_identity(domain)
    # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
    # PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.
    # The before/after versions are separated by '->' so the log line stays readable.
    print(
        f"  upgrade→PR-head: head_ref={(head_ref or '')[:8] or None} "
        f"chaos-version={after.get('chaos')} "
        f"version={before.get('version')}->{after.get('version')}",
        flush=True,
    )
    return before
def perform_backup(domain: str) -> str | None:
    """Run the BACKUP op once and hand back the snapshot_id parsed from its output.

    Returns None when the backup output carries no snapshot id; the assertion tier decides what that
    means."""
    backup_output = lifecycle.backup_app(domain)
    return parse_snapshot_id(backup_output)
def perform_restore(domain: str) -> None:
    """Run the RESTORE op once for `domain` (restores the latest snapshot); asserts nothing itself."""
    lifecycle.restore_app(domain)