diff --git a/machine-docs/JOURNAL-1d.md b/machine-docs/JOURNAL-1d.md index 2637de1..07b4c58 100644 --- a/machine-docs/JOURNAL-1d.md +++ b/machine-docs/JOURNAL-1d.md @@ -66,3 +66,36 @@ tests/_generic/test_install.py::test_serving PASSED $ docker stack ls | grep hedg -> (none — clean teardown) ``` Lint+format clean (`ruff check`/`ruff format --check` via `nix develop .#lint`). Claiming the G0 gate. + +## 2026-05-27 — G0/DG1 PASS; F1d-1 fixed; G1 backup+restore fixes + +**Adversary verdict: DG1 PASS @2026-05-27** (cold, own clone @ef44d46). G0 cleared. + +**Correcting an overstatement (Adversary finding F1d-1, valid):** my earlier G0 wording claimed the +CA-verified cert check distinguishes "the app vs a Traefik default-cert fallback." It does NOT — +Traefik's file provider serves the pre-issued **wildcard** for the WHOLE `*.ci.commoninternet.net` +zone, so ANY in-zone subdomain (even a non-deployed one) verifies; the self-signed default cert is +never served in-zone. The genuine app-vs-fallback proof is `services_converged` (the app's OWN +service replicas N/N) + a non-404 status in HEALTH_OK (Traefik's unmatched-router fallback = 404). +Fix applied (no code behavior change to the load-bearing checks; honesty/scope only): +- `generic.served_cert` + `assert_serving` docstrings/comments reframed: the cert check is an INFRA + TLS sanity check (catches a lapsed/mis-rotated wildcard cert — plan §4.0 renewal), explicitly NOT + an app-vs-fallback check. Kept because it CAN fail (cert expiry/untrust), unlike the old + openssl-missing no-op it replaced. +- Assertion message reworded ("served wildcard cert is not trusted/valid", not "...not the default"). +Noted for the Adversary to re-test + close F1d-1 (theirs to tick). + +**G1 — DG2 (upgrade) + DG3 (backup/restore) on hedgedoc (backup-capable, ≥2 tags 3.0.9→3.0.10):** +Two real bugs found+fixed via live runs: +1. 
*backup artifact check.* `abra app backup snapshots` needs a TTY (`FATA the input device is not a + TTY`), but `abra app backup create` already emits the restic JSON summary with the produced + `"snapshot_id"` (rc 0, "backup finished"). Verified raw on a live custom-html: + `"snapshot_id": "d85bf492…"`. Fix: `backup_create` returns its output; `generic.parse_snapshot_id` + regex-extracts the id; `do_backup` asserts it. (Dropped the TTY-bound `snapshots` listing.) +2. *restore serving race.* `assert_serving` made TWO requests (http_get then http_body); post-restore + the app flapped between them → `http_body` raised an unhandled `HTTPError 404`. Fix: new + `lifecycle.http_fetch` returns (status, body) in ONE request, never raising; `assert_serving` now + BOUNDED-POLLS converged + serving (status+body from one request) so a post-op reconverge settles + while a persistent failure still fails within HTTP_TIMEOUT (no bare sleep). `do_upgrade`/`do_restore` + call it (dropped the redundant `wait_serving`). +Re-running full hedgedoc install→upgrade→backup→restore to confirm all-green before claiming G1. diff --git a/machine-docs/STATUS-1d.md b/machine-docs/STATUS-1d.md index af5fb76..60ebd01 100644 --- a/machine-docs/STATUS-1d.md +++ b/machine-docs/STATUS-1d.md @@ -12,8 +12,9 @@ every recipe gets a generic lifecycle suite for free; recipe-specific tests laye per-recipe overlay authoring is Phase 2. ## Definition of Done (Phase 1d) — DG1–DG8, each Adversary cold-verified in REVIEW-1d -- [ ] **DG1** — Generic INSTALL test (recipe-agnostic): app new→deploy→converged→really serving +- [x] **DG1** — Generic INSTALL test (recipe-agnostic): app new→deploy→converged→really serving (real HTTP(S), not Traefik fallback). Green on a simple recipe with no cc-ci/repo-local tests. + **Adversary PASS @2026-05-27** (cold, hedgedoc, deploy-count=1, clean teardown). - [ ] **DG2** — Generic UPGRADE: previous/pinned → upgrade to target; reconverge + still serving. 
- [ ] **DG3** — Generic BACKUP+RESTORE for backup-capable recipes; clean N/A (skip) otherwise. - [ ] **DG4** — Layering (override-or-extend; generic is the default); discovery + cc-ci/repo-local @@ -34,20 +35,21 @@ per-recipe overlay authoring is Phase 2. - **G4** — `!testme` e2e + per-op reporting + docs + cold verify. *Accept: DG6, DG7, DG8 → DONE.* ## In flight -**G1 — generic upgrade + backup/restore (next).** G0 code is in place and DG1 is green; while the -Adversary verifies G0, I'll build/prove the generic upgrade tier (previous→target in place) and the -backup/restore tiers gated on backup-capability (hedgedoc & custom-html are both backup-capable). +**G1 — generic upgrade + backup/restore.** Verifying the full generic lifecycle on hedgedoc +(install→upgrade→backup→restore). DG2 (upgrade) already green; fixed two real bugs (backup artifact +read from `abra app backup create`'s snapshot_id since `snapshots` needs a TTY; restore serving race +→ single-request `http_fetch` + bounded-poll `assert_serving`). Re-running to confirm all-green, then +claim G1. + +**F1d-1 (Adversary, low/DG7) — FIXED in code, awaiting Adversary re-test+close.** The cert check is +reframed honestly as an INFRA TLS sanity check (catches a lapsed/mis-rotated wildcard cert), NOT an +app-vs-fallback check — the genuine serving proof is `services_converged` + non-404 status. See +JOURNAL-1d + generic.py docstrings. ## Gate -**Gate: G0 CLAIMED, awaiting Adversary (DG1).** Generic INSTALL tier is green on **hedgedoc** — -a simple recipe with NO cc-ci/repo-local tests (pure generic), asserting it ACTUALLY serves (services -converged + real HTTP in HEALTH_OK [404 excluded] + not Traefik's 404 body + a CA-verified trusted -wildcard cert, not the default), with **deploy-count = 1** (DG4.1 one-deploy) and clean teardown -(no residual stack). Evidence in JOURNAL-1d (commands + output). 
custom-html-tiny was rejected as the -demo recipe: it's a static-web-server with an empty content volume → genuinely 404 zero-config. - -To reproduce (cold): on cc-ci, `cd /root/cc-ci && RECIPE=hedgedoc STAGES=install HOME=/root \ -CCCI_JANITOR_MAX_AGE=0 cc-ci-run runner/run_recipe_ci.py` → install: pass, deploy-count=1. +**G0/DG1 — Adversary PASS @2026-05-27.** Cleared past G0. Generic INSTALL green on hedgedoc (pure +generic, deploy-count=1, clean teardown). Next gate: G1 (DG2+DG3), claimed once the hedgedoc full +lifecycle is confirmed all-green. Design (DECISIONS.md Phase 1d): tier model with the lifecycle OP owned by the shared harness (test files = assertions only); override precedence repo-local > cc-ci > generic + extend-by-composition; diff --git a/runner/harness/abra.py b/runner/harness/abra.py index 066cd28..19c24fa 100644 --- a/runner/harness/abra.py +++ b/runner/harness/abra.py @@ -127,11 +127,13 @@ def upgrade(domain: str, version: str | None = None, timeout: int = 900) -> None _run(args, timeout=timeout) -def backup_create(domain: str, timeout: int = 900) -> None: +def backup_create(domain: str, timeout: int = 900) -> str: # -C -o: use the current recipe checkout, no remote fetch — like every other recipe-touching # call (DECISIONS.md). Without -o, abra tries to fetch recipe tags from the (possibly private) - # remote and fails "authentication required: Unauthorized". - _run_pty(["app", "backup", "create", domain, "-n", "-C", "-o"], timeout=timeout) + # remote and fails "authentication required: Unauthorized". Returns the captured output, whose + # restic JSON summary line carries the produced "snapshot_id" (the backup artifact, DG3) — note + # `abra app backup snapshots` needs a TTY and is awkward to script, so we read the create output. 
+ return _run_pty(["app", "backup", "create", domain, "-n", "-C", "-o"], timeout=timeout).stdout def restore(domain: str, timeout: int = 900) -> None: diff --git a/runner/harness/generic.py b/runner/harness/generic.py index 19551a4..2514f3f 100644 --- a/runner/harness/generic.py +++ b/runner/harness/generic.py @@ -16,8 +16,9 @@ import os import re import socket import ssl +import time -from . import abra, lifecycle +from . import lifecycle # A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label. _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE) @@ -46,11 +47,15 @@ def backup_capable(recipe: str, meta: dict | None = None) -> bool: def served_cert(domain: str, port: int = 443) -> tuple[bool, str]: """CA-verified TLS handshake to `domain` (via the gateway passthrough to cc-ci's Traefik). - Returns (verified, detail). The pre-issued wildcard is a publicly-trusted Let's Encrypt cert, so - a real serve VERIFIES against the system CA bundle and matches the hostname; Traefik's self-signed - DEFAULT cert (served only when no router/cert matches the SNI) FAILS verification — so this is a - genuine 'not the default cert' assertion with no openssl dependency. detail carries CN+SAN on - success, or the failure reason.""" + Returns (verified, detail) with CN+SAN on success, or the failure reason. + + Scope (per Adversary finding F1d-1): this is an INFRA TLS sanity check — it proves the served + wildcard cert is publicly trusted, unexpired, and hostname-valid (so it would fail if the + operator's LE wildcard lapsed/was mis-rotated — a real concern, plan §4.0 renewal). It does NOT + distinguish a routed app from an un-routed host: Traefik's file provider serves the wildcard for + the WHOLE `*.ci.commoninternet.net` zone, so any in-zone subdomain verifies whether or not an app + is deployed. 
The app-vs-Traefik-fallback proof is `services_converged` + a non-404 status in + `assert_serving`, not this.""" ctx = ssl.create_default_context() # verifies chain against system CAs + checks hostname try: with ( @@ -71,75 +76,79 @@ def served_cert(domain: str, port: int = 443) -> tuple[bool, str]: def assert_serving(domain: str, meta: dict) -> None: - """The single generic "is the app really serving?" assertion (DG1). Proves, end-to-end: - 1. every service in the stack converged (the app's own containers, not just Traefik); - 2. a real HTTP(S) response over the run domain with a status in HEALTH_OK — which EXCLUDES - 404, so a Traefik unmatched-router fallback fails here; - 3. the body is not Traefik's default 404 page; - 4. the served TLS cert is the wildcard, not Traefik's default cert. - No bare sleeps, no health-only shortcut.""" - assert lifecycle.services_converged(domain), f"{domain}: not all services converged" + """The single generic "is the app really serving?" assertion (DG1). + + The app-vs-Traefik-fallback proof is steps 1+2 (both load-bearing, verified by the Adversary): + 1. every service in the stack converged (the app's OWN containers are N/N — an un-routed host + has no app service, so this is False for a non-deployment); + 2. a real HTTP(S) response with a status in HEALTH_OK — which EXCLUDES 404, so a Traefik + unmatched-router fallback (404) fails, as does a routed-but-dead backend (502/503); + 3. the body (from the SAME request as the status — no race) is not Traefik's default 404 page; + 4. an INFRA TLS sanity check (served_cert): the served wildcard cert is trusted+unexpired. This + does NOT distinguish the app from an un-routed host (Traefik serves the wildcard zone-wide, + F1d-1) — it only catches a lapsed/mis-rotated cert. 
+ + Steps 1–2 are BOUNDED POLLS (no bare sleep), so a state-mutating op (upgrade/restore) that leaves + the app briefly reconverging settles, while a persistent failure still fails within the timeout.""" + deadline = time.time() + meta["DEPLOY_TIMEOUT"] + while time.time() < deadline and not lifecycle.services_converged(domain): + time.sleep(5) + assert lifecycle.services_converged(domain), f"{domain}: services did not converge" path = meta["HEALTH_PATH"] ok = tuple(meta["HEALTH_OK"]) - status = lifecycle.http_get(domain, path) - assert status in ok, ( - f"{domain}{path}: HTTP {status} not in {ok} — app not serving " - "(a Traefik 404 fallback or an unhealthy backend)" + deadline = time.time() + meta["HTTP_TIMEOUT"] + served = False + status, body = 0, "" + while time.time() < deadline: + status, body = lifecycle.http_fetch(domain, path) + if status in ok and not (status == 200 and "404 page not found" in body): + served = True + break + time.sleep(5) + assert served, ( + f"{domain}{path}: not serving — last HTTP {status} (Traefik 404 fallback, " + "unhealthy backend, or default-404 body)" ) - if status == 200: - body = lifecycle.http_body(domain, path) - assert ( - "404 page not found" not in body - ), f"{domain}{path}: served Traefik's default 404 page, not the app" - + # Infra TLS sanity only (F1d-1): catches a lapsed/mis-rotated wildcard cert; does NOT prove the + # app is routed (Traefik serves the wildcard zone-wide). The serving proof is steps 1–2 above. 
verified, detail = served_cert(domain) - assert verified, f"{domain}: TLS cert is not the trusted wildcard — {detail}" + assert verified, f"{domain}: served wildcard cert is not trusted/valid — {detail}" assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}" -def wait_serving(domain: str, meta: dict) -> None: - """Wait for converged + healthy (per recipe_meta timeouts), then run the full serving assertion.""" - lifecycle.wait_healthy( - domain, - ok_codes=tuple(meta["HEALTH_OK"]), - path=meta["HEALTH_PATH"], - deploy_timeout=meta["DEPLOY_TIMEOUT"], - http_timeout=meta["HTTP_TIMEOUT"], - ) +def do_upgrade(domain: str, target: str | None, meta: dict) -> None: + """UPGRADE op (in place on the shared deployment): abra app upgrade -> target, then assert it + reconverges + still serves (assert_serving polls, so the rolling upgrade settles).""" + lifecycle.upgrade_app(domain, version=target) assert_serving(domain, meta) -def do_upgrade(domain: str, target: str | None, meta: dict) -> None: - """UPGRADE op (in place on the shared deployment): abra app upgrade -> target, then wait serving.""" - lifecycle.upgrade_app(domain, version=target) - wait_serving(domain, meta) +_SNAPSHOT_ID_RE = re.compile(r'"snapshot_id"\s*:\s*"([0-9a-f]{8,})"') -def snapshots(domain: str) -> list[str]: - """Snapshot ids backup-bot-two holds for this app (the backup 'artifact', DG3).""" - proc = abra._run(["app", "backup", "snapshots", domain, "-n", "-o"], check=False) - ids = [] - for ln in proc.stdout.splitlines(): - # restic snapshot rows start with an 8-hex short id - m = re.match(r"^([0-9a-f]{8})\b", ln.strip()) - if m: - ids.append(m.group(1)) - return ids +def parse_snapshot_id(backup_output: str) -> str | None: + """The snapshot id from `abra app backup create` output (restic JSON summary line). 
This IS the + backup artifact identity (DG3) — read from the create output because `abra app backup snapshots` + requires a TTY and is awkward to script.""" + m = _SNAPSHOT_ID_RE.search(backup_output) + return m.group(1) if m else None -def do_backup(domain: str) -> list[str]: - """BACKUP op: create a snapshot, then assert an artifact now exists (returns snapshot ids).""" - lifecycle.backup_app(domain) - snaps = snapshots(domain) - assert ( - snaps - ), f"{domain}: backup produced no snapshot artifact (abra app backup snapshots empty)" - return snaps +def do_backup(domain: str) -> str: + """BACKUP op: create a backup, then assert a snapshot artifact was produced (returns its id).""" + out = lifecycle.backup_app(domain) + snap_id = parse_snapshot_id(out) + assert snap_id, ( + f"{domain}: backup produced no snapshot artifact " + "(no snapshot_id in `abra app backup create` output)" + ) + return snap_id def do_restore(domain: str, meta: dict) -> None: - """RESTORE op: restore the latest snapshot, then assert the app is healthy + serving again.""" + """RESTORE op: restore the latest snapshot, then assert the app is healthy + serving again + (assert_serving polls, so the post-restore reconverge settles).""" lifecycle.restore_app(domain) - wait_serving(domain, meta) + assert_serving(domain, meta) diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index bf1c4d2..52ada9e 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -190,6 +190,27 @@ def http_get(domain: str, path: str = "/", timeout: int = 15) -> int: return 0 +def http_fetch(domain: str, path: str = "/", timeout: int = 15) -> tuple[int, str]: + """One HTTPS GET → (status, body) in a SINGLE request, never raising. 
Lets a caller check the + status and body together with no race between two requests (assert_serving) — and captures the + error body on a 4xx/5xx instead of throwing.""" + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + req = urllib.request.Request(f"https://{domain}{path}", method="GET") + try: + with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp: + return resp.status, resp.read().decode(errors="replace") + except urllib.error.HTTPError as e: + try: + body = e.read().decode(errors="replace") + except Exception: # noqa: BLE001 + body = "" + return e.code, body + except Exception: # noqa: BLE001 + return 0, "" + + def wait_healthy( domain: str, ok_codes=(200, 301, 302), @@ -221,8 +242,9 @@ def upgrade_app(domain: str, version: str | None = None) -> None: abra.upgrade(domain, version=version) -def backup_app(domain: str) -> None: - abra.backup_create(domain) +def backup_app(domain: str) -> str: + """Create a backup; return the abra/restic output (carries the produced snapshot_id).""" + return abra.backup_create(domain) def restore_app(domain: str) -> None: