fix(1d): G1 backup/restore + F1d-1 cert-check reframe

- backup artifact: read snapshot_id from 'abra app backup create' output (snapshots needs a TTY);
  generic.parse_snapshot_id + do_backup assert it
- restore serving race: lifecycle.http_fetch (one request -> status+body, never raises) +
  assert_serving is now a bounded poll (settles a post-op reconverge, no bare sleep); drop wait_serving
- F1d-1 (Adversary, low): reframe served_cert/assert_serving honestly as an INFRA TLS sanity check
  (catches a lapsed/mis-rotated wildcard cert), NOT app-vs-fallback (Traefik serves the wildcard
  zone-wide); the genuine serving proof is services_converged + non-404 status. Awaiting re-test.

DG1 Adversary PASS @ef44d46. G1 full-lifecycle re-verification in flight.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-27 23:39:45 +01:00
parent a8f78b8673
commit 6c5d8f28ea
5 changed files with 144 additions and 76 deletions

View File

@ -127,11 +127,13 @@ def upgrade(domain: str, version: str | None = None, timeout: int = 900) -> None
_run(args, timeout=timeout)
def backup_create(domain: str, timeout: int = 900) -> None:
def backup_create(domain: str, timeout: int = 900) -> str:
# -C -o: use the current recipe checkout, no remote fetch — like every other recipe-touching
# call (DECISIONS.md). Without -o, abra tries to fetch recipe tags from the (possibly private)
# remote and fails "authentication required: Unauthorized".
_run_pty(["app", "backup", "create", domain, "-n", "-C", "-o"], timeout=timeout)
# remote and fails "authentication required: Unauthorized". Returns the captured output, whose
# restic JSON summary line carries the produced "snapshot_id" (the backup artifact, DG3) — note
# `abra app backup snapshots` needs a TTY and is awkward to script, so we read the create output.
return _run_pty(["app", "backup", "create", domain, "-n", "-C", "-o"], timeout=timeout).stdout
def restore(domain: str, timeout: int = 900) -> None:

View File

@ -16,8 +16,9 @@ import os
import re
import socket
import ssl
import time
from . import abra, lifecycle
from . import lifecycle
# A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label.
_BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)
@ -46,11 +47,15 @@ def backup_capable(recipe: str, meta: dict | None = None) -> bool:
def served_cert(domain: str, port: int = 443) -> tuple[bool, str]:
"""CA-verified TLS handshake to `domain` (via the gateway passthrough to cc-ci's Traefik).
Returns (verified, detail). The pre-issued wildcard is a publicly-trusted Let's Encrypt cert, so
a real serve VERIFIES against the system CA bundle and matches the hostname; Traefik's self-signed
DEFAULT cert (served only when no router/cert matches the SNI) FAILS verification — so this is a
genuine 'not the default cert' assertion with no openssl dependency. detail carries CN+SAN on
success, or the failure reason."""
Returns (verified, detail) with CN+SAN on success, or the failure reason.
Scope (per Adversary finding F1d-1): this is an INFRA TLS sanity check — it proves the served
wildcard cert is publicly trusted, unexpired, and hostname-valid (so it would fail if the
operator's LE wildcard lapsed/was mis-rotated — a real concern, plan §4.0 renewal). It does NOT
distinguish a routed app from an un-routed host: Traefik's file provider serves the wildcard for
the WHOLE `*.ci.commoninternet.net` zone, so any in-zone subdomain verifies whether or not an app
is deployed. The app-vs-Traefik-fallback proof is `services_converged` + a non-404 status in
`assert_serving`, not this."""
ctx = ssl.create_default_context() # verifies chain against system CAs + checks hostname
try:
with (
@ -71,75 +76,79 @@ def served_cert(domain: str, port: int = 443) -> tuple[bool, str]:
def assert_serving(domain: str, meta: dict) -> None:
"""The single generic "is the app really serving?" assertion (DG1). Proves, end-to-end:
1. every service in the stack converged (the app's own containers, not just Traefik);
2. a real HTTP(S) response over the run domain with a status in HEALTH_OK — which EXCLUDES
404, so a Traefik unmatched-router fallback fails here;
3. the body is not Traefik's default 404 page;
4. the served TLS cert is the wildcard, not Traefik's default cert.
No bare sleeps, no health-only shortcut."""
assert lifecycle.services_converged(domain), f"{domain}: not all services converged"
"""The single generic "is the app really serving?" assertion (DG1).
The app-vs-Traefik-fallback proof is steps 1+2 (both load-bearing, verified by the Adversary):
1. every service in the stack converged (the app's OWN containers are N/N — an un-routed host
has no app service, so this is False for a non-deployment);
2. a real HTTP(S) response with a status in HEALTH_OK — which EXCLUDES 404, so a Traefik
unmatched-router fallback (404) fails, as does a routed-but-dead backend (502/503);
3. the body (from the SAME request as the status — no race) is not Traefik's default 404 page;
4. an INFRA TLS sanity check (served_cert): the served wildcard cert is trusted+unexpired. This
does NOT distinguish the app from an un-routed host (Traefik serves the wildcard zone-wide,
F1d-1) — it only catches a lapsed/mis-rotated cert.
Steps 1 and 2 are BOUNDED POLLS (no bare sleep), so a state-mutating op (upgrade/restore) that leaves
the app briefly reconverging settles, while a persistent failure still fails within the timeout."""
deadline = time.time() + meta["DEPLOY_TIMEOUT"]
while time.time() < deadline and not lifecycle.services_converged(domain):
time.sleep(5)
assert lifecycle.services_converged(domain), f"{domain}: services did not converge"
path = meta["HEALTH_PATH"]
ok = tuple(meta["HEALTH_OK"])
status = lifecycle.http_get(domain, path)
assert status in ok, (
f"{domain}{path}: HTTP {status} not in {ok} — app not serving "
"(a Traefik 404 fallback or an unhealthy backend)"
deadline = time.time() + meta["HTTP_TIMEOUT"]
served = False
status, body = 0, ""
while time.time() < deadline:
status, body = lifecycle.http_fetch(domain, path)
if status in ok and not (status == 200 and "404 page not found" in body):
served = True
break
time.sleep(5)
assert served, (
f"{domain}{path}: not serving — last HTTP {status} (Traefik 404 fallback, "
"unhealthy backend, or default-404 body)"
)
if status == 200:
body = lifecycle.http_body(domain, path)
assert (
"404 page not found" not in body
), f"{domain}{path}: served Traefik's default 404 page, not the app"
# Infra TLS sanity only (F1d-1): catches a lapsed/mis-rotated wildcard cert; does NOT prove the
# app is routed (Traefik serves the wildcard zone-wide). The serving proof is steps 1 and 2 above.
verified, detail = served_cert(domain)
assert verified, f"{domain}: TLS cert is not the trusted wildcard — {detail}"
assert verified, f"{domain}: served wildcard cert is not trusted/valid — {detail}"
assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}"
def wait_serving(domain: str, meta: dict) -> None:
"""Wait for converged + healthy (per recipe_meta timeouts), then run the full serving assertion."""
lifecycle.wait_healthy(
domain,
ok_codes=tuple(meta["HEALTH_OK"]),
path=meta["HEALTH_PATH"],
deploy_timeout=meta["DEPLOY_TIMEOUT"],
http_timeout=meta["HTTP_TIMEOUT"],
)
def do_upgrade(domain: str, target: str | None, meta: dict) -> None:
    """UPGRADE op, performed in place on the shared deployment.

    Asks lifecycle.upgrade_app to move `domain` to `target` (passed through as `version`; None
    is forwarded unchanged), then runs assert_serving on the same domain so the upgrade only
    counts as done once the app is confirmed to be serving again.
    """
    lifecycle.upgrade_app(domain, version=target)
    # Serving check comes strictly after the upgrade call; it raises AssertionError on failure.
    assert_serving(domain, meta)
def do_upgrade(domain: str, target: str | None, meta: dict) -> None:
"""UPGRADE op (in place on the shared deployment): abra app upgrade -> target, then wait serving."""
lifecycle.upgrade_app(domain, version=target)
wait_serving(domain, meta)
_SNAPSHOT_ID_RE = re.compile(r'"snapshot_id"\s*:\s*"([0-9a-f]{8,})"')
def snapshots(domain: str) -> list[str]:
"""Snapshot ids backup-bot-two holds for this app (the backup 'artifact', DG3)."""
proc = abra._run(["app", "backup", "snapshots", domain, "-n", "-o"], check=False)
ids = []
for ln in proc.stdout.splitlines():
# restic snapshot rows start with an 8-hex short id
m = re.match(r"^([0-9a-f]{8})\b", ln.strip())
if m:
ids.append(m.group(1))
return ids
def parse_snapshot_id(backup_output: str) -> str | None:
    """Extract the snapshot id from the text printed by `abra app backup create`.

    The id is taken from the first `"snapshot_id": "<hex>"` occurrence matched by
    _SNAPSHOT_ID_RE. It is read from the create output (rather than from
    `abra app backup snapshots`) because that command requires a TTY and is awkward to script.

    Returns the captured id, or None when the output contains no such field.
    """
    found = _SNAPSHOT_ID_RE.search(backup_output)
    if found is None:
        return None
    return found.group(1)
def do_backup(domain: str) -> list[str]:
"""BACKUP op: create a snapshot, then assert an artifact now exists (returns snapshot ids)."""
lifecycle.backup_app(domain)
snaps = snapshots(domain)
assert (
snaps
), f"{domain}: backup produced no snapshot artifact (abra app backup snapshots empty)"
return snaps
def do_backup(domain: str) -> str:
    """BACKUP op: run a backup for `domain` and return the id of the snapshot it produced.

    The text returned by lifecycle.backup_app is handed to parse_snapshot_id; a missing or
    empty id is treated as "no artifact was produced" and fails the assertion.

    Raises:
        AssertionError: no snapshot_id could be found in the backup output.
    """
    snapshot_id = parse_snapshot_id(lifecycle.backup_app(domain))
    assert snapshot_id, (
        f"{domain}: backup produced no snapshot artifact "
        "(no snapshot_id in `abra app backup create` output)"
    )
    return snapshot_id
def do_restore(domain: str, meta: dict) -> None:
"""RESTORE op: restore the latest snapshot, then assert the app is healthy + serving again."""
"""RESTORE op: restore the latest snapshot, then assert the app is healthy + serving again
(assert_serving polls, so the post-restore reconverge settles)."""
lifecycle.restore_app(domain)
wait_serving(domain, meta)
assert_serving(domain, meta)

View File

@ -190,6 +190,27 @@ def http_get(domain: str, path: str = "/", timeout: int = 15) -> int:
return 0
def http_fetch(domain: str, path: str = "/", timeout: int = 15) -> tuple[int, str]:
    """One HTTPS GET → (status, body) in a SINGLE request, never raising.

    Lets a caller check the status and body together with no race between two requests
    (assert_serving) — and captures the error body on a 4xx/5xx instead of throwing.

    Certificate and hostname verification are deliberately switched off in this function, so
    the result reflects only the HTTP exchange, whatever certificate was presented.

    Args:
        domain: host (optionally host:port) placed after ``https://``.
        path: request path appended verbatim to the domain.
        timeout: seconds passed to ``urlopen``.

    Returns:
        ``(status, body)`` for a normal response, with undecodable bytes replaced;
        ``(code, body)`` for an ``HTTPError`` (body is ``""`` if it cannot be read);
        ``(0, "")`` for any other failure (connection refused, timeout, TLS error, ...).
    """
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    try:
        # Built inside the try so a malformed URL also yields (0, "") rather than raising.
        req = urllib.request.Request(f"https://{domain}{path}", method="GET")
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.status, resp.read().decode(errors="replace")
    except urllib.error.HTTPError as e:
        # The HTTPError *is* the response object: read it under `with` so the underlying
        # connection is closed now instead of whenever the exception gets garbage-collected.
        try:
            with e:
                body = e.read().decode(errors="replace")
        except Exception:  # noqa: BLE001
            body = ""
        return e.code, body
    except Exception:  # noqa: BLE001
        # Deliberate best-effort: callers poll on the (0, "") sentinel.
        return 0, ""
def wait_healthy(
domain: str,
ok_codes=(200, 301, 302),
@ -221,8 +242,9 @@ def upgrade_app(domain: str, version: str | None = None) -> None:
abra.upgrade(domain, version=version)
def backup_app(domain: str) -> None:
abra.backup_create(domain)
def backup_app(domain: str) -> str:
    """Run a backup for `domain` via abra.backup_create and hand back what it returns.

    The returned text is the abra/restic output, which carries the produced snapshot_id.
    """
    backup_output = abra.backup_create(domain)
    return backup_output
def restore_app(domain: str) -> None: