feat(1d): G0 — generic install + deploy-once orchestrator (DG1 green on hedgedoc)

- harness/generic.py: recipe-agnostic assert_serving (converged + real HTTP, 404-excluded +
  not Traefik 404 body + CA-verified trusted wildcard cert), op helpers, backup_capable detect
- harness/discovery.py: per-op overlay resolution (repo-local > cc-ci > generic), custom + hook
- tests/_generic/: assertion-only tiers (install/upgrade/backup/restore) on the shared deployment
- run_recipe_ci.py: deploy-ONCE orchestrator, per-op summary, deploy-count guard (DG4.1)
- conftest live_app fixture; lifecycle deploy-count + install-steps hook + pin DOMAIN to run domain

DG1 cold-verified green on hedgedoc (pure generic, deploy-count=1, clean teardown). G0 CLAIMED.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-27 23:27:55 +01:00
parent a31095a087
commit ef44d4658b
12 changed files with 599 additions and 106 deletions

View File

@ -0,0 +1,71 @@
"""Overlay / custom-test / install-steps discovery + precedence (Phase 1d, plan §2.5, DG4/DG5).
The generic is the default for each lifecycle op; a recipe's `test_<op>.py` OVERRIDES it. Sources,
in precedence order (machine-docs/DECISIONS.md):
lifecycle op (install/upgrade/backup/restore) — exactly ONE assertion file runs:
repo-local tests/test_<op>.py (upstream-authoritative, wins same-name collisions)
> cc-ci tests/<recipe>/test_<op>.py
> generic tests/_generic/test_<op>.py <- always present; the floor
custom (non-lifecycle) test_*.py — ALL run, additively, from BOTH locations (opt-in).
install-steps hook — install_steps.sh: repo-local > cc-ci, or none.
Repo-local = the recipe repo's own tests/ dir, snapshotted after fetch (it survives abra
re-checking-out the recipe to a version tag — see the run orchestrator).
"""
from __future__ import annotations
import glob
import os
# The lifecycle ops; each one resolves to a single test_<op>.py assertion file.
LIFECYCLE_OPS = ("install", "upgrade", "backup", "restore")
# Three dirname() steps up from this file's absolute path, i.e. the grandparent of the directory
# holding this module — presumably the cc-ci checkout root; confirm against the repo layout.
ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Home of the generic per-op assertion files (tests/_generic/test_<op>.py), the fallback tier.
GENERIC_DIR = os.path.join(ROOT, "tests", "_generic")
def cc_ci_dir(recipe: str) -> str:
    """Return the cc-ci overlay directory for `recipe`, i.e. `<ROOT>/tests/<recipe>`.

    The path is only composed here; nothing checks that the directory exists.
    """
    tests_root = os.path.join(ROOT, "tests")
    return os.path.join(tests_root, recipe)
def resolve_op(recipe: str, op: str, repo_local_dir: str | None) -> tuple[str, str]:
    """Pick the one assertion file that runs for lifecycle op `op`.

    Precedence is repo-local > cc-ci > generic. Returns a `(source, path)` pair whose source is
    one of "repo-local", "cc-ci" or "generic". The two overlay locations are used only when the
    file exists on disk; the generic path is returned as the fallback without an existence check.
    """
    filename = f"test_{op}.py"
    # An empty or missing repo-local dir skips straight to the cc-ci lookup.
    overlay = os.path.join(repo_local_dir, filename) if repo_local_dir else None
    if overlay is not None and os.path.isfile(overlay):
        return ("repo-local", overlay)
    overlay = os.path.join(cc_ci_dir(recipe), filename)
    if not os.path.isfile(overlay):
        return ("generic", os.path.join(GENERIC_DIR, filename))
    return ("cc-ci", overlay)
def custom_tests(recipe: str, repo_local_dir: str | None) -> list[tuple[str, str]]:
    """Collect every non-lifecycle `test_*.py` as `(source, path)` pairs.

    The cc-ci directory for the recipe is scanned first, then the repo-local directory; a location
    that is unset or not a directory contributes nothing. Within a location the paths are sorted.
    Files named after a lifecycle op (`test_<op>.py`) are left out.
    """
    lifecycle_files = {f"test_{op}.py" for op in LIFECYCLE_OPS}
    locations = (("cc-ci", cc_ci_dir(recipe)), ("repo-local", repo_local_dir))
    return [
        (source, path)
        for source, directory in locations
        if directory and os.path.isdir(directory)
        for path in sorted(glob.glob(os.path.join(directory, "test_*.py")))
        if os.path.basename(path) not in lifecycle_files
    ]
def install_steps(recipe: str, repo_local_dir: str | None) -> tuple[str, str] | None:
    """Locate the recipe's `install_steps.sh` hook, preferring repo-local over cc-ci.

    Returns `(source, path)` with source "repo-local" or "cc-ci" for the first location where the
    file exists, or None when neither location has one.
    """
    hook_name = "install_steps.sh"
    if repo_local_dir and os.path.isfile(os.path.join(repo_local_dir, hook_name)):
        return ("repo-local", os.path.join(repo_local_dir, hook_name))
    cc_ci_hook = os.path.join(cc_ci_dir(recipe), hook_name)
    return ("cc-ci", cc_ci_hook) if os.path.isfile(cc_ci_hook) else None

145
runner/harness/generic.py Normal file
View File

@ -0,0 +1,145 @@
"""Generic, recipe-agnostic lifecycle assertions + op helpers (Phase 1d, plan §2.1).
These are THE default for each lifecycle op: when a recipe ships no `test_<op>.py` overlay, the
generic tier (tests/_generic/test_<op>.py) runs these against the single shared deployment the
orchestrator brought up. The lifecycle OPERATIONS (upgrade/backup/restore) live here too — owned by
the shared harness, not copy-pasted per recipe (DG7 DRY) — so overlays are assertions-only and may
reuse these by composition (`from harness import generic; generic.assert_serving(...)`).
Design + precedence: machine-docs/DECISIONS.md (Phase 1d).
"""
from __future__ import annotations
import glob
import os
import re
import socket
import ssl
from . import abra, lifecycle
# A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label.
# Matching is per line and case-insensitive: "backupbot.backup", a word boundary, then "true" as
# a whole word anywhere later on the same line.
# NOTE(review): the boundary after "backup" also accepts longer keys such as
# "backupbot.backup.pre-hook" when "true" appears on that line — confirm that is acceptable.
_BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)
def _recipe_dir(recipe: str) -> str:
    """Return the `~/.abra/recipes/<recipe>` path with `~` expanded to the user's home."""
    checkout = "~/.abra/recipes/{}".format(recipe)
    return os.path.expanduser(checkout)
def backup_capable(recipe: str, meta: dict | None = None) -> bool:
    """Decide whether the backup/restore tiers should run for `recipe` (else a clean N/A skip, DG3).

    An explicit `BACKUP_CAPABLE` key in `meta` wins and is coerced to bool. Otherwise every
    `compose*.yml` in the recipe directory is scanned for a truthy `backupbot.backup` label (the
    Co-op Cloud backup convention); the first match returns True. Files that cannot be opened or
    read are skipped. Returns False when nothing matches.
    """
    if meta and "BACKUP_CAPABLE" in meta:
        return bool(meta["BACKUP_CAPABLE"])
    pattern = os.path.join(_recipe_dir(recipe), "compose*.yml")
    for path in glob.glob(pattern):
        try:
            # Decode explicitly as UTF-8 so detection does not depend on the process locale, and
            # replace undecodable bytes: UnicodeDecodeError is not an OSError, so it would
            # otherwise escape the handler below and abort the whole scan.
            with open(path, encoding="utf-8", errors="replace") as fh:
                text = fh.read()
        except OSError:
            continue
        if _BACKUPBOT_RE.search(text):
            return True
    return False
def served_cert(domain: str, port: int = 443) -> tuple[bool, str]:
    """Do a CA-verified TLS handshake with `domain:port` and report `(verified, detail)`.

    The default SSL context checks the chain against the system CA bundle and checks the hostname,
    so a publicly trusted wildcard cert verifies while Traefik's self-signed default cert (served
    when no router/cert matches the SNI) does not. That makes this a "not the default cert" check
    with no openssl dependency. On success `detail` is `CN=<cn> SAN=<dns names>`; on failure it is
    the reason (verification failure, or any other connection/handshake error).
    """
    tls = ssl.create_default_context()
    try:
        with socket.create_connection((domain, port), timeout=20) as raw:
            with tls.wrap_socket(raw, server_hostname=domain) as secured:
                peer = secured.getpeercert()
    except ssl.SSLCertVerificationError as exc:
        # Must be caught before the broader handler: it is itself an SSLError/OSError.
        return (False, f"cert did not verify (Traefik default/self-signed?): {exc}")
    except (OSError, ssl.SSLError) as exc:
        return (False, f"TLS handshake error: {exc}")

    # First commonName found in the subject, or "" when there is none.
    common_name = ""
    for rdn in peer.get("subject", ()):
        matches = [value for key, value in rdn if key == "commonName"]
        if matches:
            common_name = matches[0]
            break
    dns_names = [value for kind, value in peer.get("subjectAltName", ()) if kind == "DNS"]
    return (True, f"CN={common_name} SAN={dns_names}")
def assert_serving(domain: str, meta: dict) -> None:
    """Assert that the app at `domain` is genuinely serving (the generic DG1 check).

    Checks, in order:
      1. `lifecycle.services_converged(domain)` is truthy;
      2. the status from `lifecycle.http_get` on `meta["HEALTH_PATH"]` is in `meta["HEALTH_OK"]`;
      3. on a 200 only, the body does not contain Traefik's "404 page not found" text;
      4. `served_cert` verified the TLS cert and its detail mentions commoninternet.net.
    Raises AssertionError naming the first check that failed.
    """
    assert lifecycle.services_converged(domain), f"{domain}: not all services converged"
    health_path = meta["HEALTH_PATH"]
    accepted = tuple(meta["HEALTH_OK"])
    code = lifecycle.http_get(domain, health_path)
    assert code in accepted, (
        f"{domain}{health_path}: HTTP {code} not in {accepted} — app not serving "
        "(a Traefik 404 fallback or an unhealthy backend)"
    )
    if code == 200:
        # Only a 200 response is inspected for the fallback page text.
        page = lifecycle.http_body(domain, health_path)
        assert (
            "404 page not found" not in page
        ), f"{domain}{health_path}: served Traefik's default 404 page, not the app"
    cert_ok, cert_detail = served_cert(domain)
    assert cert_ok, f"{domain}: TLS cert is not the trusted wildcard — {cert_detail}"
    assert (
        "commoninternet.net" in cert_detail.lower()
    ), f"{domain}: served cert unexpected — {cert_detail}"
def wait_serving(domain: str, meta: dict) -> None:
    """Block in `lifecycle.wait_healthy` using the recipe's meta, then run `assert_serving`.

    Reads HEALTH_OK, HEALTH_PATH, DEPLOY_TIMEOUT and HTTP_TIMEOUT from `meta`; a missing key
    raises KeyError.
    """
    health_kwargs = {
        "ok_codes": tuple(meta["HEALTH_OK"]),
        "path": meta["HEALTH_PATH"],
        "deploy_timeout": meta["DEPLOY_TIMEOUT"],
        "http_timeout": meta["HTTP_TIMEOUT"],
    }
    lifecycle.wait_healthy(domain, **health_kwargs)
    assert_serving(domain, meta)
def do_upgrade(domain: str, target: str | None, meta: dict) -> None:
    """UPGRADE op, in place on the shared deployment.

    Calls `lifecycle.upgrade_app` with `target` as the version, then waits until the app is
    serving again via `wait_serving`.
    """
    lifecycle.upgrade_app(domain, version=target)
    wait_serving(domain, meta)
def snapshots(domain: str) -> list[str]:
    """List the snapshot ids backup-bot-two holds for this app (the backup 'artifact', DG3).

    Runs `abra app backup snapshots <domain> -n -o` through `abra._run` with check=False —
    presumably so a failing command yields an empty list rather than raising; confirm against
    `abra._run`. Each stdout line is stripped, and those that start with an 8-character
    lowercase-hex short id (how restic snapshot rows begin) contribute that id, in output order.
    """
    result = abra._run(["app", "backup", "snapshots", domain, "-n", "-o"], check=False)
    short_id = re.compile(r"^([0-9a-f]{8})\b")
    hits = (short_id.match(row.strip()) for row in result.stdout.splitlines())
    return [hit.group(1) for hit in hits if hit]
def do_backup(domain: str) -> list[str]:
    """BACKUP op: take a backup, then require that at least one snapshot is listed.

    Returns the snapshot ids reported by `snapshots`; raises AssertionError when there are none.
    """
    lifecycle.backup_app(domain)
    found = snapshots(domain)
    assert (
        found
    ), f"{domain}: backup produced no snapshot artifact (abra app backup snapshots empty)"
    return found
def do_restore(domain: str, meta: dict) -> None:
    """RESTORE op: run `lifecycle.restore_app`, then wait until the app is serving again."""
    lifecycle.restore_app(domain)
    wait_serving(domain, meta)

View File

@ -89,17 +89,63 @@ def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]:
return {str(k): str(v) for k, v in (ee or {}).items()}
def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
def _record_deploy() -> None:
    """Bump the per-run deploy counter (DG4.1: one deploy per run).

    The counter lives in the file named by CCCI_DEPLOY_COUNT_FILE; when that variable is unset or
    empty this does nothing, so standalone/manual use is unaffected. A missing, unreadable or
    non-numeric file counts as 0, and a failed write is ignored (best effort).
    """
    counter_file = os.environ.get("CCCI_DEPLOY_COUNT_FILE")
    if not counter_file:
        return
    try:
        with open(counter_file) as fh:
            count = int(fh.read().strip() or "0")
    except (OSError, ValueError):
        count = 0
    try:
        with open(counter_file, "w") as fh:
            fh.write(str(count + 1))
    except OSError:
        pass
def _run_install_steps(hook: tuple[str, str], recipe: str, domain: str) -> None:
    """Run a recipe's custom install-steps hook (install_steps.sh) with bash (Phase 1d DG5).

    `hook` is a `(source, path)` pair. The script inherits the current environment plus
    CCCI_APP_DOMAIN, CCCI_RECIPE and CCCI_APP_ENV (the app's .env path under
    ~/.abra/servers/default/), so it can insert secrets / set env / seed before the app comes up.
    A non-zero exit raises subprocess.CalledProcessError.
    """
    origin, script = hook
    hook_env = os.environ.copy()
    hook_env.update(
        CCCI_APP_DOMAIN=domain,
        CCCI_RECIPE=recipe,
        CCCI_APP_ENV=os.path.expanduser(f"~/.abra/servers/default/{domain}.env"),
    )
    print(f" install-steps hook ({origin}): {script}", flush=True)
    subprocess.run(["bash", script], check=True, env=hook_env)
def deploy_app(
    recipe: str,
    domain: str,
    version: str | None = None,
    secrets: bool = True,
    install_steps_hook: tuple[str, str] | None = None,
) -> None:
    """Create + configure + deploy an app.

    Forces LETS_ENCRYPT_ENV='' so traefik serves the wildcard cert via the file provider and NEVER
    attempts ACME (adversary finding A1). Applies any per-recipe EXTRA_ENV (recipe_meta.py) and the
    custom install-steps hook (Phase 1d) before deploy.

    Args:
        recipe: recipe name handed to `abra app new`.
        domain: the run domain; also written into the app's DOMAIN env var.
        version: recipe version for `abra app new`, or None.
        secrets: forwarded to `abra app new`; when true, secrets are also generated before deploy.
        install_steps_hook: optional `(source, path)` pair for an install_steps.sh script, run
            after env + secrets are in place and before the deploy.
    """
    # Count the deploy up front, before any abra call is made.
    _record_deploy()
    abra.app_config_remove(domain)  # clear any stale .env from a prior crashed run
    abra.app_new(recipe, domain, version=version, secrets=secrets)
    # Pin DOMAIN to the run domain explicitly. `abra app new -D` fills it for recipes whose
    # .env.sample uses a literal placeholder, but NOT for ones using a `{{ .Domain }}` Go-template
    # (this abra version leaves it unexpanded → deploy fails "can't evaluate field Domain"). Setting
    # it ourselves is recipe-agnostic and canonical (the run domain IS the app's domain).
    abra.env_set(domain, "DOMAIN", domain)
    abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
    for k, v in _recipe_extra_env(recipe, domain).items():
        abra.env_set(domain, k, v)
    if secrets:
        abra.secret_generate(domain)
    if install_steps_hook:
        _run_install_steps(install_steps_hook, recipe, domain)
    abra.deploy(domain)