feat(1d): G0 — generic install + deploy-once orchestrator (DG1 green on hedgedoc)
- harness/generic.py: recipe-agnostic assert_serving (converged + real HTTP, 404-excluded + not Traefik 404 body + CA-verified trusted wildcard cert), op helpers, backup_capable detect
- harness/discovery.py: per-op overlay resolution (repo-local > cc-ci > generic), custom + hook
- tests/_generic/: assertion-only tiers (install/upgrade/backup/restore) on the shared deployment
- run_recipe_ci.py: deploy-ONCE orchestrator, per-op summary, deploy-count guard (DG4.1)
- conftest live_app fixture; lifecycle deploy-count + install-steps hook + pin DOMAIN to run domain

DG1 cold-verified green on hedgedoc (pure generic, deploy-count=1, clean teardown). G0 CLAIMED.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
71
runner/harness/discovery.py
Normal file
71
runner/harness/discovery.py
Normal file
@ -0,0 +1,71 @@
|
||||
"""Overlay / custom-test / install-steps discovery + precedence (Phase 1d, plan §2.5, DG4/DG5).
|
||||
|
||||
The generic is the default for each lifecycle op; a recipe's `test_<op>.py` OVERRIDES it. Sources,
|
||||
in precedence order (machine-docs/DECISIONS.md):
|
||||
|
||||
lifecycle op (install/upgrade/backup/restore) — exactly ONE assertion file runs:
|
||||
repo-local tests/test_<op>.py (upstream-authoritative, wins same-name collisions)
|
||||
> cc-ci tests/<recipe>/test_<op>.py
|
||||
> generic tests/_generic/test_<op>.py <- always present; the floor
|
||||
|
||||
custom (non-lifecycle) test_*.py — ALL run, additively, from BOTH locations (opt-in).
|
||||
|
||||
install-steps hook — install_steps.sh: repo-local > cc-ci, or none.
|
||||
|
||||
Repo-local = the recipe repo's own tests/ dir, snapshotted after fetch (it survives abra
|
||||
re-checking-out the recipe to a version tag — see the run orchestrator).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import glob
|
||||
import os
|
||||
|
||||
# The lifecycle operations; each one resolves to a single test_<op>.py assertion file.
LIFECYCLE_OPS = ("install", "upgrade", "backup", "restore")

# Three directory levels above this module; the tests/ tree is looked up beneath it.
ROOT = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir, os.pardir))

# Directory holding the generic per-op assertion files (tests/_generic).
GENERIC_DIR = os.path.join(ROOT, "tests", "_generic")
|
||||
|
||||
|
||||
def cc_ci_dir(recipe: str) -> str:
    """Return the cc-ci overlay directory for `recipe`, i.e. <ROOT>/tests/<recipe>.

    The path is only composed here; it is not checked for existence."""
    segments = (ROOT, "tests", recipe)
    return os.path.join(*segments)
|
||||
|
||||
|
||||
def resolve_op(recipe: str, op: str, repo_local_dir: str | None) -> tuple[str, str]:
    """Pick the one assertion file to run for lifecycle op `op`.

    Returns a (source, path) pair where source is "repo-local", "cc-ci" or "generic". Candidates
    are tried in that order and the first existing test_<op>.py wins. The generic path is the
    fallback and is returned without an existence check."""
    filename = f"test_{op}.py"

    # 1. The recipe repo's own tests/ dir (only consulted when a directory was supplied).
    local_hit = os.path.join(repo_local_dir, filename) if repo_local_dir else None
    if local_hit is not None and os.path.isfile(local_hit):
        return ("repo-local", local_hit)

    # 2. cc-ci's per-recipe overlay, else 3. the generic floor.
    overlay = os.path.join(cc_ci_dir(recipe), filename)
    if not os.path.isfile(overlay):
        return ("generic", os.path.join(GENERIC_DIR, filename))
    return ("cc-ci", overlay)
|
||||
|
||||
|
||||
def custom_tests(recipe: str, repo_local_dir: str | None) -> list[tuple[str, str]]:
    """Collect every non-lifecycle test_*.py for `recipe` as (source, path) pairs.

    Both the cc-ci overlay dir and the repo-local dir are scanned (cc-ci first); a location that is
    unset or not a directory contributes nothing. Within each location the files are in sorted
    order. Files named test_<op>.py for an op in LIFECYCLE_OPS are left out."""
    reserved = frozenset(f"test_{op}.py" for op in LIFECYCLE_OPS)
    locations = (("cc-ci", cc_ci_dir(recipe)), ("repo-local", repo_local_dir))

    hits: list[tuple[str, str]] = []
    for origin, directory in locations:
        if directory and os.path.isdir(directory):
            candidates = sorted(glob.glob(os.path.join(directory, "test_*.py")))
            hits.extend(
                (origin, path) for path in candidates if os.path.basename(path) not in reserved
            )
    return hits
|
||||
|
||||
|
||||
def install_steps(recipe: str, repo_local_dir: str | None) -> tuple[str, str] | None:
    """Locate the install_steps.sh hook for `recipe`.

    Returns (source, path) for the first existing script, trying the repo-local dir (when given)
    before the cc-ci overlay dir, or None when neither location has one."""

    def _locations():
        # Lazy on purpose: the cc-ci dir is only computed if the repo-local lookup misses.
        if repo_local_dir:
            yield "repo-local", repo_local_dir
        yield "cc-ci", cc_ci_dir(recipe)

    for origin, directory in _locations():
        script = os.path.join(directory, "install_steps.sh")
        if os.path.isfile(script):
            return (origin, script)
    return None
|
||||
145
runner/harness/generic.py
Normal file
145
runner/harness/generic.py
Normal file
@ -0,0 +1,145 @@
|
||||
"""Generic, recipe-agnostic lifecycle assertions + op helpers (Phase 1d, plan §2.1).
|
||||
|
||||
These are THE default for each lifecycle op: when a recipe ships no `test_<op>.py` overlay, the
|
||||
generic tier (tests/_generic/test_<op>.py) runs these against the single shared deployment the
|
||||
orchestrator brought up. The lifecycle OPERATIONS (upgrade/backup/restore) live here too — owned by
|
||||
the shared harness, not copy-pasted per recipe (DG7 DRY) — so overlays are assertions-only and may
|
||||
reuse these by composition (`from harness import generic; generic.assert_serving(...)`).
|
||||
|
||||
Design + precedence: machine-docs/DECISIONS.md (Phase 1d).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import ssl
|
||||
|
||||
from . import abra, lifecycle
|
||||
|
||||
# A recipe is backup-capable iff a compose file carries a truthy backupbot.backup label.
|
||||
_BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)
|
||||
|
||||
|
||||
def _recipe_dir(recipe: str) -> str:
|
||||
return os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
|
||||
|
||||
def backup_capable(recipe: str, meta: dict | None = None) -> bool:
    """Whether the harness should run the backup/restore tiers (else they are a clean N/A skip, DG3).

    `meta["BACKUP_CAPABLE"]`, when the key is present, decides the answer (coerced with bool()).
    Otherwise every compose*.yml in the recipe directory is scanned and the recipe counts as
    backup-capable as soon as one file matches _BACKUPBOT_RE. Unreadable files are skipped.

    Returns:
        True if backup-capable, False otherwise (including when no compose file is found)."""
    if meta and "BACKUP_CAPABLE" in meta:
        return bool(meta["BACKUP_CAPABLE"])

    for path in glob.glob(os.path.join(_recipe_dir(recipe), "compose*.yml")):
        try:
            # Read as UTF-8 regardless of the runner's locale, replacing undecodable bytes: a
            # decode failure raises UnicodeDecodeError (a ValueError, not an OSError), which the
            # handler below would not catch, so a single odd byte would crash detection.
            with open(path, encoding="utf-8", errors="replace") as fh:
                text = fh.read()
        except OSError:
            continue
        if _BACKUPBOT_RE.search(text):
            return True
    return False
|
||||
|
||||
|
||||
def served_cert(domain: str, port: int = 443) -> tuple[bool, str]:
    """Perform a CA-verified TLS handshake with `domain`:`port` and describe the outcome.

    The default SSL context is used, so the peer chain is validated against the system CA bundle
    and the hostname is checked. A cert that fails that validation (for example a self-signed
    default cert) is therefore reported as not verified.

    Returns:
        (True, "CN=<common name> SAN=<list of DNS names>") when the handshake verifies;
        (False, "<reason>") on a verification failure or any other connection/TLS error."""
    tls = ssl.create_default_context()  # chain + hostname verification are on by default
    try:
        with socket.create_connection((domain, port), timeout=20) as raw:
            with tls.wrap_socket(raw, server_hostname=domain) as secured:
                peer = secured.getpeercert()
    except ssl.SSLCertVerificationError as err:
        # Must be tested first: it is a subclass of ssl.SSLError handled below.
        return (False, f"cert did not verify (Traefik default/self-signed?): {err}")
    except (OSError, ssl.SSLError) as err:
        return (False, f"TLS handshake error: {err}")

    # The first commonName in the subject, or "" when the subject carries none.
    common_names = [
        value for rdn in peer.get("subject", ()) for key, value in rdn if key == "commonName"
    ]
    cn = common_names[0] if common_names else ""

    sans = []
    for kind, value in peer.get("subjectAltName", ()):
        if kind == "DNS":
            sans.append(value)
    return (True, f"CN={cn} SAN={sans}")
|
||||
|
||||
|
||||
def assert_serving(domain: str, meta: dict) -> None:
    """Generic "is the app really serving?" assertion (DG1); raises AssertionError on any failure.

    Checks, in order:
      1. lifecycle.services_converged(domain) is truthy;
      2. the HTTP status for meta["HEALTH_PATH"] is one of meta["HEALTH_OK"];
      3. when that status is 200, the body does not contain the text "404 page not found"
         (the body is not fetched for other accepted statuses);
      4. served_cert(domain) verifies and its detail mentions commoninternet.net."""
    assert lifecycle.services_converged(domain), f"{domain}: not all services converged"

    health_path = meta["HEALTH_PATH"]
    accepted = tuple(meta["HEALTH_OK"])
    code = lifecycle.http_get(domain, health_path)
    assert code in accepted, (
        f"{domain}{health_path}: HTTP {code} not in {accepted} — app not serving "
        "(a Traefik 404 fallback or an unhealthy backend)"
    )

    if code == 200:
        page = lifecycle.http_body(domain, health_path)
        assert (
            "404 page not found" not in page
        ), f"{domain}{health_path}: served Traefik's default 404 page, not the app"

    cert_ok, cert_detail = served_cert(domain)
    assert cert_ok, f"{domain}: TLS cert is not the trusted wildcard — {cert_detail}"
    assert (
        "commoninternet.net" in cert_detail.lower()
    ), f"{domain}: served cert unexpected — {cert_detail}"
|
||||
|
||||
|
||||
def wait_serving(domain: str, meta: dict) -> None:
    """Block on lifecycle.wait_healthy using the recipe's health settings, then assert serving.

    Reads HEALTH_OK, HEALTH_PATH, DEPLOY_TIMEOUT and HTTP_TIMEOUT from `meta` (all required), and
    finishes by running assert_serving(domain, meta)."""
    health_opts = {
        "ok_codes": tuple(meta["HEALTH_OK"]),
        "path": meta["HEALTH_PATH"],
        "deploy_timeout": meta["DEPLOY_TIMEOUT"],
        "http_timeout": meta["HTTP_TIMEOUT"],
    }
    lifecycle.wait_healthy(domain, **health_opts)
    assert_serving(domain, meta)
|
||||
|
||||
|
||||
def do_upgrade(domain: str, target: str | None, meta: dict) -> None:
    """UPGRADE op on the shared deployment.

    Hands `target` to lifecycle.upgrade_app as its `version`, then waits for the app to be
    serving again via wait_serving."""
    lifecycle.upgrade_app(domain, version=target)
    wait_serving(domain, meta)
|
||||
|
||||
|
||||
def snapshots(domain: str) -> list[str]:
    """List the snapshot ids reported by `abra app backup snapshots` for this app (DG3).

    Each stdout line is stripped, and a line counts as a snapshot row when it starts with an
    8-character lowercase-hex short id; those ids are returned in output order. The command is run
    with check=False (presumably so a failing command yields an empty list — confirm against
    abra._run)."""
    listing = abra._run(["app", "backup", "snapshots", domain, "-n", "-o"], check=False)
    # restic snapshot rows start with an 8-hex short id
    short_id = re.compile(r"^([0-9a-f]{8})\b")
    matches = (short_id.match(row.strip()) for row in listing.stdout.splitlines())
    return [hit.group(1) for hit in matches if hit]
|
||||
|
||||
|
||||
def do_backup(domain: str) -> list[str]:
    """BACKUP op: trigger a backup, then require that at least one snapshot is listed.

    Returns the snapshot ids found; raises AssertionError when the list is empty."""
    lifecycle.backup_app(domain)
    found = snapshots(domain)
    assert found, f"{domain}: backup produced no snapshot artifact (abra app backup snapshots empty)"
    return found
|
||||
|
||||
|
||||
def do_restore(domain: str, meta: dict) -> None:
    """RESTORE op: run lifecycle.restore_app for the app, then wait until it is serving again."""
    lifecycle.restore_app(domain)
    wait_serving(domain, meta)
|
||||
@ -89,17 +89,63 @@ def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]:
|
||||
return {str(k): str(v) for k, v in (ee or {}).items()}
|
||||
|
||||
|
||||
def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
|
||||
def _record_deploy() -> None:
|
||||
"""Increment the per-run deploy counter (DG4.1: one deploy per run). No-op unless the
|
||||
orchestrator set CCCI_DEPLOY_COUNT_FILE — so it never affects standalone/manual use."""
|
||||
path = os.environ.get("CCCI_DEPLOY_COUNT_FILE")
|
||||
if not path:
|
||||
return
|
||||
n = 0
|
||||
with contextlib.suppress(OSError, ValueError), open(path) as f:
|
||||
n = int(f.read().strip() or "0")
|
||||
with contextlib.suppress(OSError), open(path, "w") as f:
|
||||
f.write(str(n + 1))
|
||||
|
||||
|
||||
def _run_install_steps(hook: tuple[str, str], recipe: str, domain: str) -> None:
    """Execute a recipe's install-steps hook script with bash (Phase 1d DG5).

    `hook` is a (source, path) pair. The script inherits the current environment plus
    CCCI_APP_DOMAIN (the domain), CCCI_RECIPE (the recipe name) and CCCI_APP_ENV (the app's .env
    path under ~/.abra/servers/default/). A non-zero exit raises subprocess.CalledProcessError."""
    source, path = hook
    app_env_file = os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
    print(f" install-steps hook ({source}): {path}", flush=True)

    hook_env = {
        **os.environ,
        "CCCI_APP_DOMAIN": domain,
        "CCCI_RECIPE": recipe,
        "CCCI_APP_ENV": app_env_file,
    }
    subprocess.run(["bash", path], check=True, env=hook_env)
|
||||
|
||||
|
||||
def deploy_app(
    recipe: str,
    domain: str,
    version: str | None = None,
    secrets: bool = True,
    install_steps_hook: tuple[str, str] | None = None,
) -> None:
    """Create + configure + deploy an app. Forces LETS_ENCRYPT_ENV='' so traefik serves the
    wildcard cert via the file provider and NEVER attempts ACME (adversary finding A1). Applies any
    per-recipe EXTRA_ENV (recipe_meta.py) and the custom install-steps hook (Phase 1d) before deploy.

    Order of operations: record the deploy, remove any existing app config, `abra app new`, set
    DOMAIN / LETS_ENCRYPT_ENV / EXTRA_ENV, generate secrets (when `secrets` is true), run the
    install-steps hook (when `install_steps_hook` is given), then deploy.

    Args:
        recipe: recipe name passed to `abra app new`.
        domain: the run domain; also written to the app's DOMAIN env var.
        version: optional recipe version passed to `abra app new`.
        secrets: passed to `abra app new`, and gates the secret-generate step.
        install_steps_hook: optional (source, path) of an install_steps.sh to run before deploy.
    """
    _record_deploy()
    abra.app_config_remove(domain)  # clear any stale .env from a prior crashed run
    abra.app_new(recipe, domain, version=version, secrets=secrets)
    # Pin DOMAIN to the run domain explicitly. `abra app new -D` fills it for recipes whose
    # .env.sample uses a literal placeholder, but NOT for ones using a `{{ .Domain }}` Go-template
    # (this abra version leaves it unexpanded → deploy fails "can't evaluate field Domain"). Setting
    # it ourselves is recipe-agnostic and canonical (the run domain IS the app's domain).
    abra.env_set(domain, "DOMAIN", domain)
    abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
    for k, v in _recipe_extra_env(recipe, domain).items():
        abra.env_set(domain, k, v)
    if secrets:
        abra.secret_generate(domain)
    if install_steps_hook:
        _run_install_steps(install_steps_hook, recipe, domain)
    abra.deploy(domain)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user