#!/usr/bin/env python3 """Top-level CI orchestrator (plan §4.3 + Phase 1d/1e), invoked by the Drone pipeline (or by hand). Model: deploy the app ONCE, then run lifecycle TIERS against that single shared deployment, then ONE teardown in `finally`. Per Phase 1e the orchestrator OWNS each mutating op (HC3): for a tier it runs the optional pre-op seed hook (recipe ops.py `pre_`), performs the op exactly ONCE (upgrade/backup/restore — install has none), then runs BOTH the generic assertion file (the floor, unless explicitly opted out) AND the recipe overlay assertion file (if any) against the shared post-op state — generic and overlay are ADDITIVE, not override (HC3). Op results an assertion needs (pre-upgrade identity, snapshot_id) pass op→assertion via a run-scoped JSON state file ($CCCI_OP_STATE_FILE). The upgrade op deploys the PR-HEAD code under test via `abra app deploy --chaos` (HC1). Repo-local (PR-authored) overlays/hooks run only for allowlist-approved recipes (HC2, gated in harness.discovery). The generic is the default for every op, so ANY recipe is testable with zero config (DG1–DG4). The lifecycle OPS live in the shared harness (harness.generic), not per-recipe (DG7 DRY). Run parameters from env (set by the comment-bridge via Drone build params): RECIPE recipe name (e.g. custom-html) [required] REF PR head commit sha [optional; used for fetch + run-domain hash] PR PR number [optional, default 0] SRC head repo full_name on the mirror [optional] VERSION upgrade target tag (else newest published) [optional] STAGES comma filter of tiers to run [optional, default install,upgrade,backup,restore,custom] Run env (python + pytest + playwright) is provided by `cc-ci-run` (nix/modules/harness.nix); invoke as: cc-ci-run runner/run_recipe_ci.py """ from __future__ import annotations import contextlib import glob import importlib.util import json import os import shutil import subprocess import sys import tempfile ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(ROOT, "runner")) from harness import deps as deps_mod, discovery, generic, lifecycle, naming # noqa: E402 ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom") def _truthy(v: str | None) -> bool: return str(v or "").strip().lower() in ("1", "true", "yes", "on") def _redact_values() -> list[str]: """Values to scrub from published logs (D6 redaction filter, plan §4.4). The infra secrets materialised at /run/secrets/* — if any subprocess ever echoes one, mask it. Only >=8-char values, so it never false-positives on short strings / SHAs.""" vals = set() for p in glob.glob("/run/secrets/*"): try: with open(p) as f: v = f.read().strip() except OSError: continue if len(v) >= 8: vals.add(v) return sorted(vals, key=len, reverse=True) _REDACT = _redact_values() def _scrub(text: str) -> str: """Mask any known infra-secret value in a string (D6 redaction, plan §4.4).""" for v in _REDACT: if v in text: text = text.replace(v, "***REDACTED***") return text def run_redacted(cmd: list[str], env: dict | None = None) -> int: """Run a subprocess, streaming output live (so Drone logs stay tail-able) but masking any known infra-secret value first. Belt-and-suspenders: the harness never prints secrets and abra doesn't echo generated ones.""" proc = subprocess.Popen( cmd, cwd=ROOT, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, ) assert proc.stdout is not None for line in proc.stdout: sys.stdout.write(_scrub(line)) sys.stdout.flush() return proc.wait() def _gitea_token() -> str | None: tok = os.environ.get("GITEA_TOKEN") if not tok and os.path.exists("/run/secrets/bridge_gitea_token"): with open("/run/secrets/bridge_gitea_token") as f: tok = f.read().strip() return tok or None def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None: """Make the recipe available at the code under test. If SRC+REF point at the mirror PR, clone it at that ref; otherwise fetch the catalogue copy. Private mirror repos need the bot token — passed via a per-command http.extraHeader (not persisted in .git/config, not printed).""" recipes_dir = os.path.expanduser("~/.abra/recipes") os.makedirs(recipes_dir, exist_ok=True) dest = os.path.join(recipes_dir, recipe) if src and ref: url = f"https://git.autonomic.zone/{src}.git" git = ["git"] tok = _gitea_token() if tok: git += ["-c", f"http.extraHeader=Authorization: token {tok}"] subprocess.run(["rm", "-rf", dest], check=False) subprocess.run([*git, "clone", "--quiet", url, dest], check=True) subprocess.run([*git, "-C", dest, "checkout", "--quiet", ref], check=True) # Bring in published version TAGS from the public upstream so the upgrade tier can deploy a # previous published version (mirror PR branches carry no release tags). Read-only + plain git # (no bot token to a foreign host). Non-fatal: if unreachable, upgrade degrades to a skip. upstream = f"https://git.coopcloud.tech/coop-cloud/{recipe}.git" subprocess.run( ["git", "-C", dest, "fetch", "--quiet", upstream, "refs/tags/*:refs/tags/*"], check=False, ) else: # Clean re-fetch from the catalogue. rm first so a leftover dir from a prior SRC+REF run # (origin → private mirror, maybe lacking tags) can't poison the catalogue fetch. subprocess.run(["rm", "-rf", dest], check=False) subprocess.run(["abra", "recipe", "fetch", recipe, "-n"], check=True) def snapshot_recipe_tests(recipe: str) -> str | None: """Copy the recipe-shipped tests/ to a stable temp dir, immune to abra re-checking-out the recipe to a version tag during the run. Returns the snapshot path, or None if no tests/.""" src = os.path.expanduser(f"~/.abra/recipes/{recipe}/tests") if not os.path.isdir(src): return None has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile( os.path.join(src, "install_steps.sh") ) if not has_overlay: return None dst = os.path.join(tempfile.gettempdir(), f"ccci-recipe-tests-{recipe}") shutil.rmtree(dst, ignore_errors=True) shutil.copytree(src, dst) return dst def _load_meta(recipe: str) -> dict: """Mirror tests/conftest._recipe_meta so the orchestrator's deploy/wait uses the same per-recipe config the tiers see (timeouts, health path/codes).""" meta = { "HEALTH_PATH": "/", "HEALTH_OK": (200, 301, 302), "DEPLOY_TIMEOUT": 600, "HTTP_TIMEOUT": 300, } path = os.path.join(ROOT, "tests", recipe, "recipe_meta.py") if os.path.exists(path): ns: dict = {} with open(path) as fh: exec(compile(fh.read(), path, "exec"), ns) # noqa: S102 (trusted, in-repo) for k in list(meta) + ["BACKUP_CAPABLE", "SKIP_GENERIC"]: if k in ns: meta[k] = ns[k] return meta def _tier_env(domain: str) -> dict: return dict(os.environ, CCCI_APP_DOMAIN=domain, CCCI_BASE_URL=f"https://{domain}") def _skip_generic(op: str, meta: dict) -> bool: """Whether the generic assertion for `op` is opted out (Phase 1e HC3). Default: run (additive). Opt-out, any of: env CCCI_SKIP_GENERIC (all ops), env CCCI_SKIP_GENERIC_, or the recipe's declarative recipe_meta.SKIP_GENERIC list (op name, or "all"/"*").""" if _truthy(os.environ.get("CCCI_SKIP_GENERIC")): return True if _truthy(os.environ.get(f"CCCI_SKIP_GENERIC_{op.upper()}")): return True sg = [str(s).lower() for s in (meta.get("SKIP_GENERIC") or [])] return "all" in sg or "*" in sg or op in sg def _run_pre_hook(recipe: str, op: str, repo_local: str | None, domain: str, meta: dict) -> None: """Run the optional pre-op seed hook (recipe ops.py `pre_`) BEFORE the harness performs the op (HC3 op/assertion split): overlays seed data-continuity markers / the backup→restore mutation here, then assert post-op in test_.py. cc-ci's ops.py is trusted; a repo-local ops.py is consulted only for allowlist-approved recipes (HC2 gate is inside discovery.pre_op_hook). Imported in-process; the recipe dir is put on sys.path so an ops.py can import its sibling helpers.""" hook = discovery.pre_op_hook(recipe, op, repo_local) if not hook: return source, path = hook d = os.path.dirname(path) sys.path.insert(0, d) try: spec = importlib.util.spec_from_file_location(f"ccci_ops_{recipe}_{op}", path) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) print(f" pre-op seed ({source}): {os.path.relpath(path, ROOT)}::pre_{op}", flush=True) getattr(mod, f"pre_{op}")(domain, meta) finally: if d in sys.path: sys.path.remove(d) def _perform_op(op: str, domain: str, recipe: str, head_ref: str | None, op_state: dict) -> None: """Perform the single mutating op ONCE (the harness owns the op, HC3). install has no op. Records what the assertions need (pre-upgrade identity, backup snapshot_id) into op_state. None of these call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos upgrade is not a new install (HC1 reconciliation).""" if op == "upgrade": before = generic.perform_upgrade(domain, recipe, head_ref) op_state["upgrade"] = {"before": before, "head_ref": head_ref} elif op == "backup": op_state["backup"] = {"snapshot_id": generic.perform_backup(domain)} elif op == "restore": generic.perform_restore(domain) # install: already deployed; no op def run_lifecycle_tier( recipe: str, op: str, repo_local: str | None, domain: str, meta: dict, head_ref: str | None, op_state: dict, ) -> str: """Additive lifecycle tier (HC3): seed (pre-op hook) → perform the op ONCE → run the generic assertion file (unless opted out) AND the overlay assertion file, both against the shared post-op deployment. The upgrade op redeploys the PR head (head_ref) via chaos (HC1). Returns 'pass' | 'fail' | 'skip'.""" overlay = discovery.resolve_overlay_op(recipe, op, repo_local) skip_gen = _skip_generic(op, meta) files: list[tuple[str, str]] = [] if not skip_gen: files.append(discovery.generic_op(op)) if overlay: files.append(overlay) if not files: # generic opted out AND no overlay → nothing would assert; don't perform a pointless mutating op print(f"\n===== TIER: {op} — SKIP (generic opted out, no overlay) =====", flush=True) return "skip" ov = f"{overlay[0]}:{os.path.relpath(overlay[1], ROOT)}" if overlay else "none" print( f"\n===== TIER: {op} (generic={'skip' if skip_gen else 'run'}, overlay={ov}) =====", flush=True, ) # 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure → tier fail. try: _run_pre_hook(recipe, op, repo_local, domain, meta) _perform_op(op, domain, recipe, head_ref, op_state) with open(os.environ["CCCI_OP_STATE_FILE"], "w") as f: json.dump(op_state, f) except Exception as e: # noqa: BLE001 — a failed op is a reported tier failure, not a crash print(f"!! {op} op failed: {_scrub(str(e))}", flush=True) return "fail" # 3) assertions: generic (unless opted out) + overlay, each its own pytest, all against the # single post-op deployment. Generic runs first so an overlay may assume readiness. rc_all = 0 for source, path in files: print(f" assert ({source}): {os.path.relpath(path, ROOT)}", flush=True) rc = run_redacted( [sys.executable, "-m", "pytest", "-v", "-rA", path], env=_tier_env(domain) ) if rc != 0: rc_all = rc return "pass" if rc_all == 0 else "fail" def run_custom(recipe: str, repo_local: str | None, domain: str) -> str: """Run all discovered non-lifecycle custom test_*.py (both locations, additive). Returns 'skip' if none defined, else 'pass'/'fail'.""" customs = discovery.custom_tests(recipe, repo_local) if not customs: return "skip" print("\n===== TIER: custom =====", flush=True) rc_all = 0 for source, path in customs: rel = os.path.relpath(path, ROOT) print(f" custom ({source}): {rel}", flush=True) rc = run_redacted( [sys.executable, "-m", "pytest", "-v", "-rA", path], env=_tier_env(domain) ) if rc != 0: rc_all = rc return "pass" if rc_all == 0 else "fail" def main() -> int: recipe = os.environ.get("RECIPE") if not recipe: print("RECIPE env is required", file=sys.stderr) return 2 ref = os.environ.get("REF") or None src = os.environ.get("SRC") or None target = os.environ.get("VERSION") or None stages = { s.strip() for s in os.environ.get("STAGES", ",".join(ALL_STAGES)).split(",") if s.strip() } print( f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}" ) fetch_recipe(recipe, ref, src) # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test # (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout # HEAD (the catalogue current) for a non-PR `!testme`. Captured before any version-tag checkout. head_ref = ref or lifecycle.recipe_head_commit(recipe) repo_local = snapshot_recipe_tests(recipe) meta = _load_meta(recipe) domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref) # Deploy-once base version: previous published version when the upgrade tier will run and one # exists (so upgrade goes previous→target in place), else the target (current/$REF). (DECISIONS.) want_upgrade = "upgrade" in stages prev = lifecycle.previous_version(recipe) if want_upgrade else None base = prev or target backup_cap = generic.backup_capable(recipe, meta) hook = discovery.install_steps(recipe, repo_local) # Deploy-count guard (DG4.1): exactly one deploy_app() per run. countfile = os.path.join(tempfile.gettempdir(), f"ccci-deploys-{domain}") with open(countfile, "w") as f: f.write("0") os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile # Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup # snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state(). statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json") with open(statefile, "w") as f: json.dump({}, f) os.environ["CCCI_OP_STATE_FILE"] = statefile op_state: dict = {} # Run-scoped dep state (Phase 2 Q2.3): if this recipe declares DEPS in recipe_meta, the # orchestrator deploys each dep BEFORE the recipe under test, persists their per-run identity # here for dependent tests to read via the `deps_apps` fixture, and tears them down LAST in # finally (reverse order). Empty list when no deps declared. depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json") with open(depsfile, "w") as f: json.dump([], f) os.environ["CCCI_DEPS_FILE"] = depsfile declared = deps_mod.declared_deps(recipe) if declared: print(f"\n===== DEPS: {declared} =====", flush=True) deps_state: list[dict] = [] results: dict[str, str] = {} lifecycle.janitor() dep_deploy_failed = False try: # ---- deps deploy FIRST (sequentially), if declared (Q2.3) ---- if declared: try: # Build a per-dep meta map for readiness waits (timeouts/health-path/codes) dep_metas = {d: _load_meta(d) for d in declared} deps_state = deps_mod.deploy_deps( recipe, os.environ.get("PR", "0"), ref, declared, meta_for=dep_metas ) except Exception as e: # noqa: BLE001 — failed dep deploy is a recipe install failure print(f"!! dep deploy failed: {_scrub(str(e))}", flush=True) dep_deploy_failed = True # ---- deploy ONCE + wait ready (the single deployment all tiers share) ---- if dep_deploy_failed: deploy_ok = False else: try: lifecycle.deploy_app( recipe, domain, version=base, secrets=True, install_steps_hook=hook ) lifecycle.wait_healthy( domain, ok_codes=tuple(meta["HEALTH_OK"]), path=meta["HEALTH_PATH"], deploy_timeout=meta["DEPLOY_TIMEOUT"], http_timeout=meta["HTTP_TIMEOUT"], ) deploy_ok = True except Exception as e: # noqa: BLE001 — a failed deploy is a reported INSTALL failure, not a crash print(f"!! deploy/readiness failed: {e}", flush=True) deploy_ok = False # ---- INSTALL tier (always; additive generic + overlay, no op) ---- if "install" in stages: results["install"] = ( run_lifecycle_tier(recipe, "install", repo_local, domain, meta, head_ref, op_state) if deploy_ok else "fail" ) if deploy_ok: # ---- UPGRADE tier (op once → generic + overlay assert) ---- if "upgrade" in stages: results["upgrade"] = ( run_lifecycle_tier( recipe, "upgrade", repo_local, domain, meta, head_ref, op_state ) if prev else "skip" # only one published version → nothing to upgrade from ) # ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ---- if "backup" in stages: results["backup"] = ( run_lifecycle_tier( recipe, "backup", repo_local, domain, meta, head_ref, op_state ) if backup_cap else "skip" ) if "restore" in stages: results["restore"] = ( run_lifecycle_tier( recipe, "restore", repo_local, domain, meta, head_ref, op_state ) if backup_cap else "skip" ) # ---- CUSTOM tier ---- if "custom" in stages: results["custom"] = run_custom(recipe, repo_local, domain) else: # install failed → the shared deployment is dead; remaining tiers cannot run on it. for op in ("upgrade", "backup", "restore", "custom"): if op in stages: results[op] = "skip" finally: # Teardown the recipe under test FIRST, then deps in reverse declaration order. lifecycle.teardown_app(domain, verify=False) if deps_state: print("\n===== DEPS teardown =====", flush=True) deps_mod.teardown_deps(deps_state) # ---- deploy-count assertion (DG4.1) ---- with open(countfile) as f: deploy_count = int(f.read().strip() or "0") os.remove(countfile) with contextlib.suppress(OSError): os.remove(statefile) with contextlib.suppress(OSError): os.remove(depsfile) # ---- per-op summary (DG6 feed) ---- # Phase 2 Q2.3: deps each `deploy_app` once, so the expected count = 1 (recipe under test) + # len(deps). DG4.1 still holds — no extra deploys per recipe — just accommodates declared deps. expected_deploy_count = 1 + len(deps_state) print("\n===== RUN SUMMARY =====", flush=True) print(f"deploy-count = {deploy_count} (expect {expected_deploy_count})") if deps_state: print(f" deps deployed: {[d['recipe'] for d in deps_state]}") order = [s for s in ALL_STAGES if s in results] for op in order: print(f" {op:8s}: {results[op]}") overall = 0 if deploy_count != expected_deploy_count: print( f"!! deploy-count {deploy_count} != {expected_deploy_count} (DG4.1 violation)", file=sys.stderr, ) overall = 1 if any(v == "fail" for v in results.values()): overall = 1 if not results: print("no tiers ran", file=sys.stderr) return 1 return overall if __name__ == "__main__": raise SystemExit(main())