#!/usr/bin/env python3 """Top-level CI orchestrator (plan §4.3 + Phase 1d/1e), invoked by the Drone pipeline (or by hand). Model: deploy the app ONCE, then run lifecycle TIERS against that single shared deployment, then ONE teardown in `finally`. Per Phase 1e the orchestrator OWNS each mutating op (HC3): for a tier it runs the optional pre-op seed hook (recipe ops.py `pre_`), performs the op exactly ONCE (upgrade/backup/restore — install has none), then runs BOTH the generic assertion file (the floor, unless explicitly opted out) AND the recipe overlay assertion file (if any) against the shared post-op state — generic and overlay are ADDITIVE, not override (HC3). Op results an assertion needs (pre-upgrade identity, snapshot_id) pass op→assertion via a run-scoped JSON state file ($CCCI_OP_STATE_FILE). The upgrade op deploys the PR-HEAD code under test via `abra app deploy --chaos` (HC1). Repo-local (PR-authored) overlays/hooks run only for allowlist-approved recipes (HC2, gated in harness.discovery). The generic is the default for every op, so ANY recipe is testable with zero config (DG1–DG4). The lifecycle OPS live in the shared harness (harness.generic), not per-recipe (DG7 DRY). Run parameters from env (set by the comment-bridge via Drone build params): RECIPE recipe name (e.g. custom-html) [required] REF PR head commit sha [optional; used for fetch + run-domain hash] PR PR number [optional, default 0] SRC head repo full_name on the mirror [optional] VERSION upgrade target tag (else newest published) [optional] STAGES comma filter of tiers to run [optional, default install,upgrade,backup,restore,custom] Run env (python + pytest + playwright) is provided by `cc-ci-run` (nix/modules/harness.nix); invoke as: cc-ci-run runner/run_recipe_ci.py """ from __future__ import annotations import contextlib import glob import importlib.util import json import os import shutil import subprocess import sys import tempfile import time from typing import NamedTuple ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(ROOT, "runner")) import warm_reconcile # noqa: E402 (runner/ is on sys.path; owns coop-cloud version ordering) from harness import ( # noqa: E402 abra, canonical, discovery, generic, lifecycle, lifetime, naming, warm, warmsnap, ) from harness import ( # noqa: E402 card as card_mod, ) from harness import ( # noqa: E402 deps as deps_mod, ) from harness import ( # noqa: E402 lint as lint_mod, ) from harness import ( # noqa: E402 manifest as manifest_mod, ) from harness import ( # noqa: E402 meta as meta_mod, ) from harness import ( # noqa: E402 results as results_mod, ) from harness import ( # noqa: E402 screenshot as screenshot_mod, ) from harness import ( # noqa: E402 settings as settings_mod, ) ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom") def sso_dep_unverified(declared, deps_ready: bool, requires_deps_skipped: int) -> bool: """F2-11 gate predicate (pure, unit-tested). True when a recipe declares DEPS but its dep provisioning failed (deps not ready) AND that caused ≥1 `requires_deps` (SSO/OIDC) test to SKIP. In that case the recipe's characteristic SSO claim was NOT verified, so the run must NOT report GREEN — even though a skip-only pytest file exits 0 and leaves every tier 'pass'. Generic-tier failure-isolation is preserved (those results stand); only the green SIGNAL is corrected. Gated on skip>0 so a deps-declaring recipe with no requires_deps tests isn't false-failed.""" return bool(declared) and not deps_ready and requires_deps_skipped > 0 class BasePlan(NamedTuple): """Resolved upgrade-base decision (phase prevb). `kind`: - "version" → deploy a pinned published version (`version`): the last-green (warm-canonical) version (or its step-back). `previous/` may apply (version-guarded). - "ref" → deploy the target-branch (main) tip at commit `ref` (chaos): the true predecessor the PR merges onto, used when there is no last-green. `previous/` never applies to a ref base. - "skip" → no upgrade base; the single deploy is the PR head and the upgrade tier records a declared skip with `reason` (upgrade∉stages / EXPECTED_NA / new recipe / head==main tip).""" kind: str version: str | None ref: str | None reason: str @property def runs(self) -> bool: return self.kind in ("version", "ref") def resolve_upgrade_base( stages, meta, recipe: str, head_ref: str | None = None, head_version: str | None = None ) -> BasePlan: """Dynamic upgrade-base resolution (phase prevb, replaces the static `recipe_versions[-2]` default). Chain: last-green (warm canonical, with same-version step-back) → newest release tag older than head → main-tip → skip. EXPECTED_NA[upgrade] / upgrade∉stages short-circuit to a declared skip first. SKIP_CANONICALS_FOR_UPGRADE (phase settings, server settings.toml, default false): when true, the canonical lookup is bypassed entirely — the resolver behaves as if no canonical exists and takes the no-canonical release-tag-first fallback (`_no_canonical_base`). Scope is the upgrade BASE only; canonical promotion and the `--quick` warm-reattach are unaffected (see DECISIONS, phase settings). `head_version` is the head checkout's published version (the `coop-cloud..version` label; see abra.head_compose_version). When the last-green warm-canonical version EQUALS it, deploying the canonical as the base would be a vacuous same-version no-op, so the resolver STEPS BACK to the newest published version strictly older than the head (phase samever) — the upgrade tier always crosses a real version delta. This is the nightly STEADY STATE: a green cold-on-latest run promotes canonical→latest, so the next night finds canonical == head and must step back. Skip only when no older published predecessor exists. last-green is the PRIMARY base — the version cc-ci last recorded green for this recipe (the warm-canonical registry record). main-tip is the FALLBACK: the recipe repo's `main` HEAD, the real predecessor the PR merges on top of, used when there is no last-green. Else the tier is skipped with a recorded reason (structural, declared — not a silent pass). The old `UPGRADE_BASE_VERSION` explicit-override knob was REMOVED in phase canon (§2.G): the dynamic last-green/step-back resolution makes it redundant (its only remaining user, plausible, now resolves base 3.0.1 via step-back once its canonical is established). See DECISIONS.""" if "upgrade" not in stages: return BasePlan("skip", None, None, "upgrade tier not in requested stages") declared = (meta.EXPECTED_NA or {}).get("upgrade") if declared: print( f"== upgrade tier: declared EXPECTED_NA['upgrade'] — single deploy is the PR head. " f"Reason: {declared}", flush=True, ) return BasePlan("skip", None, None, f"declared EXPECTED_NA[upgrade]: {declared}") skip_canonicals = settings_mod.get().skip_canonicals_for_upgrade rec = canonical.read_registry(recipe) if rec and rec.get("version") and not skip_canonicals: canon = rec["version"] same = head_version is not None and warm_reconcile.version_key( canon ) == warm_reconcile.version_key(head_version) if not same: # canonical ≠ head version (the common version-bump PR / nightly-with-new-version case): # the green-verified primary base, unchanged from prevb. return BasePlan( "version", canon, None, f"last-green (warm canonical, status={rec.get('status')})", ) # canonical == head version → deploying it would be a same-version no-op. Step back to the # newest published version strictly older than the head (phase samever). older = warm_reconcile.newest_older_version( warm_reconcile.recipe_tags(recipe), head_version ) if older: return BasePlan( "version", older, None, f"step-back: last-green canonical ({canon}) == head version {head_version}; " f"newest older published base", ) return BasePlan( "skip", None, None, f"base == head ({head_version}) and no older published predecessor", ) # No canonical in play — none recorded, OR SKIP_CANONICALS_FOR_UPGRADE=true (canonical lookup # bypassed entirely, behaving as if none exists). Improved fallback (phase settings §2.C): prefer # a REAL published predecessor (newest release tag < head) over the raw main-tip. return _no_canonical_base(recipe, head_ref, head_version) def _no_canonical_base(recipe: str, head_ref: str | None, head_version: str | None) -> BasePlan: """Upgrade base when no canonical is used (none recorded, its promote failed, or SKIP_CANONICALS_FOR_UPGRADE is true). Release-tag-first fallback (phase settings §2.C): 1. most recent release TAG with version strictly older than the PR head — a clean published predecessor (reuses samever's `newest_older_version` helper, the single source of version ordering, so this and the step-back never diverge); 2. raw `main`-tip (target-branch tip) — only if the recipe has NO prior release tag at all; 3. skip — no predecessor (no older tag and head == main-tip, or no main at all). This replaces the old jump-straight-to-main-tip path, so an un-promoted recipe upgrades from a real release base instead of a possibly-untagged WIP commit.""" older = ( warm_reconcile.newest_older_version(warm_reconcile.recipe_tags(recipe), head_version) if head_version else None ) if older: return BasePlan( "version", older, None, f"no-canonical fallback: newest release tag older than head {head_version}", ) main_tip = lifecycle.recipe_branch_commit(recipe, "main") if main_tip and main_tip != head_ref: return BasePlan( "ref", None, main_tip, "no-canonical fallback: target-branch (main) tip (no prior release tag)", ) if main_tip and main_tip == head_ref: return BasePlan("skip", None, None, "head == main tip (no predecessor delta)") return BasePlan( "skip", None, None, "no release tag and no main tip (new recipe / no predecessor)" ) def _truthy(v: str | None) -> bool: return str(v or "").strip().lower() in ("1", "true", "yes", "on") def _redact_values() -> list[str]: """Values to scrub from published logs (D6 redaction filter, plan §4.4). The infra secrets materialised at /run/secrets/* — if any subprocess ever echoes one, mask it. Only >=8-char values, so it never false-positives on short strings / SHAs.""" vals = set() for p in glob.glob("/run/secrets/*"): try: with open(p) as f: v = f.read().strip() except OSError: continue if len(v) >= 8: vals.add(v) return sorted(vals, key=len, reverse=True) _REDACT = _redact_values() def _scrub(text: str) -> str: """Mask any known infra-secret value in a string (D6 redaction, plan §4.4).""" for v in _REDACT: if v in text: text = text.replace(v, "***REDACTED***") return text def run_redacted(cmd: list[str], env: dict | None = None) -> int: """Run a subprocess, streaming output live (so Drone logs stay tail-able) but masking any known infra-secret value first. Belt-and-suspenders: the harness never prints secrets and abra doesn't echo generated ones.""" proc = subprocess.Popen( cmd, cwd=ROOT, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, ) assert proc.stdout is not None for line in proc.stdout: sys.stdout.write(_scrub(line)) sys.stdout.flush() return proc.wait() def _gitea_token() -> str | None: tok = os.environ.get("GITEA_TOKEN") if not tok and os.path.exists("/run/secrets/bridge_gitea_token"): with open("/run/secrets/bridge_gitea_token") as f: tok = f.read().strip() return tok or None def _run_state_path(name: str) -> str: """Run-scoped state file in the tempdir, keyed by run id + harness pid — NEVER by app domain. A second run of the SAME domain overlaps this process (its main() preamble executes before it blocks at the app lock inside deploy_app), so domain-keyed files get reset/removed under the live run: M2(c) double-!testme produced a false DG4.1 deploy-count=2 in run 1 and a countfile FileNotFoundError crash in run 2. Children never re-derive these paths — they receive them via the CCCI_*_FILE env vars, so the key only has to be unique per harness process.""" rid = results_mod.run_id() return os.path.join(tempfile.gettempdir(), f"ccci-{name}-{rid}-{os.getpid()}") def setup_run_abra_dir() -> str: """P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees. `//abra/` with: servers/ -> symlink to the canonical ~/.abra/servers. App .env files land in the shared canonical path, so janitor discovery (`abra app ls`) and env-based teardown work unchanged from any process; per-domain filenames + the app-domain lock prevent write conflicts. catalogue/ -> symlink to the canonical ~/.abra/catalogue (read-mostly). recipes/ fresh + empty — THE isolation that matters: each run clones and git-checkouts its own recipe trees, so concurrent runs (same recipe included) can never corrupt each other's deploy tree. Replaces the per-recipe flock. Exported as $ABRA_DIR — honored by the abra CLI and by every harness path helper (abra.abra_dir()) — BEFORE any abra call. Rides along the existing run-dir retention.""" canonical = os.path.expanduser("~/.abra") rid = results_mod.run_id() if rid == "manual": rid = f"manual-{os.getpid()}" # two concurrent hand-runs must not share a tree run_abra_dir = os.path.join(results_mod.runs_dir(), rid, "abra") os.makedirs(os.path.join(run_abra_dir, "recipes"), exist_ok=True) for shared in ("servers", "catalogue"): link = os.path.join(run_abra_dir, shared) if not os.path.islink(link): os.symlink(os.path.join(canonical, shared), link) os.environ["ABRA_DIR"] = run_abra_dir print( f"== per-run ABRA_DIR: {run_abra_dir} (servers/catalogue -> canonical; fresh recipes/) ==", flush=True, ) return run_abra_dir def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None: """Make the recipe available at the code under test in THIS RUN's recipe tree ($ABRA_DIR/recipes/): a plain clone — no locking needed, no rm-rf of any shared state (the rm below only clears this run's own leftovers, e.g. a janitor-triggered `abra app ls` auto-clone or a Drone build-number reuse). If SRC+REF point at the mirror PR, clone it at that ref; otherwise fetch the catalogue copy. Private mirror repos need the bot token — passed via a per-command http.extraHeader (not persisted in .git/config, not printed).""" dest = abra.recipe_dir(recipe) os.makedirs(os.path.dirname(dest), exist_ok=True) # CCCI_SKIP_FETCH=1: use the locally STAGED recipe clone as-is (lets a test/Adversary stage a # fake/broken ref — e.g. a simulated broken PR head for the --quick rollback proof — without it # being clobbered by a re-fetch). Staging happens in the canonical ~/.abra/recipes/; # copy it into the per-run tree so the rest of the run reads the staged state. Never set in # production CI. if os.environ.get("CCCI_SKIP_FETCH") == "1": canonical = os.path.expanduser(f"~/.abra/recipes/{recipe}") subprocess.run(["rm", "-rf", dest], check=False) if os.path.isdir(canonical): shutil.copytree(canonical, dest, symlinks=True) print( f"[fetch] CCCI_SKIP_FETCH=1 — using staged {recipe} clone as-is " f"(copied {canonical} -> per-run tree)", flush=True, ) return if src and ref: url = f"https://git.autonomic.zone/{src}.git" git = ["git"] tok = _gitea_token() if tok: git += ["-c", f"http.extraHeader=Authorization: token {tok}"] subprocess.run(["rm", "-rf", dest], check=False) subprocess.run([*git, "clone", "--quiet", url, dest], check=True) subprocess.run([*git, "-C", dest, "checkout", "--quiet", ref], check=True) # Bring in published version TAGS from the public upstream so the upgrade tier can deploy a # previous published version (mirror PR branches carry no release tags). Read-only + plain git # (no bot token to a foreign host). Non-fatal: if unreachable, upgrade degrades to a skip. upstream = f"https://git.coopcloud.tech/coop-cloud/{recipe}.git" subprocess.run( ["git", "-C", dest, "fetch", "--quiet", upstream, "refs/tags/*:refs/tags/*"], check=False, ) else: # Clean re-fetch from the catalogue. rm first so a leftover dir from a prior SRC+REF run # (origin → private mirror, maybe lacking tags) can't poison the catalogue fetch. subprocess.run(["rm", "-rf", dest], check=False) subprocess.run(["abra", "recipe", "fetch", recipe, "-n"], check=True) def snapshot_recipe_tests(recipe: str) -> str | None: """Copy the recipe-shipped tests/ to a stable temp dir, immune to abra re-checking-out the recipe to a version tag during the run. Returns the snapshot path, or None if no tests/.""" src = os.path.join(abra.recipe_dir(recipe), "tests") if not os.path.isdir(src): return None has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile( os.path.join(src, "install_steps.sh") ) if not has_overlay: return None dst = os.path.join(tempfile.gettempdir(), f"ccci-recipe-tests-{recipe}") shutil.rmtree(dst, ignore_errors=True) shutil.copytree(src, dst) return dst def _tier_env(domain: str) -> dict: return dict(os.environ, CCCI_APP_DOMAIN=domain, CCCI_BASE_URL=f"https://{domain}") def skip_generic_env_overrides() -> list[str]: """Active CCCI_SKIP_GENERIC* env overrides (rcust P2c: the meta key is deleted; the env form is a documented LOCAL-DEV-ONLY escape hatch). Surfaced loudly when set in a CI (drone) run — it reduces generic-floor coverage and must never silently ride a CI verdict.""" return sorted( k for k in os.environ if k.startswith("CCCI_SKIP_GENERIC") and _truthy(os.environ.get(k)) ) def _skip_generic(op: str) -> bool: """Whether the generic assertion for `op` is opted out (Phase 1e HC3). Default: run (additive). Opt-out via env only (dev-only escape hatch, P2c): CCCI_SKIP_GENERIC (all ops) or CCCI_SKIP_GENERIC_. The recipe_meta SKIP_GENERIC key is deleted (zero users).""" if _truthy(os.environ.get("CCCI_SKIP_GENERIC")): return True return _truthy(os.environ.get(f"CCCI_SKIP_GENERIC_{op.upper()}")) def _run_pre_hook(recipe: str, op: str, repo_local: str | None, domain: str, meta) -> None: """Run the optional pre-op seed hook (recipe ops.py `pre_`) BEFORE the harness performs the op (HC3 op/assertion split): overlays seed data-continuity markers / the backup→restore mutation here, then assert post-op in test_.py. cc-ci's ops.py is trusted; a repo-local ops.py is consulted only for allowlist-approved recipes (HC2 gate is inside discovery.pre_op_hook). Imported in-process; the recipe dir is put on sys.path so an ops.py can import its sibling helpers.""" hook = discovery.pre_op_hook(recipe, op, repo_local) if not hook: return source, path = hook d = os.path.dirname(path) sys.path.insert(0, d) try: spec = importlib.util.spec_from_file_location(f"ccci_ops_{recipe}_{op}", path) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) print(f" pre-op seed ({source}): {os.path.relpath(path, ROOT)}::pre_{op}", flush=True) fn = getattr(mod, f"pre_{op}") # Uniform ctx convention (rcust P3): pre_(ctx). A legacy (domain, meta) hook fails # HERE with a clear migration message, not a TypeError mid-call. meta_mod.check_hook_signature(fn, ("ctx",), f"{os.path.relpath(path, ROOT)}::pre_{op}") fn(meta_mod.hook_ctx(domain, meta, op=op)) finally: if d in sys.path: sys.path.remove(d) def _perform_op( op: str, domain: str, recipe: str, head_ref: str | None, op_state: dict, deploy_timeout: int = 900, meta=None, ) -> None: """Perform the single mutating op ONCE (the harness owns the op, HC3). install has no op. Records what the assertions need (pre-upgrade identity, backup snapshot_id) into op_state. None of these call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos upgrade is not a new install (HC1 reconciliation). `deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the upgrade chaos redeploy so a heavy reconverge isn't SIGKILLed by the 900s default mid-wait; `meta` lets the upgrade op own a recipe-aware convergence+health wait (F2-12, READY_PROBE).""" if op == "upgrade": before = generic.perform_upgrade( domain, recipe, head_ref, deploy_timeout=deploy_timeout, meta=meta ) op_state["upgrade"] = {"before": before, "head_ref": head_ref} elif op == "backup": # Backup integrity + retry (F2-14b). A recipe may define BACKUP_VERIFY(domain) -> bool that # confirms the backup actually captured the recipe's critical data AFTER the op. This guards a # real race: a DB recipe dumps its data in a backupbot pre-hook, but if the DB container cycles # mid-dump (intermittent under host load) the dump is truncated/absent, so restic snapshots an # empty path — `abra app backup create` still "succeeds", yet a later restore silently loses the # data (ghost: backup.sql.gz never written → restore can't reimport → seeded row gone). When # verify fails we re-run the WHOLE backup (fresh restic snapshot) with a re-stabilised DB, up to # 3 attempts. Recipes without BACKUP_VERIFY are unaffected (single backup, as before). snap = generic.perform_backup(domain) verify = meta.BACKUP_VERIFY if meta else None verify_ctx = meta_mod.hook_ctx(domain, meta, op="backup") if meta else None attempt = 1 while callable(verify) and not verify(verify_ctx) and attempt < 3: attempt += 1 print( f" backup-verify FAILED (attempt {attempt - 1}/3) — backup did not capture the " f"recipe's critical data (e.g. DB cycled mid-dump); re-running backup", flush=True, ) snap = generic.perform_backup(domain) if callable(verify) and not verify(verify_ctx): print( f" !! backup-verify still FAILED after {attempt} attempts — backup is incomplete", flush=True, ) op_state["backup"] = {"snapshot_id": snap} elif op == "restore": generic.perform_restore(domain) # install: already deployed; no op def run_lifecycle_tier( recipe: str, op: str, repo_local: str | None, domain: str, meta, head_ref: str | None, op_state: dict, records: list[dict] | None = None, junit_dir: str | None = None, ) -> str: """Additive lifecycle tier (HC3): seed (pre-op hook) → perform the op ONCE → run the generic assertion file (unless opted out) AND the overlay assertion file, both against the shared post-op deployment. The upgrade op redeploys the PR head (head_ref) via chaos (HC1). Returns 'pass' | 'fail' | 'skip'. Phase 3 (R1/R3): when `records`/`junit_dir` are given, each pytest file is run with --junitxml and a {tier,source,file,rc,junit} record appended, so the run can assemble per-stage/per-test results.json + the level afterwards. Purely additive — does not change the verdict.""" overlay = discovery.resolve_overlay_op(recipe, op, repo_local) skip_gen = _skip_generic(op) files: list[tuple[str, str]] = [] if not skip_gen: files.append(discovery.generic_op(op)) if overlay: files.append(overlay) if not files: # generic opted out AND no overlay → nothing would assert; don't perform a pointless mutating op print(f"\n===== TIER: {op} — SKIP (generic opted out, no overlay) =====", flush=True) return "skip" ov = f"{overlay[0]}:{os.path.relpath(overlay[1], ROOT)}" if overlay else "none" print( f"\n===== TIER: {op} (generic={'skip' if skip_gen else 'run'}, overlay={ov}) =====", flush=True, ) # 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure → tier fail. try: _run_pre_hook(recipe, op, repo_local, domain, meta) _perform_op( op, domain, recipe, head_ref, op_state, deploy_timeout=int(meta.DEPLOY_TIMEOUT), meta=meta, ) with open(os.environ["CCCI_OP_STATE_FILE"], "w") as f: json.dump(op_state, f) except Exception as e: # noqa: BLE001 — a failed op is a reported tier failure, not a crash print(f"!! {op} op failed: {_scrub(str(e))}", flush=True) return "fail" # 3) assertions: generic (unless opted out) + overlay, each its own pytest, all against the # single post-op deployment. Generic runs first so an overlay may assume readiness. rc_all = 0 for source, path in files: print(f" assert ({source}): {os.path.relpath(path, ROOT)}", flush=True) cmd = [sys.executable, "-m", "pytest", "-v", "-rA", path] jx = None if junit_dir is not None: jx = results_mod.junit_file(junit_dir, op, source, path) cmd.append(f"--junitxml={jx}") rc = run_redacted(cmd, env=_tier_env(domain)) if records is not None: records.append( { "tier": op, "source": source, "file": os.path.relpath(path, ROOT), "rc": rc, "junit": jx, } ) if rc != 0: rc_all = rc return "pass" if rc_all == 0 else "fail" def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) -> dict[str, dict]: """For each dep, set up a fresh realm/client + test user via the harness's provider-specific setup function, then return a recipe→entry dict carrying domain + admin + realm/client/user info — the shape the `install_steps.sh` hook (and dependent tests) read. Provider routing: keycloak (OIDC realm/client) and gitea (OAuth2 app for drone) are supported. authentik will need a parallel `setup_authentik_realm` when an authentik-dep recipe enrolls (DEFERRED.md #9). """ from harness import sso, warm # local import — sso may not be needed for dep-less runs out: dict[str, dict] = {} for entry in deps_list or []: dep_recipe = entry.get("recipe") dep_domain = entry.get("domain") if not dep_recipe or not dep_domain: continue if dep_recipe == "gitea": # Gitea dep provider (phase drone): create admin user + OAuth2 app so the # dependent recipe's install_steps.sh can wire DRONE_GITEA_* before deploy. creds = sso.setup_gitea_oauth(dep_domain, parent_domain) out[dep_recipe] = { "recipe": dep_recipe, "domain": dep_domain, "admin_user": creds["admin_user"], "admin_password": creds["admin_password"], "client_id": creds["client_id"], "client_secret": creds["client_secret"], } continue if dep_recipe != "keycloak": # Provider not yet supported — record bare entry; install_steps.sh / tests will # raise if they need realm/client info they don't see. out[dep_recipe] = entry continue # The realm is the per-run isolation unit on a (possibly shared live-warm) keycloak: name it # "-<6hex>" so concurrent dependents — even two PRs of the SAME recipe — never # collide on a realm (WC1). client_id stays the parent recipe name (isolated within the # unique realm; predictable for debugging). realm = warm.realm_for(parent_recipe, parent_domain) client_id = parent_recipe creds = sso.setup_keycloak_realm( dep_domain, realm=realm, client_id=client_id, redirect_uris=[f"https://{parent_domain}/*"], web_origins=[f"https://{parent_domain}"], ) out[dep_recipe] = { "recipe": dep_recipe, "domain": dep_domain, "warm": bool(entry.get("warm")), "realm": creds["realm"], "client_id": creds["client_id"], "client_secret": creds["client_secret"], "user": creds["user"], "password": creds["password"], "email": creds["email"], "discovery_url": creds["discovery_url"], "token_url": creds["token_url"], "auth_url": creds["auth_url"], "userinfo_url": creds["userinfo_url"], "admin_user": "admin", "admin_password": sso.admin_password_inside(dep_domain), } return out def _provision_deps( recipe: str, domain: str, ref: str | None, declared: list[str] ) -> dict[str, dict]: """Provision a run's declared deps and write `$CCCI_DEPS_FILE`; return the recipe→entry deps_state. Splits deps into live-warm (shared provider at a stable domain + a per-run realm) vs cold (co-deployed per run), provisions each dep's SSO realm/client/user, and persists the enriched dict the `install_steps.sh` hooks + dependent tests read. Raises on any failure (the caller marks deps-not-ready). Install-time wiring is the ONLY mode (rcust P2b): provision BEFORE the single deploy so the install-tier `install_steps.sh` hook wires OIDC env into that one deploy — no reconverge, no post-deploy `setup_custom_tests.sh` machinery. """ warm_deps, cold_deps = [], [] for d in declared: wd = warm.warm_domain(d) if wd and warm.is_warm_up(d, wd): warm_deps.append(d) else: if wd: print(f" dep: {d} warm provider {wd} not up — cold fallback", flush=True) cold_deps.append(d) dep_metas = {d: meta_mod.load(d) for d in cold_deps} deps_list = ( deps_mod.deploy_deps(recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=dep_metas) if cold_deps else [] ) for d in warm_deps: wd = warm.warm_domain(d) reaped = warm.reap_orphan_realms(d, wd) if reaped: print(f" dep: reaped {len(reaped)} orphan realm(s) on warm {d}: {reaped}", flush=True) deps_list.append({"recipe": d, "domain": wd, "warm": True}) print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True) deps_state = _enrich_deps_with_sso(recipe, domain, deps_list) deps_mod.write_run_state(deps_state) return deps_state def run_custom( recipe: str, repo_local: str | None, domain: str, records: list[dict] | None = None, junit_dir: str | None = None, ) -> str: """Run all discovered non-lifecycle custom test_*.py (both locations, additive). Returns 'skip' if none defined, else 'pass'/'fail'. Phase 3: emits JUnit + records when given.""" customs = discovery.custom_tests(recipe, repo_local) if not customs: return "skip" print("\n===== TIER: custom =====", flush=True) rc_all = 0 for source, path in customs: rel = os.path.relpath(path, ROOT) print(f" custom ({source}): {rel}", flush=True) cmd = [sys.executable, "-m", "pytest", "-v", "-rA", path] jx = None if junit_dir is not None: jx = results_mod.junit_file(junit_dir, "custom", source, path) cmd.append(f"--junitxml={jx}") rc = run_redacted(cmd, env=_tier_env(domain)) if records is not None: records.append({"tier": "custom", "source": source, "file": rel, "rc": rc, "junit": jx}) if rc != 0: rc_all = rc return "pass" if rc_all == 0 else "fail" def _wait_undeployed(domain: str, timeout: int = 120) -> None: """Block until the stack's services are gone after an undeploy (so warmsnap.restore, which requires undeployed, doesn't race a half-removed stack).""" stack = lifecycle._stack_name(domain) # noqa: SLF001 deadline = time.time() + timeout while time.time() < deadline: if not lifecycle._docker_names("service", stack): # noqa: SLF001 return time.sleep(2) def run_quick( recipe: str, ref: str | None, head_ref: str | None, repo_local: str | None, meta ) -> int: """WC4 `--quick` opt-in fast lane (plan §2). Reattach the data-warm canonical (known-good volume) → upgrade IN PLACE to the PR head (chaos) → assert generic UPGRADE (reconverge+moved+serving) + overlay + custom. PASS → undeploy-keep-volume, **known-good UNCHANGED (NEVER promote)**; FAIL → restore the last-known-good snapshot + undeploy (roll back, data safe). Lower-confidence; does NOT gate merge (WC7). Caller has confirmed a canonical exists. NB: the deps wiring + temp-state scaffolding intentionally mirror main()'s cold path rather than refactoring it — keeping the gate-passed cold flow byte-identical (zero regression risk).""" import contextlib domain = canonical.canonical_domain(recipe) reg = canonical.read_registry(recipe) or {} print( f"\n== cc-ci run [MODE=quick]: recipe={recipe} canonical={domain} " f"known-good={reg.get('version')} ref={ref}\n" " quick = LOWER-CONFIDENCE opt-in fast lane; does NOT gate merge; NEVER promotes the canonical", flush=True, ) statefile = _run_state_path("opstate") + ".json" with open(statefile, "w") as f: json.dump({}, f) os.environ["CCCI_OP_STATE_FILE"] = statefile depsfile = _run_state_path("deps") + ".json" with open(depsfile, "w") as f: json.dump({}, f) os.environ["CCCI_DEPS_FILE"] = depsfile skipfile = _run_state_path("depskip") + ".txt" with contextlib.suppress(OSError): os.remove(skipfile) os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile op_state: dict = {} results: dict[str, str] = {} declared = list(meta.DEPS) deps_state: dict = {} deps_ready = True deps_not_ready_reason = "" dep_teardown_error: str | None = None warm_ok = False rolled_back = False lifecycle.janitor() try: # 1) reattach the canonical (warm boot at the known-good version + retained volume) try: canonical.deploy_canonical(recipe, timeout=int(meta.DEPLOY_TIMEOUT)) lifecycle.wait_healthy( domain, ok_codes=tuple(meta.HEALTH_OK), path=meta.HEALTH_PATH, deploy_timeout=meta.DEPLOY_TIMEOUT, http_timeout=meta.HTTP_TIMEOUT, ) warm_ok = True except Exception as e: # noqa: BLE001 print(f"!! canonical reattach/readiness failed: {_scrub(str(e))}", flush=True) if warm_ok: # 2) deps (warm keycloak + per-run realm) — mirrors main()'s warm/cold split. NB # (rcust P2b): deps are provisioned (realm/creds in $CCCI_DEPS_FILE) but quick mode # cannot do install-time OIDC env wiring — the canonical app pre-exists its per-run # realm. No quick-enrolled recipe declares DEPS today; if one ever does, its # requires_deps tests will exercise creds-only flows or skip (F2-11 keeps the signal). if declared: print(f"\n===== deps (quick): {declared} =====", flush=True) try: warm_deps, cold_deps = [], [] for d in declared: wd = warm.warm_domain(d) (warm_deps if (wd and warm.is_warm_up(d, wd)) else cold_deps).append(d) dep_metas = {d: meta_mod.load(d) for d in cold_deps} deps_list = ( deps_mod.deploy_deps( recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=dep_metas ) if cold_deps else [] ) for d in warm_deps: wd = warm.warm_domain(d) warm.reap_orphan_realms(d, wd) deps_list.append({"recipe": d, "domain": wd, "warm": True}) print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True) deps_state = _enrich_deps_with_sso(recipe, domain, deps_list) deps_mod.write_run_state(deps_state) except Exception as e: # noqa: BLE001 deps_ready = False deps_not_ready_reason = _scrub(str(e))[:300] print( f"!! dep provisioning failed (deps-not-ready): {deps_not_ready_reason}", flush=True, ) # 3) UPGRADE to PR head (chaos) + assert (generic reconverge+moved+serving + overlay) results["upgrade"] = run_lifecycle_tier( recipe, "upgrade", repo_local, domain, meta, head_ref, op_state ) # 4) custom tier os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0" os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason results["custom"] = run_custom(recipe, repo_local, domain) else: results["upgrade"] = "fail" results["custom"] = "skip" finally: # Teardown funnel running: further SIGTERM/SIGALRM are logged + ignored (lifetime.py). lifetime.begin_teardown() # F2-11 skip count (read before deciding pass/fail) requires_deps_skipped = 0 try: with open(skipfile) as f: requires_deps_skipped = sum(int(x) for x in f.read().split() if x.strip()) except OSError: pass sso_unverified = sso_dep_unverified(declared, deps_ready, requires_deps_skipped) passed = ( warm_ok and bool(results) and all(v != "fail" for v in results.values()) and not sso_unverified ) # dep teardown: delete per-run warm realms; undeploy cold deps (mirrors cold) if deps_state: ordered = ( [deps_state[d] for d in declared if d in deps_state] if isinstance(deps_state, dict) else deps_state ) for e in [x for x in ordered if x.get("warm")]: try: from harness import sso sso.delete_keycloak_realm(e["domain"], e["realm"]) print( f" dep: deleted per-run realm {e['realm']} on warm {e['recipe']}", flush=True, ) except Exception as ex: # noqa: BLE001 dep_teardown_error = f"warm realm delete failed for {e.get('realm')}: {ex}" print(f"!! {dep_teardown_error}", flush=True) try: deps_mod.teardown_deps([x for x in ordered if not x.get("warm")]) except lifecycle.TeardownError as e: dep_teardown_error = str(e) print(f"!! {dep_teardown_error}", flush=True) # canonical teardown — the WC4 contract: # PASS → undeploy, KEEP volume, known-good UNCHANGED (never promote) # FAIL → restore last-known-good snapshot (data safe) then leave undeployed (idle) try: if warm_ok and passed: canonical.undeploy_keep_volume(recipe) print( " quick PASS → canonical undeployed, volume retained, known-good UNCHANGED", flush=True, ) elif warm_ok: print( " quick FAIL → rolling back canonical to last-known-good snapshot", flush=True ) abra.undeploy(domain) _wait_undeployed(domain) warmsnap.restore(recipe, domain) # reset recorded version to the known-good (the failed upgrade set TYPE to the broken # PR commit) so the idle canonical's .env agrees with the registry + re-warms cleanly. if reg.get("version"): abra.env_set(domain, "TYPE", f"{recipe}:{reg['version']}") canonical._set_status(recipe, "idle") # noqa: SLF001 rolled_back = True print( " quick FAIL → restored known-good data; canonical idle (NOT promoted)", flush=True, ) except Exception as e: # noqa: BLE001 dep_teardown_error = (dep_teardown_error or "") + f" | quick teardown/rollback: {e}" print(f"!! quick teardown/rollback error: {e}", flush=True) with contextlib.suppress(OSError): os.remove(statefile) with contextlib.suppress(OSError): os.remove(depsfile) with contextlib.suppress(OSError): os.remove(skipfile) print("\n===== RUN SUMMARY =====", flush=True) print("mode = quick (LOWER-CONFIDENCE; opt-in; does not gate merge)") print( f"canonical = {domain} known-good = {reg.get('version')} (UNCHANGED; quick never promotes)" ) if rolled_back: print("rolled-back = yes (restored last-known-good snapshot)") for op in ("upgrade", "custom"): if op in results: suffix = "" if op == "custom" and requires_deps_skipped: suffix = f" ({requires_deps_skipped} requires_deps SKIPPED — SSO UNVERIFIED)" print(f" {op:8s}: {results[op]}{suffix}") overall = 0 if any(v == "fail" for v in results.values()) or not warm_ok: overall = 1 if sso_unverified: print( f"!! DEPS={declared} but dep provisioning failed and {requires_deps_skipped} " "requires_deps SKIPPED — SSO NOT verified (F2-11)", file=sys.stderr, ) overall = 1 if dep_teardown_error: print(f"!! teardown leaked/erred: {dep_teardown_error}", file=sys.stderr) overall = 1 if not results: print("no tiers ran", file=sys.stderr) return 1 return overall def should_promote_canonical( recipe: str, ref: str | None, overall: int, quick: bool, tagged: bool ) -> bool: """WC5 gate (pure): a run advances/seeds the canonical iff the recipe is enrolled (WARM_CANONICAL), the run was GREEN (overall==0), it was COLD (not --quick), it ran on LATEST (no PR head → `ref` empty: the nightly sweep or a manual `RECIPE=` run), AND the tested head version corresponds to a published release TAG (`tagged`, phase canon §2.A). A PR `!testme` carries REF=PR-head and must NOT promote to a PR's code. An UNTAGGED head (a `main` commit with no release tag for its version) must never become a canonical — the canonical is always a real release. `tagged` is computed by the caller via warm_reconcile.is_released_version so this gate stays pure. Only cold-on-latest-and-tagged advances it.""" return canonical.is_enrolled(recipe) and overall == 0 and not quick and not ref and tagged def promote_canonical( recipe: str, head_ref: str | None, version: str | None, repo_local: str | None = None ) -> None: """canon §2.A / WC5: (re)seed the canonical at the GREEN-VERIFIED TESTED RELEASE `version` — the exact version under test (head_version), which the should_promote tagged-gate guarantees is a published release tag. Deploy `warm-` at that version as a FAITHFUL install (the same wiring the cold install used — deps + install_steps + overlay + secrets), wait healthy, undeploy, snapshot + record the registry (atomic replace of the last-known-good). The warm deploy must reproduce the cold install, not a bare `abra app deploy` (canon M2 finding): - CLEAN the recipe tree first. The sweep's run_on_tag sets CCCI_SKIP_FETCH=1 so the cold run stages the tag; by promote time that per-run tree was mutated by the tier suite (chaos head checkout + the untracked compose.ccci.yml overlay), which makes `abra app new` FATA "locally unstaged changes". A forced re-checkout of the tag + `git clean -fd` restores a pristine tree. - PROVISION DEPS (OIDC realms) + run INSTALL_STEPS, exactly like the cold install. Without these, recipes whose healthy state depends on them fail the warm deploy though the cold test was green — e.g. bluesky-pds (install_steps inserts the non-generatable pds_plc_rotation_key), custom-html-tiny (install_steps seeds index.html), and any DEPS recipe (OIDC env). Promotes EXACTLY the tested version (never re-derives `latest_version`). The OLD known-good is replaced ONLY here, after green (never lost on a red run).""" domain = canonical.canonical_domain(recipe) if not version: print(f"WC5 promote: no tested release version for {recipe} — skip", flush=True) return warm_reconcile.fetch_recipe( recipe ) # no-op under CCCI_SKIP_FETCH; real fetch on the manual path meta = meta_mod.load(recipe) # The cold run's deploy-count was already asserted + the countfile removed; don't perturb it. os.environ.pop("CCCI_DEPLOY_COUNT_FILE", None) # FRESH SEED only (no existing canonical): clear any leftover warm- stack state from a # PRIOR FAILED promote attempt (secrets/.env/partial volumes). Without this, a recipe whose # install_steps inserts a non-generatable secret (e.g. drone's gitea client_secret) FATAs # "secret … already exists" on the retry, so a once-failed promote can never recover. A # re-promote (canonical EXISTS) must NOT teardown — it reattaches its retained known-good volume. if not canonical.read_registry(recipe): lifecycle.teardown_app(canonical.canonical_domain(recipe), verify=False) # Pristine tree at the tag: discard the cold run's tier mutations + untracked overlay so the # pinned `abra app new` clean-tree gate passes (deploy_app re-applies the overlay + auto-chaos). abra.recipe_checkout(recipe, version) subprocess.run( ["git", "-C", abra.recipe_dir(recipe), "clean", "-fd"], capture_output=True, text=True ) print( f"\n===== WC5 promote-on-green-cold: (re)seed canonical {recipe} @ {version} =====", flush=True, ) # Faithful install wiring: deps (OIDC) then install_steps (via deploy_app's hook), same as cold. # Release the cold run's process-lifetime app/dep locks first: the cold test + its deps are torn # down by now, but their locks are still held by THIS process, so re-provisioning a COLD dep # (e.g. drone→gitea) would self-deadlock on acquire_app_lock. Serial sweep → safe to release. lifecycle.release_app_locks() declared = list(meta.DEPS) if declared: try: _provision_deps(recipe, domain, None, declared) print(f" WC5 promote: provisioned deps {declared} for warm {domain}", flush=True) except Exception as e: # noqa: BLE001 — log; deploy may still come up for non-blocking deps print( f" WC5 promote: dep provisioning failed ({_scrub(str(e))}) — deploying anyway", flush=True, ) hook = discovery.install_steps(recipe, repo_local) lifecycle.deploy_app( recipe, domain, version=version, secrets=True, install_steps_hook=hook, deploy_timeout=int(meta.DEPLOY_TIMEOUT), meta=meta, ) lifecycle.wait_healthy( domain, ok_codes=tuple(meta.HEALTH_OK), path=meta.HEALTH_PATH, deploy_timeout=meta.DEPLOY_TIMEOUT, http_timeout=meta.HTTP_TIMEOUT, ) lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.DEPLOY_TIMEOUT), op="install") abra.undeploy(domain) _wait_undeployed(domain) canonical.seed_canonical(recipe, version, commit=head_ref) print( f"WC5 promote: canonical {recipe} advanced to known-good {version} (idle, volume retained)", flush=True, ) def main() -> int: # P1 lock-lifetime hardening: PDEATHSIG + SIGTERM/SIGALRM teardown funnel + 60-min hard # deadline, armed before ANY abra call or lock acquisition (see harness/lifetime.py). lifetime.install_lifetime_guards() recipe = os.environ.get("RECIPE") if not recipe: print("RECIPE env is required", file=sys.stderr) return 2 ref = os.environ.get("REF") or None src = os.environ.get("SRC") or None target = os.environ.get("VERSION") or None stages = { s.strip() for s in os.environ.get("STAGES", ",".join(ALL_STAGES)).split(",") if s.strip() } print( f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}" ) # P2c: the CCCI_SKIP_GENERIC* env escape hatch is LOCAL-DEV-ONLY. If it rides a CI (drone) # run, shout — generic-floor coverage is reduced and the verdict must not look routine. for ov in skip_generic_env_overrides(): if os.environ.get("DRONE"): print( f"!! {ov}=1 — dev-only generic-floor override ACTIVE IN A CI RUN; generic " "assertions are suppressed for the affected op(s). This must never gate a merge.", flush=True, ) else: print(f"== {ov}=1 (dev-only generic-floor override active)", flush=True) # Concurrent-run safety is structural: this run's recipe trees live in its own ABRA_DIR # (exported here, before ANY abra call), so no recipe-tree lock exists; same-DOMAIN runs # serialise on the app-domain flock taken in deploy_app (see docs/concurrency.md). setup_run_abra_dir() fetch_recipe(recipe, ref, src) # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test # (HC1). Always resolve to the actual git SHA — `ref` may be a branch name ("main") which fails # the HC1 commit-identity check (chaos-version is always a SHA). recipe_head_commit runs # git-rev-parse HEAD, which returns the SHA of wherever the fetch/checkout landed. head_ref = lifecycle.recipe_head_commit(recipe) repo_local = snapshot_recipe_tests(recipe) meta = meta_mod.load(recipe) # Customization manifest (rcust P5, R4): ONE block answering "what does this recipe # customize?" across all surfaces — printed here and embedded verbatim in results.json under # "customization". Pure presentation; never influences a verdict. customization = manifest_mod.build(recipe, meta, repo_local) print("\n" + manifest_mod.render(recipe, customization) + "\n", flush=True) # WC4/WC7: opt-in `--quick` fast lane. Requires an existing data-warm canonical; if none, fall # back cleanly to the full COLD run below so the PR is still tested (DECISIONS Phase-2w). if os.environ.get("CCCI_QUICK") == "1" or os.environ.get("MODE") == "quick": if canonical.has_canonical(recipe): return run_quick(recipe, ref, head_ref, repo_local, meta) print( f"MODE=quick requested but no canonical for {recipe} — falling back to COLD run " "(no-canonical fallback, WC7)", flush=True, ) domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref) head_version = abra.head_compose_version(recipe) base_plan = resolve_upgrade_base( stages, meta, recipe, head_ref=head_ref, head_version=head_version ) prev = base_plan.runs # gates the upgrade tier # base deploy target: a pinned published version (kind=version) or main-tip commit (kind=ref); # on skip fall back to the run's VERSION/head (target=None → chaos head deploy, as before). base = base_plan.version or target base_ref = base_plan.ref prev_status = lifecycle.previous_status(recipe, base_plan.kind, base_plan.version) print( f"== upgrade base: kind={base_plan.kind} " f"{('version=' + base) if base_plan.kind == 'version' else ''}" f"{('ref=' + (base_ref or '')[:12]) if base_plan.kind == 'ref' else ''}" f"{(' SKIP: ' + base_plan.reason) if base_plan.kind == 'skip' else ''} " f"({base_plan.reason if base_plan.kind != 'skip' else ''})", flush=True, ) if prev_status["stale"]: print(f"!! previous/ STALE — {prev_status['reason']}", flush=True) elif prev_status["apply"]: print(f"== previous/ applies to the base deploy (targets {base})", flush=True) backup_cap = generic.backup_capable(recipe, meta) hook = discovery.install_steps(recipe, repo_local) # Deploy-count guard (DG4.1): exactly one deploy_app() per run. countfile = _run_state_path("deploys") with open(countfile, "w") as f: f.write("0") os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile # Phase 3 (R1/R3): per-run artifact dir + JUnit dir. The tiers emit JUnit per file and append a # {tier,source,file,rc,junit} record; after the run we assemble results.json (per-stage/per-test + # level) into the artifact dir. Best-effort — never changes the verdict (R7). run_artifact_dir = os.path.join(results_mod.runs_dir(), results_mod.run_id()) junit_dir = os.path.join(run_artifact_dir, "junit") records: list[dict] = [] # L5 lint rung (phase lvl5): `abra recipe lint` against the EXACT tested ref, in a pristine # scratch clone (harness.lint — the per-run tree is still at head_ref here, before any # version-pinning checkout). Level rung only — NEVER the verdict: run_lint catches every # failure mode into status "unver" (60s hard budget) and this belt-and-braces wrap makes a # crashed executor identical to "could not verify". lint_result = {"status": "unver", "detail": "lint executor crashed", "rules_failed": []} try: lint_result = lint_mod.run_lint(recipe, head_ref, run_artifact_dir) except Exception as e: # noqa: BLE001 — lint is a rung, not a gate; never touches the verdict print( f"!! lint rung executor crashed (non-fatal, rung=unver): {_scrub(str(e))}", flush=True ) print( f"lint rung: {lint_result['status']}" f"{' — ' + lint_result['detail'] if lint_result.get('detail') else ''}", flush=True, ) with contextlib.suppress(OSError): os.makedirs(junit_dir, exist_ok=True) # Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup # snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state(). statefile = _run_state_path("opstate") + ".json" with open(statefile, "w") as f: json.dump({}, f) os.environ["CCCI_OP_STATE_FILE"] = statefile op_state: dict = {} # Run-scoped dep state (Phase 2 Q2.3; install-time-only since rcust P2b): deps are provisioned # BEFORE the single deploy so install_steps.sh wires OIDC env into that one deploy. # `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable). depsfile = _run_state_path("deps") + ".json" with open(depsfile, "w") as f: json.dump({}, f) os.environ["CCCI_DEPS_FILE"] = depsfile # F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here. skipfile = _run_state_path("depskip") + ".txt" with contextlib.suppress(OSError): os.remove(skipfile) os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile declared = list(meta.DEPS) if declared: print(f"\n===== DEPS declared (provision BEFORE deploy): {declared} =====", flush=True) deps_state: dict[str, dict] = {} # new shape: recipe→entry dict (sso-dep plan §1) deps_ready = True deps_not_ready_reason: str = "" results: dict[str, str] = {} lifecycle.janitor() dep_teardown_error: str | None = None screenshot_rel: str | None = None # Phase 3 U1 (R4): set once the app screenshot is captured try: # ---- (Q3.2a) install-time OIDC: provision the warm-dep realm BEFORE the single deploy so # install_steps.sh can read $CCCI_DEPS_FILE and wire the OIDC env into that one deploy. On # failure we mark deps-not-ready but STILL deploy the recipe alone (install_steps.sh no-ops # on an empty deps file) so the generic tiers run; the OIDC custom test then skips → F2-11. ---- if declared: print( f"\n===== install-time OIDC: provisioning deps {declared} BEFORE deploy =====", flush=True, ) try: deps_state = _provision_deps(recipe, domain, ref, declared) print( " install-time OIDC: deps provisioned; install_steps.sh will wire OIDC env", flush=True, ) except Exception as e: # noqa: BLE001 — isolated; recipe still deploys, OIDC test skips deps_ready = False deps_not_ready_reason = _scrub(str(e))[:300] print( f"!! install-time dep provisioning failed (deps-not-ready): {deps_not_ready_reason}", flush=True, ) # ---- deploy RECIPE FIRST, alone (no deps yet — generic tiers run recipe-only) ---- try: lifecycle.deploy_app( recipe, domain, version=base, base_ref=base_ref, apply_previous=prev_status["apply"], secrets=True, install_steps_hook=hook, deploy_timeout=int(meta.DEPLOY_TIMEOUT), meta=meta, ) lifecycle.wait_healthy( domain, ok_codes=tuple(meta.HEALTH_OK), path=meta.HEALTH_PATH, deploy_timeout=meta.DEPLOY_TIMEOUT, http_timeout=meta.HTTP_TIMEOUT, ) # Recipe READY_PROBE (e.g. lasuite-drive collabora WOPI discovery) — readiness beyond # replica convergence + app HEALTH_PATH; no-op for recipes without one. lifecycle.wait_ready_probes( meta, domain, timeout=int(meta.DEPLOY_TIMEOUT), op="install" ) deploy_ok = True except Exception as e: # noqa: BLE001 — a failed deploy is a reported INSTALL failure print(f"!! deploy/readiness failed: {e}", flush=True) deploy_ok = False # ---- Phase 3 U1 (R4): capture a real app screenshot while the app is up, at the cleanest # "freshly installed + healthy" moment (before any tier mutates state and before teardown). # Placed OUTSIDE the deploy try/except so a screenshot issue can NEVER flip deploy_ok. # Secret-safe by default (landing page, never a credentials page; recipes opt into a # post-login view via a SCREENSHOT meta hook). Best-effort — capture() swallows all errors and # returns None, so this never blocks or fails the run (R7). None → results.json `screenshot` # stays null → the card shows the "no screenshot" placeholder (cosmetics never change verdict). if deploy_ok: # capture() already swallows all errors → None; the extra try/except is defense-in-depth # (U5 R7 hardening) so a screenshot can NEVER fail/crash the run even if that internal # contract regresses or a recipe SCREENSHOT hook raises. Cosmetics never change the verdict. try: shot = screenshot_mod.capture( domain, screenshot_mod.screenshot_path(run_artifact_dir), recipe_meta=meta ) screenshot_rel = os.path.basename(shot) if shot else None except Exception as e: # noqa: BLE001 — screenshot is cosmetic; never fail a run on it (R7) print( f"!! screenshot capture raised (non-fatal, verdict unaffected): {_scrub(str(e))}", flush=True, ) # ---- INSTALL tier (always; additive generic + overlay, no op) ---- if "install" in stages: results["install"] = ( run_lifecycle_tier( recipe, "install", repo_local, domain, meta, head_ref, op_state, records=records, junit_dir=junit_dir, ) if deploy_ok else "fail" ) if deploy_ok: # ---- UPGRADE tier (op once → generic + overlay assert) ---- if "upgrade" in stages: results["upgrade"] = ( run_lifecycle_tier( recipe, "upgrade", repo_local, domain, meta, head_ref, op_state, records=records, junit_dir=junit_dir, ) if prev else "skip" # base_plan.kind == "skip": no predecessor / EXPECTED_NA / head==main ) # ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ---- if "backup" in stages: results["backup"] = ( run_lifecycle_tier( recipe, "backup", repo_local, domain, meta, head_ref, op_state, records=records, junit_dir=junit_dir, ) if backup_cap else "skip" ) if "restore" in stages: results["restore"] = ( run_lifecycle_tier( recipe, "restore", repo_local, domain, meta, head_ref, op_state, records=records, junit_dir=junit_dir, ) if backup_cap else "skip" ) # (rcust P2b: install-time deps wiring is the ONLY mode — deps were provisioned BEFORE # the single deploy and install_steps.sh wired the OIDC env into it. The legacy # post-deploy provisioning + setup_custom_tests.sh redeploy machinery is deleted; a # recipe's post-deploy seeding belongs in ops.py pre_install, e.g. lasuite-drive's # MinIO bucket one-shot.) # ---- CUSTOM tier ---- if "custom" in stages: # Pass deps-ready state via env; conftest.py skips @pytest.mark.requires_deps # tests when CCCI_DEPS_READY=0. os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0" os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason results["custom"] = run_custom( recipe, repo_local, domain, records=records, junit_dir=junit_dir ) else: # install failed → the shared deployment is dead; remaining tiers cannot run on it. for op in ("upgrade", "backup", "restore", "custom"): if op in stages: results[op] = "skip" finally: # From here the teardown funnel runs: a SIGTERM/SIGALRM landing now is logged + ignored # (lifetime.py) so a second signal can't abort the cleanup the first one asked for. lifetime.begin_teardown() # Teardown the recipe under test FIRST, then deps in reverse declaration order. # Parent verify=False (Phase 1d): keep as-is so a parent residual doesn't mask a tier # failure. Dep teardown uses verify=True via teardown_deps (F2-5 fix); failures are # captured into dep_teardown_error and surfaced in the run summary + exit code, but # we still print the diagnosable summary first. lifecycle.teardown_app(domain, verify=False) if deps_state: print("\n===== DEPS teardown =====", flush=True) # Flatten the dict-shape state in declaration order; teardown_deps reverses for cold. if isinstance(deps_state, dict): ordered = [deps_state[d] for d in declared if d in deps_state] else: ordered = deps_state # WC1: warm deps are NOT undeployed — we only delete the per-run realm on the shared # live-warm provider (the app stays up for the next run). Cold deps undeploy as before. warm_entries = [e for e in ordered if e.get("warm")] cold_entries = [e for e in ordered if not e.get("warm")] for e in warm_entries: try: from harness import sso sso.delete_keycloak_realm(e["domain"], e["realm"]) print( f" dep: deleted per-run realm {e['realm']} on warm {e['recipe']}", flush=True, ) except Exception as ex: # noqa: BLE001 — a leaked realm is a teardown failure (§9) dep_teardown_error = f"warm realm delete failed for {e.get('realm')}: {ex}" print(f"!! {dep_teardown_error}", flush=True) try: deps_mod.teardown_deps(cold_entries) except lifecycle.TeardownError as e: dep_teardown_error = str(e) print(f"!! {dep_teardown_error}", flush=True) else: # ADV-drone-02 fix: deps_state is empty (enrichment failed after a successful # deploy_deps call). The raw deployed list is still in $CCCI_DEPS_FILE — read it # and tear down any cold deps so they don't orphan at their deterministic domain. raw = deps_mod.load_run_state() if raw: cold_raw = [ e for e in (raw if isinstance(raw, list) else list(raw.values())) if isinstance(e, dict) and not e.get("warm") ] if cold_raw: print("\n===== DEPS teardown (enrichment-failure fallback) =====", flush=True) with contextlib.suppress(lifecycle.TeardownError): deps_mod.teardown_deps(cold_raw) # ---- deploy-count assertion (DG4.1) ---- with open(countfile) as f: deploy_count = int(f.read().strip() or "0") os.remove(countfile) with contextlib.suppress(OSError): os.remove(statefile) with contextlib.suppress(OSError): os.remove(depsfile) # F2-11: sum the requires_deps skip counts conftest recorded across the custom files. requires_deps_skipped = 0 try: with open(skipfile) as f: requires_deps_skipped = sum(int(x) for x in f.read().split() if x.strip()) except OSError: pass with contextlib.suppress(OSError): os.remove(skipfile) # ---- per-op summary (DG6 feed) ---- # SSO-dep plan §1: DG4.1 generalised — one `abra app new` per app in the run (recipe + each # COLD dep). Chaos redeploys are NOT a fresh `app_new` and do NOT increment the count. # WC1: a live-warm dep (keycloak) is NOT deployed by the run — it only gets a per-run realm — so # warm deps contribute 0. So expected = 1 + (number of COLD deps that actually got deployed). _dep_entries = deps_state.values() if isinstance(deps_state, dict) else (deps_state or []) deps_deployed_count = sum( 1 for e in _dep_entries if not (isinstance(e, dict) and e.get("warm")) ) expected_deploy_count = 1 + deps_deployed_count print("\n===== RUN SUMMARY =====", flush=True) print(f"deploy-count = {deploy_count} (expect {expected_deploy_count})") if deps_state: deps_list_for_summary = ( list(deps_state.keys()) if isinstance(deps_state, dict) else [d.get("recipe", "?") for d in deps_state] ) print(f" deps deployed: {deps_list_for_summary}") if not deps_ready: print(f" deps-not-ready: {deps_not_ready_reason}") order = [s for s in ALL_STAGES if s in results] for op in order: suffix = "" # F2-11: annotate the custom tier when requires_deps (SSO) tests were skipped, so a reader # of the summary can't mistake a green custom tier for "SSO verified". if op == "custom" and requires_deps_skipped: suffix = f" ({requires_deps_skipped} requires_deps SKIPPED: deps-not-ready — SSO UNVERIFIED)" print(f" {op:8s}: {results[op]}{suffix}") overall = 0 if deploy_count != expected_deploy_count: print( f"!! deploy-count {deploy_count} != {expected_deploy_count} (DG4.1 violation)", file=sys.stderr, ) overall = 1 if dep_teardown_error: # F2-5: dep teardown leaks violate §9 (teardown sacred); fail the run loudly. print(f"!! dep teardown leaked state: {dep_teardown_error}", file=sys.stderr) overall = 1 if any(v == "fail" for v in results.values()): overall = 1 # F2-11: a deps-declaring recipe whose dep provisioning failed has NOT verified its SSO/OIDC # claim — its requires_deps tests SKIPPED (a skip-only file exits 0, so without this the run # would report GREEN). Fail the run for that recipe; generic-tier results above are untouched. if sso_dep_unverified(declared, deps_ready, requires_deps_skipped): print( f"!! recipe declares DEPS={declared} but dep provisioning failed and " f"{requires_deps_skipped} requires_deps (SSO) test(s) were SKIPPED — SSO claim NOT " f"verified; failing run (F2-11). deps-not-ready: {deps_not_ready_reason}", file=sys.stderr, ) overall = 1 if not results: print("no tiers ran", file=sys.stderr) return 1 # ---- Phase 3 (R1/R3): assemble results.json (per-stage/per-test + computed level). Best-effort: # a failure here NEVER changes `overall` (R7 — cosmetics never block the pipeline). ---- data: dict | None = None try: clean_teardown = (deploy_count == expected_deploy_count) and not dep_teardown_error data = results_mod.build_results( recipe=recipe, version=target or (head_ref[:12] if head_ref else None), pr=os.environ.get("PR", "0"), ref=ref, records=records, results=results, backup_capable=backup_cap, has_upgrade_target=prev is not None, # structural: a deployable upgrade base exists lint=lint_result, # L5 rung (phase lvl5) clean_teardown=clean_teardown, no_secret_leak=True, # narrowed below by an actual scan of the serialised artifact screenshot=screenshot_rel, # Phase 3 U1 (R4): relative PNG name iff capture succeeded finished_ts=time.time(), expected_na=meta.EXPECTED_NA, # declared intentional-skip map (recipe_meta) customization=customization, # rcust P5: the run-start manifest, verbatim ) # Real (if narrow) leak check: no known infra-secret value may appear in the artifact (R7). blob = json.dumps(data) leaked = any(v in blob for v in _REDACT) data["flags"]["no_secret_leak"] = not leaked if leaked: print( "!! results.json leak-scan: a known secret value appeared — scrubbing flag set False", file=sys.stderr, ) path = results_mod.write_results(data) print(f"results.json written: {path} (level={data['level']} of 5)", flush=True) # Surface UNVERIFIED rungs in the CI log (non-blocking, R7): a rung that should have run # and wasn't verified blocks the level above it — fill the coverage, or (where a # declared/structural reason genuinely applies) declare it in EXPECTED_NA. for rung in data.get("skips", {}).get("unintentional", []): print( f"⚠ coverage: rung '{rung}' is UNVERIFIED (did not run / could not be checked) — " f"the level cannot rise above it. Add the missing test/coverage, or declare a " f"genuine inapplicability in tests/{recipe}/recipe_meta.py " f"EXPECTED_NA = {{'{rung}': ''}}.", flush=True, ) except Exception as e: # noqa: BLE001 — results assembly is cosmetic; never fail a run on it (R7) print( f"!! results.json assembly failed (non-fatal, verdict unaffected): {_scrub(str(e))}", file=sys.stderr, ) # ---- Phase 3 U2 (R3/R6): render the summary CARD (HTML→PNG) + level BADGE (SVG) from the # results dict into the run artifact dir, alongside results.json + screenshot.png. The card # REPORTS results.json verbatim — it computes nothing, so it can never look greener than the tiers # (cardinal invariant, plan §6). Separate best-effort block (results.json is already written by # here) — any failure is swallowed and NEVER changes `overall` (R7); a render failure simply means # no summary.png, and U3/U4 fall back to text. ---- if data is not None: try: html_path = os.path.join(run_artifact_dir, "summary.html") with open(html_path, "w", encoding="utf-8") as f: f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot"))) png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png")) # Badge = level only (number + colour) — the per-rung table on the card is the sole # carrier of "why isn't this higher" (operator-specified, phase lvl5). with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f: f.write(card_mod.level_badge_svg(data["level"])) print( f"summary card {'rendered ' + png if png else '(PNG render unavailable)'} + " f"badge.svg written into {run_artifact_dir}", flush=True, ) except Exception as e: # noqa: BLE001 — card/badge are cosmetic; never fail a run (R7) print(f"!! summary card/badge render failed (non-fatal): {_scrub(str(e))}", flush=True) # WC5 promote-on-green-cold: a GREEN COLD run on LATEST (no PR head) of an enrolled # (WARM_CANONICAL) recipe advances/seeds the canonical. ONLY cold-on-latest advances it (a PR # `!testme` carries REF and must NOT promote; `--quick` never promotes — handled in run_quick). # Non-fatal: a promote failure leaves the OLD known-good intact (never lose it) and is logged. # canon §2.A tagged-promote gate: only promote when the tested head version is a published # release tag (never an arbitrary untagged `main` commit). head_version is the compose `version` # label of the code under test; is_released_version checks it against the recipe's release tags. tagged = warm_reconcile.is_released_version(recipe, head_version) if should_promote_canonical(recipe, ref, overall, quick=False, tagged=tagged): try: promote_canonical(recipe, head_ref, head_version, repo_local) except Exception as e: # noqa: BLE001 — promote is a post-green bonus; never fail a green run print( f"!! WC5 promote failed (non-fatal; known-good unchanged): {_scrub(str(e))}", flush=True, ) return overall if __name__ == "__main__": raise SystemExit(main())