All checks were successful
continuous-integration/drone/push Build is passing
capacity=2 went live with three stale capacity=1-era assumptions that corrupted concurrent runs (immich 229/230 '/pg_backup.sh: No such file'): - ~/.abra/recipes/<recipe> is ONE shared working tree that fetch_recipe rm-rf's/ reclones and the upgrade tier git-checkouts mid-run. Same-recipe runs now serialise on an exclusive flock (/run/lock/cc-ci-recipe-<recipe>.lock), taken in main() BEFORE fetch_recipe and held for the whole run; the kernel releases it on any process death, so there is no stale-lock failure mode. Different recipes still run in parallel. - CCCI_JANITOR_MAX_AGE=0 made a starting build reap ANY in-flight run app. Every run now registers its app domain + pid in /run/cc-ci-active/<domain> before app creation; the janitor checks the owner: alive (pid is a live run_recipe_ci process) -> never reaped; dead -> reaped immediately; unknown (pre-registry or post-reboot) -> age fallback (default 2h). The MAX_AGE=0 env override is gone from .drone.yml. - .drone.yml: concurrency.limit 1 -> 2 to match DRONE_RUNNER_CAPACITY=2; the 'safe because capacity=1' comments now describe the flock+registry model. lint: PASS, unit tests: 138 passed.
1340 lines
62 KiB
Python
1340 lines
62 KiB
Python
#!/usr/bin/env python3
|
||
"""Top-level CI orchestrator (plan §4.3 + Phase 1d/1e), invoked by the Drone pipeline (or by hand).
|
||
|
||
Model: deploy the app ONCE, then run lifecycle TIERS against that single shared deployment, then ONE
|
||
teardown in `finally`. Per Phase 1e the orchestrator OWNS each mutating op (HC3): for a tier it runs
|
||
the optional pre-op seed hook (recipe ops.py `pre_<op>`), performs the op exactly ONCE
|
||
(upgrade/backup/restore — install has none), then runs BOTH the generic assertion file (the floor,
|
||
unless explicitly opted out) AND the recipe overlay assertion file (if any) against the shared
|
||
post-op state — generic and overlay are ADDITIVE, not override (HC3). Op results an assertion needs
|
||
(pre-upgrade identity, snapshot_id) pass op→assertion via a run-scoped JSON state file
|
||
($CCCI_OP_STATE_FILE). The upgrade op deploys the PR-HEAD code under test via `abra app deploy
|
||
--chaos` (HC1). Repo-local (PR-authored) overlays/hooks run only for allowlist-approved recipes (HC2,
|
||
gated in harness.discovery). The generic is the default for every op, so ANY recipe is testable with
|
||
zero config (DG1–DG4). The lifecycle OPS live in the shared harness (harness.generic), not per-recipe
|
||
(DG7 DRY).
|
||
|
||
Run parameters from env (set by the comment-bridge via Drone build params):
|
||
RECIPE recipe name (e.g. custom-html) [required]
|
||
REF PR head commit sha [optional; used for fetch + run-domain hash]
|
||
PR PR number [optional, default 0]
|
||
SRC head repo full_name on the mirror [optional]
|
||
VERSION upgrade target tag (else newest published) [optional]
|
||
STAGES comma filter of tiers to run [optional, default install,upgrade,backup,restore,custom]
|
||
|
||
Run env (python + pytest + playwright) is provided by `cc-ci-run` (nix/modules/harness.nix);
|
||
invoke as: cc-ci-run runner/run_recipe_ci.py
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import contextlib
|
||
import glob
|
||
import importlib.util
|
||
import json
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
import time
|
||
|
||
# Repository root: two directory levels above this file (the module docstring invokes this script
# as runner/run_recipe_ci.py, i.e. one level below the root).
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put <ROOT>/runner first on sys.path so the `harness` imports below resolve from the repo copy.
sys.path.insert(0, os.path.join(ROOT, "runner"))
|
||
from harness import ( # noqa: E402
|
||
abra,
|
||
canonical,
|
||
discovery,
|
||
generic,
|
||
lifecycle,
|
||
naming,
|
||
warm,
|
||
warmsnap,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
card as card_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
deps as deps_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
results as results_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
screenshot as screenshot_mod,
|
||
)
|
||
|
||
# Every tier name, in this order; matches the documented default of the STAGES env filter.
ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom")
|
||
|
||
|
||
def sso_dep_unverified(declared, deps_ready: bool, requires_deps_skipped: int) -> bool:
    """F2-11 gate predicate (pure, unit-tested).

    True only when all three hold: the recipe declares DEPS, its setup_custom_tests failed (deps
    not ready), and that made at least one `requires_deps` (SSO/OIDC) test SKIP. In that situation
    the recipe's characteristic SSO claim was NOT verified, so the run must NOT report GREEN —
    even though a skip-only pytest file exits 0 and leaves every tier 'pass'.

    Generic-tier failure-isolation is untouched (those results stand); only the green SIGNAL is
    corrected. The skip>0 condition keeps a deps-declaring recipe that has no requires_deps tests
    from being false-failed.
    """
    if not declared:
        # No declared deps → nothing SSO-related could have gone unverified.
        return False
    if deps_ready:
        # Deps were wired successfully; any skips are not caused by a failed setup.
        return False
    return requires_deps_skipped > 0
|
||
|
||
|
||
def _truthy(v: str | None) -> bool:
    """Interpret an env-style flag: True for 1/true/yes/on (case- and whitespace-insensitive)."""
    if not v:
        # None and "" are both "unset".
        return False
    normalised = str(v).strip().lower()
    return normalised in {"1", "true", "yes", "on"}
|
||
|
||
|
||
def _redact_values(pattern: str = "/run/secrets/*") -> list[str]:
    """Collect the values to scrub from published logs (D6 redaction filter, plan §4.4).

    Reads every file matching `pattern` (default: the infra secrets materialised at
    /run/secrets/*) so that, if any subprocess ever echoes one, it can be masked. Only values of
    >=8 characters are kept, so the filter never false-positives on short strings / SHAs.

    Entries that cannot be read as text are skipped instead of aborting: directories and
    permission problems raise OSError, and binary content raises UnicodeDecodeError (a ValueError,
    NOT an OSError) from the text-mode read.

    Returns the distinct values ordered longest first, so code that replaces them in order masks
    a long secret as a whole before trying any shorter value it may contain.
    """
    values: set[str] = set()
    for secret_path in glob.glob(pattern):
        try:
            with open(secret_path) as fh:
                value = fh.read().strip()
        except (OSError, UnicodeDecodeError):
            continue
        if len(value) >= 8:
            values.add(value)
    return sorted(values, key=len, reverse=True)
|
||
|
||
|
||
# Secret values to mask, read ONCE at import time; _scrub() iterates this list.
_REDACT = _redact_values()
|
||
|
||
|
||
def _scrub(text: str) -> str:
    """Return `text` with every known infra-secret value masked (D6 redaction, plan §4.4)."""
    masked = text
    for secret in _REDACT:
        # str.replace is a no-op when the secret is absent, so no membership pre-check is needed.
        masked = masked.replace(secret, "***REDACTED***")
    return masked
|
||
|
||
|
||
def run_redacted(cmd: list[str], env: dict | None = None) -> int:
    """Run a subprocess from ROOT, streaming its output live with known secrets masked.

    stdout and stderr are merged and forwarded line by line (so Drone logs stay tail-able), each
    line passing through `_scrub` first. Belt-and-suspenders: the harness never prints secrets and
    abra doesn't echo generated ones.

    `cmd` is the argv list; `env` is the child's environment (None inherits the current one).
    Returns the child's exit code.

    Popen is used as a context manager so the pipe is closed and the child is waited for even if
    forwarding a line raises (e.g. our own stdout went away).
    """
    with subprocess.Popen(
        cmd,
        cwd=ROOT,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,  # line-buffered, so output appears as the child produces it
    ) as proc:
        assert proc.stdout is not None
        for line in proc.stdout:
            sys.stdout.write(_scrub(line))
            sys.stdout.flush()
    # Leaving the `with` block has already waited for the child, so returncode is set.
    return proc.returncode
|
||
|
||
|
||
def _gitea_token() -> str | None:
    """Return the Gitea bot token, or None when none is configured.

    A non-empty $GITEA_TOKEN wins; otherwise the token is read from
    /run/secrets/bridge_gitea_token when that file exists.
    """
    from_env = os.environ.get("GITEA_TOKEN")
    if from_env:
        return from_env
    secret_path = "/run/secrets/bridge_gitea_token"
    if not os.path.exists(secret_path):
        return None
    with open(secret_path) as fh:
        from_file = fh.read().strip()
    # An empty secret file counts as "no token".
    return from_file or None
|
||
|
||
|
||
def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None:
    """Make the recipe available at the code under test in ~/.abra/recipes/<recipe>.

    If SRC+REF point at the mirror PR, clone it at that ref; otherwise fetch the catalogue copy.
    Private mirror repos need the bot token — passed via a per-command http.extraHeader (not
    persisted in .git/config, not printed).

    Raises:
        ValueError: `recipe` is empty, "." / "..", or contains a path separator. The name is
            joined onto the recipes dir and then `rm -rf`ed, so an empty name would wipe every
            recipe and a path-like name would delete outside the recipes dir.
        subprocess.CalledProcessError: a git/abra fetch step failed. For the token-carrying git
            calls the error is re-raised with the Authorization header stripped from `cmd`, so the
            token cannot leak through the exception message or a traceback.
    """
    if recipe in ("", ".", "..") or os.path.basename(recipe) != recipe:
        raise ValueError(f"invalid recipe name: {recipe!r}")
    recipes_dir = os.path.expanduser("~/.abra/recipes")
    os.makedirs(recipes_dir, exist_ok=True)
    dest = os.path.join(recipes_dir, recipe)
    # CCCI_SKIP_FETCH=1: use the local recipe clone as-is (lets a test/Adversary stage a fake/broken
    # ref — e.g. a simulated broken PR head for the --quick rollback proof — without it being clobbered
    # by a re-fetch). Never set in production CI.
    if os.environ.get("CCCI_SKIP_FETCH") == "1":
        print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True)
        return
    if src and ref:
        url = f"https://git.autonomic.zone/{src}.git"
        auth: list[str] = []
        tok = _gitea_token()
        if tok:
            auth = ["-c", f"http.extraHeader=Authorization: token {tok}"]

        def _git_authed(*args: str) -> None:
            """Run `git <auth> <args>`; on failure re-raise without the auth header in cmd."""
            try:
                subprocess.run(["git", *auth, *args], check=True)
            except subprocess.CalledProcessError as err:
                # `from None` also drops the chained original, whose cmd holds the token.
                raise subprocess.CalledProcessError(err.returncode, ["git", *args]) from None

        subprocess.run(["rm", "-rf", dest], check=False)
        _git_authed("clone", "--quiet", url, dest)
        _git_authed("-C", dest, "checkout", "--quiet", ref)
        # Bring in published version TAGS from the public upstream so the upgrade tier can deploy a
        # previous published version (mirror PR branches carry no release tags). Read-only + plain git
        # (no bot token to a foreign host). Non-fatal: if unreachable, upgrade degrades to a skip.
        upstream = f"https://git.coopcloud.tech/coop-cloud/{recipe}.git"
        subprocess.run(
            ["git", "-C", dest, "fetch", "--quiet", upstream, "refs/tags/*:refs/tags/*"],
            check=False,
        )
    else:
        # Clean re-fetch from the catalogue. rm first so a leftover dir from a prior SRC+REF run
        # (origin → private mirror, maybe lacking tags) can't poison the catalogue fetch.
        subprocess.run(["rm", "-rf", dest], check=False)
        subprocess.run(["abra", "recipe", "fetch", recipe, "-n"], check=True)
|
||
|
||
|
||
def snapshot_recipe_tests(recipe: str) -> str | None:
|
||
"""Copy the recipe-shipped tests/ to a stable temp dir, immune to abra re-checking-out the
|
||
recipe to a version tag during the run. Returns the snapshot path, or None if no tests/."""
|
||
src = os.path.expanduser(f"~/.abra/recipes/{recipe}/tests")
|
||
if not os.path.isdir(src):
|
||
return None
|
||
has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile(
|
||
os.path.join(src, "install_steps.sh")
|
||
)
|
||
if not has_overlay:
|
||
return None
|
||
dst = os.path.join(tempfile.gettempdir(), f"ccci-recipe-tests-{recipe}")
|
||
shutil.rmtree(dst, ignore_errors=True)
|
||
shutil.copytree(src, dst)
|
||
return dst
|
||
|
||
|
||
def _load_meta(recipe: str) -> dict:
    """Load the per-recipe config the orchestrator's deploy/wait uses.

    Mirrors tests/conftest._recipe_meta so the orchestrator sees the same config the tiers see
    (timeouts, health path/codes). Starts from built-in defaults, then executes
    <ROOT>/tests/<recipe>/recipe_meta.py (if present) and copies over any of the default keys plus
    the recognised optional keys that the file defines. Other names in the file are ignored.
    """
    meta = {
        "HEALTH_PATH": "/",
        "HEALTH_OK": (200, 301, 302),
        "DEPLOY_TIMEOUT": 600,
        "HTTP_TIMEOUT": 300,
    }
    meta_path = os.path.join(ROOT, "tests", recipe, "recipe_meta.py")
    if not os.path.exists(meta_path):
        return meta

    namespace: dict = {}
    with open(meta_path) as fh:
        exec(compile(fh.read(), meta_path, "exec"), namespace)  # noqa: S102 (trusted, in-repo)

    optional_keys = (
        "BACKUP_CAPABLE",
        "SKIP_GENERIC",
        "EXPECTED_NA",
        "OIDC_AT_INSTALL",
        "READY_PROBE",
        "UPGRADE_BASE_VERSION",
        "BACKUP_VERIFY",
        "UPGRADE_EXTRA_ENV",
    )
    # Snapshot the key list up front: meta gains keys while we copy.
    for key in (*meta, *optional_keys):
        if key in namespace:
            meta[key] = namespace[key]
    return meta
|
||
|
||
|
||
def _tier_env(domain: str) -> dict:
    """Build the environment for a tier's pytest run: a copy of os.environ plus the app's
    domain (CCCI_APP_DOMAIN) and its https base URL (CCCI_BASE_URL)."""
    env = dict(os.environ)
    env["CCCI_APP_DOMAIN"] = domain
    env["CCCI_BASE_URL"] = f"https://{domain}"
    return env
|
||
|
||
|
||
def _skip_generic(op: str, meta: dict) -> bool:
    """Decide whether the generic assertion for `op` is opted out (Phase 1e HC3).

    The default is to run it (additive). It is skipped when any of these holds: env
    CCCI_SKIP_GENERIC is truthy (all ops), env CCCI_SKIP_GENERIC_<OP> is truthy, or the recipe's
    declarative recipe_meta.SKIP_GENERIC list names the op, "all" or "*" (entries are compared
    lower-cased).
    """
    env_flags = ("CCCI_SKIP_GENERIC", f"CCCI_SKIP_GENERIC_{op.upper()}")
    if any(_truthy(os.environ.get(flag)) for flag in env_flags):
        return True
    declared = {str(entry).lower() for entry in (meta.get("SKIP_GENERIC") or [])}
    return bool(declared & {"all", "*", op})
|
||
|
||
|
||
def _run_pre_hook(recipe: str, op: str, repo_local: str | None, domain: str, meta: dict) -> None:
    """Run the optional pre-op seed hook (recipe ops.py `pre_<op>`) BEFORE the harness performs
    the op (HC3 op/assertion split).

    Overlays use the hook to seed data-continuity markers / the backup→restore mutation, then
    assert post-op in test_<op>.py. cc-ci's ops.py is trusted; a repo-local ops.py is consulted
    only for allowlist-approved recipes (the HC2 gate is inside discovery.pre_op_hook). The module
    is imported in-process, with its directory temporarily at the front of sys.path so an ops.py
    can import its sibling helpers. A no-op when discovery reports no hook.
    """
    hook = discovery.pre_op_hook(recipe, op, repo_local)
    if not hook:
        return
    source, path = hook
    hook_dir = os.path.dirname(path)
    hook_name = f"pre_{op}"
    sys.path.insert(0, hook_dir)
    try:
        spec = importlib.util.spec_from_file_location(f"ccci_ops_{recipe}_{op}", path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        print(f" pre-op seed ({source}): {os.path.relpath(path, ROOT)}::{hook_name}", flush=True)
        seed = getattr(module, hook_name)
        seed(domain, meta)
    finally:
        # Always undo the sys.path change, whether or not the hook raised.
        if hook_dir in sys.path:
            sys.path.remove(hook_dir)
|
||
|
||
|
||
def _perform_op(
    op: str,
    domain: str,
    recipe: str,
    head_ref: str | None,
    op_state: dict,
    deploy_timeout: int = 900,
    meta: dict | None = None,
) -> None:
    """Perform the single mutating op ONCE (the harness owns the op, HC3). install has no op.

    Records what the assertions need (pre-upgrade identity, backup snapshot_id) into `op_state`.
    None of these call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos
    upgrade is not a new install (HC1 reconciliation). `deploy_timeout` (recipe DEPLOY_TIMEOUT) is
    plumbed to the upgrade chaos redeploy so a heavy reconverge isn't SIGKILLed by the 900s default
    mid-wait; `meta` lets the upgrade op own a recipe-aware convergence+health wait (F2-12,
    READY_PROBE) and supplies the optional BACKUP_VERIFY callable for the backup op.

    Any op other than upgrade/backup/restore (i.e. install) is a no-op. Exceptions from the
    underlying harness op propagate to the caller.
    """
    if op == "upgrade":
        before = generic.perform_upgrade(
            domain, recipe, head_ref, deploy_timeout=deploy_timeout, meta=meta
        )
        op_state["upgrade"] = {"before": before, "head_ref": head_ref}
    elif op == "backup":
        # Backup integrity + retry (F2-14b). A recipe may define BACKUP_VERIFY(domain) -> bool that
        # confirms the backup actually captured the recipe's critical data AFTER the op. This guards a
        # real race: a DB recipe dumps its data in a backupbot pre-hook, but if the DB container cycles
        # mid-dump (intermittent under host load) the dump is truncated/absent, so restic snapshots an
        # empty path — `abra app backup create` still "succeeds", yet a later restore silently loses the
        # data (ghost: backup.sql.gz never written → restore can't reimport → seeded row gone). When
        # verify fails we re-run the WHOLE backup (fresh restic snapshot) with a re-stabilised DB, up to
        # 3 attempts. Recipes without BACKUP_VERIFY are unaffected (single backup, as before).
        snap = generic.perform_backup(domain)
        verify = meta.get("BACKUP_VERIFY") if meta else None
        if callable(verify):
            attempt = 1
            # Verify exactly once per backup, so the verdict that ends the loop is also the one
            # that decides the final warning (a second probe could disagree with the first).
            verified = verify(domain)
            while not verified and attempt < 3:
                print(
                    f" backup-verify FAILED (attempt {attempt}/3) — backup did not capture the "
                    f"recipe's critical data (e.g. DB cycled mid-dump); re-running backup",
                    flush=True,
                )
                attempt += 1
                snap = generic.perform_backup(domain)
                verified = verify(domain)
            if not verified:
                # Reported but not raised: the snapshot id is still recorded for the assertions.
                print(
                    f" !! backup-verify still FAILED after {attempt} attempts — backup is incomplete",
                    flush=True,
                )
        op_state["backup"] = {"snapshot_id": snap}
    elif op == "restore":
        generic.perform_restore(domain)
    # install: already deployed; no op
|
||
|
||
|
||
def run_lifecycle_tier(
    recipe: str,
    op: str,
    repo_local: str | None,
    domain: str,
    meta: dict,
    head_ref: str | None,
    op_state: dict,
    records: list[dict] | None = None,
    junit_dir: str | None = None,
) -> str:
    """Run one additive lifecycle tier (HC3) and return 'pass' | 'fail' | 'skip'.

    Sequence: seed (pre-op hook) → perform the op ONCE → run the generic assertion file (unless
    opted out) AND the overlay assertion file, both against the shared post-op deployment. The
    upgrade op redeploys the PR head (head_ref) via chaos (HC1).

    'skip' means the generic was opted out and no overlay exists, so no op was performed. 'fail'
    means the seed/op raised or at least one assertion file exited non-zero.

    Phase 3 (R1/R3): when `records`/`junit_dir` are given, each pytest file is run with --junitxml
    and a {tier,source,file,rc,junit} record is appended, so the run can assemble
    per-stage/per-test results.json + the level afterwards. Purely additive — it does not change
    the verdict.
    """
    overlay = discovery.resolve_overlay_op(recipe, op, repo_local)
    generic_opted_out = _skip_generic(op, meta)

    assertion_files: list[tuple[str, str]] = []
    if not generic_opted_out:
        assertion_files.append(discovery.generic_op(op))
    if overlay:
        assertion_files.append(overlay)
    if not assertion_files:
        # generic opted out AND no overlay → nothing would assert; don't perform a pointless mutating op
        print(f"\n===== TIER: {op} — SKIP (generic opted out, no overlay) =====", flush=True)
        return "skip"

    overlay_label = "none"
    if overlay:
        overlay_label = f"{overlay[0]}:{os.path.relpath(overlay[1], ROOT)}"
    generic_label = "skip" if generic_opted_out else "run"
    print(
        f"\n===== TIER: {op} (generic={generic_label}, overlay={overlay_label}) =====",
        flush=True,
    )

    # 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure → tier fail.
    try:
        _run_pre_hook(recipe, op, repo_local, domain, meta)
        _perform_op(
            op,
            domain,
            recipe,
            head_ref,
            op_state,
            deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", 900)),
            meta=meta,
        )
        # Persist what the op recorded so the assertion processes can read it.
        with open(os.environ["CCCI_OP_STATE_FILE"], "w") as state_fh:
            json.dump(op_state, state_fh)
    except Exception as err:  # noqa: BLE001 — a failed op is a reported tier failure, not a crash
        print(f"!! {op} op failed: {_scrub(str(err))}", flush=True)
        return "fail"

    # 3) assertions: generic (unless opted out) + overlay, each its own pytest, all against the
    # single post-op deployment. Generic runs first so an overlay may assume readiness.
    any_failed = False
    for source, path in assertion_files:
        rel_path = os.path.relpath(path, ROOT)
        print(f" assert ({source}): {rel_path}", flush=True)
        pytest_cmd = [sys.executable, "-m", "pytest", "-v", "-rA", path]
        junit_path = None
        if junit_dir is not None:
            junit_path = results_mod.junit_file(junit_dir, op, source, path)
            pytest_cmd.append(f"--junitxml={junit_path}")
        exit_code = run_redacted(pytest_cmd, env=_tier_env(domain))
        if records is not None:
            records.append(
                {
                    "tier": op,
                    "source": source,
                    "file": os.path.relpath(path, ROOT),
                    "rc": exit_code,
                    "junit": junit_path,
                }
            )
        if exit_code != 0:
            # Keep going: every file still runs, but the tier is already failed.
            any_failed = True
    return "fail" if any_failed else "pass"
|
||
|
||
|
||
def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) -> dict[str, dict]:
    """Provision SSO for each dep and return a recipe→entry dict.

    For every dep, a fresh realm/client + test user is set up through the harness's
    provider-specific setup function; the returned entry carries domain + admin + realm/client/user
    info — the shape the `setup_custom_tests.sh` hook (and dependent tests) read. Entries lacking
    a recipe or a domain are dropped.

    Provider routing: today only `keycloak` is supported. authentik will need a parallel
    `setup_authentik_realm` when an authentik-dep recipe enrolls (DEFERRED.md #9).
    """
    from harness import sso, warm  # local import — sso may not be needed for dep-less runs

    # Fields copied verbatim from the provider's credentials into the entry, in this order.
    cred_fields = (
        "realm",
        "client_id",
        "client_secret",
        "user",
        "password",
        "email",
        "discovery_url",
        "token_url",
        "auth_url",
        "userinfo_url",
    )

    enriched: dict[str, dict] = {}
    for dep in deps_list or []:
        dep_recipe = dep.get("recipe")
        dep_domain = dep.get("domain")
        if not (dep_recipe and dep_domain):
            continue
        if dep_recipe != "keycloak":
            # Provider not yet supported — record bare entry; setup_custom_tests.sh / tests will
            # raise if they need realm/client info they don't see.
            enriched[dep_recipe] = dep
            continue
        # The realm is the per-run isolation unit on a (possibly shared live-warm) keycloak: name it
        # "<parent>-<6hex>" so concurrent dependents — even two PRs of the SAME recipe — never
        # collide on a realm (WC1). client_id stays the parent recipe name (isolated within the
        # unique realm; predictable for debugging).
        creds = sso.setup_keycloak_realm(
            dep_domain,
            realm=warm.realm_for(parent_recipe, parent_domain),
            client_id=parent_recipe,
            redirect_uris=[f"https://{parent_domain}/*"],
            web_origins=[f"https://{parent_domain}"],
        )
        record = {
            "recipe": dep_recipe,
            "domain": dep_domain,
            "warm": bool(dep.get("warm")),
        }
        for field in cred_fields:
            record[field] = creds[field]
        record["admin_user"] = "admin"
        record["admin_password"] = sso.admin_password_inside(dep_domain)
        enriched[dep_recipe] = record
    return enriched
|
||
|
||
|
||
def _provision_deps(
    recipe: str, domain: str, ref: str | None, declared: list[str]
) -> dict[str, dict]:
    """Provision a run's declared deps, write `$CCCI_DEPS_FILE`, and return the recipe→entry
    deps_state.

    Each dep is classed as live-warm (shared provider at a stable domain + a per-run realm) or
    cold (co-deployed per run); a dep whose warm provider is configured but not up falls back to
    cold. Every dep then gets its SSO realm/client/user, and the enriched dict is persisted for the
    `setup_custom_tests.sh`/`install_steps.sh` hooks + dependent tests. Raises on any failure (the
    caller marks deps-not-ready). Used by BOTH wiring paths:
      - post-deploy (legacy): provision AFTER generic tiers, then `setup_custom_tests.sh` does an
        in-place OIDC redeploy.
      - install-time (`OIDC_AT_INSTALL`, Q3.2a): provision BEFORE the single deploy so the
        install-tier `install_steps.sh` hook wires OIDC env into that one deploy — no reconverge.
    """
    warm_deps: list[str] = []
    cold_deps: list[str] = []
    for dep in declared:
        provider = warm.warm_domain(dep)
        if provider and warm.is_warm_up(dep, provider):
            warm_deps.append(dep)
            continue
        if provider:
            print(f" dep: {dep} warm provider {provider} not up — cold fallback", flush=True)
        cold_deps.append(dep)

    # Cold deps are co-deployed for this run; with none, start from an empty list.
    deps_list = []
    if cold_deps:
        cold_metas = {dep: _load_meta(dep) for dep in cold_deps}
        deps_list = deps_mod.deploy_deps(
            recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=cold_metas
        )

    for dep in warm_deps:
        provider = warm.warm_domain(dep)
        # Clear realms left behind on the shared provider before adding this run's realm.
        reaped = warm.reap_orphan_realms(dep, provider)
        if reaped:
            print(f" dep: reaped {len(reaped)} orphan realm(s) on warm {dep}: {reaped}", flush=True)
        deps_list.append({"recipe": dep, "domain": provider, "warm": True})
        print(f" dep: using live-warm {dep} @ {provider} (per-run realm)", flush=True)

    deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
    deps_mod.write_run_state(deps_state)
    return deps_state
|
||
|
||
|
||
def _run_setup_custom_tests_hook(recipe: str, domain: str, deps_file: str) -> None:
    """Run `tests/<recipe>/setup_custom_tests.sh` if present (operator-2026-05-28 SSO-dep plan
    §3.2).

    The hook reads `$CCCI_DEPS_FILE`, sets OIDC env via `abra app config set` + secret insert, and
    triggers an in-place `abra app deploy --force --chaos`. It is run with bash and receives
    CCCI_APP_DOMAIN, CCCI_RECIPE and CCCI_DEPS_FILE on top of the current environment.

    Raises RuntimeError when the hook exits non-zero; that failure propagates to mark
    deps-not-ready (caught in main()).
    """
    hook_path = os.path.join(ROOT, "tests", recipe, "setup_custom_tests.sh")
    hook_rel = os.path.relpath(hook_path, ROOT)
    if not os.path.isfile(hook_path):
        # No hook = recipe doesn't need post-deps wiring; deps are deployed + creds available
        # via deps_apps fixture as-is.
        print(
            f" setup_custom_tests: no hook at {hook_rel} (deps creds ready in $CCCI_DEPS_FILE)",
            flush=True,
        )
        return

    print(f" setup_custom_tests hook: {hook_rel}", flush=True)
    hook_env = dict(os.environ)
    hook_env.update(CCCI_APP_DOMAIN=domain, CCCI_RECIPE=recipe, CCCI_DEPS_FILE=deps_file)
    completed = subprocess.run(["bash", hook_path], check=False, env=hook_env)
    if completed.returncode == 0:
        return
    raise RuntimeError(
        f"setup_custom_tests.sh exited {completed.returncode} (deps env not wired into parent)"
    )
|
||
|
||
|
||
def run_custom(
    recipe: str,
    repo_local: str | None,
    domain: str,
    records: list[dict] | None = None,
    junit_dir: str | None = None,
) -> str:
    """Run every discovered non-lifecycle custom test_*.py (both locations, additive).

    Returns 'skip' when none are defined, otherwise 'pass' if every file exited 0 and 'fail' if
    any did not (all files still run). Phase 3: when `junit_dir` / `records` are given, each file
    is run with --junitxml and a {tier,source,file,rc,junit} record is appended.
    """
    customs = discovery.custom_tests(recipe, repo_local)
    if not customs:
        return "skip"

    print("\n===== TIER: custom =====", flush=True)
    any_failed = False
    for source, path in customs:
        rel_path = os.path.relpath(path, ROOT)
        print(f" custom ({source}): {rel_path}", flush=True)
        pytest_cmd = [sys.executable, "-m", "pytest", "-v", "-rA", path]
        junit_path = None
        if junit_dir is not None:
            junit_path = results_mod.junit_file(junit_dir, "custom", source, path)
            pytest_cmd.append(f"--junitxml={junit_path}")
        exit_code = run_redacted(pytest_cmd, env=_tier_env(domain))
        if records is not None:
            records.append(
                {
                    "tier": "custom",
                    "source": source,
                    "file": rel_path,
                    "rc": exit_code,
                    "junit": junit_path,
                }
            )
        if exit_code != 0:
            any_failed = True
    return "fail" if any_failed else "pass"
|
||
|
||
|
||
def _wait_undeployed(domain: str, timeout: int = 120) -> None:
    """Block until the stack's services are gone after an undeploy (so warmsnap.restore, which
    requires undeployed, doesn't race a half-removed stack).

    Polls every 2 seconds for at most `timeout` seconds. The wait is best-effort: on timeout it
    logs a warning and returns normally rather than raising, so the caller's teardown continues.
    The deadline uses the monotonic clock, so a wall-clock adjustment (NTP step) can neither cut
    the wait short nor stretch it.
    """
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not lifecycle._docker_names("service", stack):  # noqa: SLF001
            return
        time.sleep(2)
    print(
        f"!! {stack}: services still present after {timeout}s undeploy wait — continuing anyway",
        flush=True,
    )
|
||
|
||
|
||
def run_quick(
    recipe: str, ref: str | None, head_ref: str | None, repo_local: str | None, meta: dict
) -> int:
    """WC4 `--quick` opt-in fast lane (plan §2). Reattach the data-warm canonical (known-good volume)
    → upgrade IN PLACE to the PR head (chaos) → assert generic UPGRADE (reconverge+moved+serving) +
    overlay + custom. PASS → undeploy-keep-volume, **known-good UNCHANGED (NEVER promote)**; FAIL →
    restore the last-known-good snapshot + undeploy (roll back, data safe). Lower-confidence; does
    NOT gate merge (WC7). Caller has confirmed a canonical exists.

    NB: the deps wiring + temp-state scaffolding intentionally mirror main()'s cold path rather than
    refactoring it — keeping the gate-passed cold flow byte-identical (zero regression risk).

    Side effects: sets CCCI_OP_STATE_FILE, CCCI_DEPS_FILE, CCCI_DEPS_SKIP_REPORT (and, when the
    canonical came up, CCCI_DEPS_READY / CCCI_DEPS_NOT_READY_REASON) in os.environ, and creates
    then removes the three temp files those point at.

    Returns 0 only when the canonical reattached, no tier failed, SSO was not left unverified
    (F2-11) and teardown raised no error; otherwise 1.

    Everything in the `finally` block is written not to raise, so dep teardown and the canonical
    rollback always run: a malformed skip report is tolerated, and teardown errors are collected
    (scrubbed of secrets) into the summary instead of propagating.
    """
    domain = canonical.canonical_domain(recipe)
    reg = canonical.read_registry(recipe) or {}
    print(
        f"\n== cc-ci run [MODE=quick]: recipe={recipe} canonical={domain} "
        f"known-good={reg.get('version')} ref={ref}\n"
        " quick = LOWER-CONFIDENCE opt-in fast lane; does NOT gate merge; NEVER promotes the canonical",
        flush=True,
    )

    # Run-scoped scratch files, keyed by the canonical domain; tiers find them through the env.
    statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
    with open(statefile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_OP_STATE_FILE"] = statefile
    depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
    with open(depsfile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_DEPS_FILE"] = depsfile
    skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
    # Drop any report left by an earlier run so its skip counts are not attributed to this one.
    with contextlib.suppress(OSError):
        os.remove(skipfile)
    os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile

    op_state: dict = {}
    results: dict[str, str] = {}
    declared = deps_mod.declared_deps(recipe)
    deps_state: dict = {}
    deps_ready = True
    deps_not_ready_reason = ""
    dep_teardown_error: str | None = None
    warm_ok = False
    rolled_back = False

    lifecycle.janitor()
    try:
        # 1) reattach the canonical (warm boot at the known-good version + retained volume)
        try:
            canonical.deploy_canonical(recipe, timeout=int(meta.get("DEPLOY_TIMEOUT", 900)))
            lifecycle.wait_healthy(
                domain,
                ok_codes=tuple(meta["HEALTH_OK"]),
                path=meta["HEALTH_PATH"],
                deploy_timeout=meta["DEPLOY_TIMEOUT"],
                http_timeout=meta["HTTP_TIMEOUT"],
            )
            warm_ok = True
        except Exception as e:  # noqa: BLE001
            print(f"!! canonical reattach/readiness failed: {_scrub(str(e))}", flush=True)

        if warm_ok:
            # 2) deps (warm keycloak + per-run realm) — mirrors main()'s warm/cold split
            if declared:
                print(f"\n===== setup_custom_tests (quick): deps {declared} =====", flush=True)
                try:
                    warm_deps, cold_deps = [], []
                    for d in declared:
                        wd = warm.warm_domain(d)
                        (warm_deps if (wd and warm.is_warm_up(d, wd)) else cold_deps).append(d)
                    dep_metas = {d: _load_meta(d) for d in cold_deps}
                    deps_list = (
                        deps_mod.deploy_deps(
                            recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=dep_metas
                        )
                        if cold_deps
                        else []
                    )
                    for d in warm_deps:
                        wd = warm.warm_domain(d)
                        warm.reap_orphan_realms(d, wd)
                        deps_list.append({"recipe": d, "domain": wd, "warm": True})
                        print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True)
                    deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
                    deps_mod.write_run_state(deps_state)
                    _run_setup_custom_tests_hook(recipe, domain, depsfile)
                except Exception as e:  # noqa: BLE001
                    # Deps failing does not abort the run: the tiers still execute and the
                    # F2-11 gate below decides whether the result may be reported green.
                    deps_ready = False
                    deps_not_ready_reason = _scrub(str(e))[:300]
                    print(
                        f"!! setup_custom_tests failed (deps-not-ready): {deps_not_ready_reason}",
                        flush=True,
                    )

            # 3) UPGRADE to PR head (chaos) + assert (generic reconverge+moved+serving + overlay)
            results["upgrade"] = run_lifecycle_tier(
                recipe, "upgrade", repo_local, domain, meta, head_ref, op_state
            )
            # 4) custom tier
            os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0"
            os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason
            results["custom"] = run_custom(recipe, repo_local, domain)
        else:
            results["upgrade"] = "fail"
            results["custom"] = "skip"
    finally:
        # F2-11 skip count (read before deciding pass/fail). The report is whitespace-separated
        # integers; a missing/unreadable file counts as zero, and a malformed entry is ignored
        # with a warning — raising here would skip every teardown step below.
        requires_deps_skipped = 0
        try:
            with open(skipfile) as f:
                skip_tokens = f.read().split()
        except (OSError, ValueError):
            skip_tokens = []
        for tok in skip_tokens:
            try:
                requires_deps_skipped += int(tok)
            except ValueError:
                print(
                    f"!! ignoring malformed entry in requires_deps skip report: {tok!r}",
                    flush=True,
                )
        sso_unverified = sso_dep_unverified(declared, deps_ready, requires_deps_skipped)
        passed = (
            warm_ok
            and bool(results)
            and all(v != "fail" for v in results.values())
            and not sso_unverified
        )

        # dep teardown: delete per-run warm realms; undeploy cold deps (mirrors cold)
        if deps_state:
            ordered = (
                [deps_state[d] for d in declared if d in deps_state]
                if isinstance(deps_state, dict)
                else deps_state
            )
            for e in [x for x in ordered if x.get("warm")]:
                try:
                    from harness import sso

                    sso.delete_keycloak_realm(e["domain"], e["realm"])
                    print(
                        f" dep: deleted per-run realm {e['realm']} on warm {e['recipe']}",
                        flush=True,
                    )
                except Exception as ex:  # noqa: BLE001
                    dep_teardown_error = (
                        f"warm realm delete failed for {e.get('realm')}: {_scrub(str(ex))}"
                    )
                    print(f"!! {dep_teardown_error}", flush=True)
            try:
                deps_mod.teardown_deps([x for x in ordered if not x.get("warm")])
            except lifecycle.TeardownError as e:
                dep_teardown_error = _scrub(str(e))
                print(f"!! {dep_teardown_error}", flush=True)

        # canonical teardown — the WC4 contract:
        # PASS → undeploy, KEEP volume, known-good UNCHANGED (never promote)
        # FAIL → restore last-known-good snapshot (data safe) then leave undeployed (idle)
        try:
            if warm_ok and passed:
                canonical.undeploy_keep_volume(recipe)
                print(
                    " quick PASS → canonical undeployed, volume retained, known-good UNCHANGED",
                    flush=True,
                )
            elif warm_ok:
                print(
                    " quick FAIL → rolling back canonical to last-known-good snapshot", flush=True
                )
                abra.undeploy(domain)
                _wait_undeployed(domain)
                warmsnap.restore(recipe, domain)
                # reset recorded version to the known-good (the failed upgrade set TYPE to the broken
                # PR commit) so the idle canonical's .env agrees with the registry + re-warms cleanly.
                if reg.get("version"):
                    abra.env_set(domain, "TYPE", f"{recipe}:{reg['version']}")
                canonical._set_status(recipe, "idle")  # noqa: SLF001
                rolled_back = True
                print(
                    " quick FAIL → restored known-good data; canonical idle (NOT promoted)",
                    flush=True,
                )
        except Exception as e:  # noqa: BLE001
            rollback_error = _scrub(str(e))
            dep_teardown_error = (
                dep_teardown_error or ""
            ) + f" | quick teardown/rollback: {rollback_error}"
            print(f"!! quick teardown/rollback error: {rollback_error}", flush=True)

        with contextlib.suppress(OSError):
            os.remove(statefile)
        with contextlib.suppress(OSError):
            os.remove(depsfile)
        with contextlib.suppress(OSError):
            os.remove(skipfile)

    print("\n===== RUN SUMMARY =====", flush=True)
    print("mode = quick (LOWER-CONFIDENCE; opt-in; does not gate merge)")
    print(
        f"canonical = {domain} known-good = {reg.get('version')} (UNCHANGED; quick never promotes)"
    )
    if rolled_back:
        print("rolled-back = yes (restored last-known-good snapshot)")
    for op in ("upgrade", "custom"):
        if op in results:
            suffix = ""
            if op == "custom" and requires_deps_skipped:
                suffix = f" ({requires_deps_skipped} requires_deps SKIPPED — SSO UNVERIFIED)"
            print(f" {op:8s}: {results[op]}{suffix}")

    overall = 0
    if any(v == "fail" for v in results.values()) or not warm_ok:
        overall = 1
    if sso_unverified:
        print(
            f"!! DEPS={declared} but setup_custom_tests failed and {requires_deps_skipped} "
            "requires_deps SKIPPED — SSO NOT verified (F2-11)",
            file=sys.stderr,
        )
        overall = 1
    if dep_teardown_error:
        print(f"!! teardown leaked/erred: {dep_teardown_error}", file=sys.stderr)
        overall = 1
    if not results:
        print("no tiers ran", file=sys.stderr)
        return 1
    return overall
|
||
|
||
|
||
def should_promote_canonical(recipe: str, ref: str | None, overall: int, quick: bool) -> bool:
    """Decide (WC5 gate, no side effects of its own) whether this run may advance the canonical.

    All four conditions must hold:
      * the recipe is enrolled (WARM_CANONICAL) per `canonical.is_enrolled`;
      * the run was GREEN (`overall == 0`);
      * the run was COLD (`quick` is false — `--quick` never promotes);
      * the run was on LATEST, i.e. `ref` is empty (nightly sweep or a manual `RECIPE=<r>` run).
        A PR `!testme` carries REF=PR-head and must NOT promote the canonical to a PR's code.
    """
    # Enrolment is looked up first, exactly once, before the per-run conditions are considered.
    enrolled = canonical.is_enrolled(recipe)
    green_cold_on_latest = overall == 0 and not quick and not ref
    return enrolled and green_cold_on_latest
|
||
|
||
|
||
def promote_canonical(recipe: str, head_ref: str | None) -> None:
    """WC5: (re)seed the canonical for `recipe` at the newest version tag after a green cold run.

    Sequence: fetch the recipe, pick the latest version tag (skip with a log line if there is
    none), deploy `canonical.canonical_domain(recipe)` at that version, wait until healthy,
    undeploy, wait for the undeploy to complete, then `canonical.seed_canonical(...)` records the
    new known-good (with `head_ref` as its commit). The seed call is the last step, so any earlier
    failure raises before the previous known-good is replaced.
    """
    import warm_reconcile as wr

    canonical_host = canonical.canonical_domain(recipe)
    wr.fetch_recipe(recipe)
    tags = wr.recipe_tags(recipe)
    newest = wr.latest_version(tags)
    if not newest:
        print(f"WC5 promote: no version tags for {recipe} — skip", flush=True)
        return

    recipe_meta = _load_meta(recipe)
    # The cold run's deploy-count was already asserted + the countfile removed; don't perturb it.
    os.environ.pop("CCCI_DEPLOY_COUNT_FILE", None)

    banner = f"\n===== WC5 promote-on-green-cold: (re)seed canonical {recipe} @ {newest} ====="
    print(banner, flush=True)

    deploy_budget = int(recipe_meta.get("DEPLOY_TIMEOUT", 900))
    lifecycle.deploy_app(
        recipe,
        canonical_host,
        version=newest,
        secrets=True,
        deploy_timeout=deploy_budget,
    )

    # Health parameters are read from the meta only once the deploy itself has succeeded.
    health_kwargs = {
        "ok_codes": tuple(recipe_meta["HEALTH_OK"]),
        "path": recipe_meta["HEALTH_PATH"],
        "deploy_timeout": recipe_meta["DEPLOY_TIMEOUT"],
        "http_timeout": recipe_meta["HTTP_TIMEOUT"],
    }
    lifecycle.wait_healthy(canonical_host, **health_kwargs)

    abra.undeploy(canonical_host)
    _wait_undeployed(canonical_host)
    canonical.seed_canonical(recipe, newest, commit=head_ref)

    done_msg = (
        f"WC5 promote: canonical {recipe} advanced to known-good {newest} (idle, volume retained)"
    )
    print(done_msg, flush=True)
|
||
|
||
|
||
def main() -> int:
    """Run one CI pass for the recipe named by `$RECIPE` and return the process exit code.

    Environment inputs read here: RECIPE (required), REF, SRC, VERSION, STAGES (comma-separated,
    defaults to ALL_STAGES), PR, and CCCI_QUICK=1 / MODE=quick (opt-in fast lane).

    Flow: take the per-recipe lock, fetch the recipe, optionally divert to `run_quick`, otherwise
    deploy the app once, run the requested tiers against that deployment, tear everything down in
    `finally`, print the run summary, assemble best-effort artifacts (results.json, card, badge),
    and finally promote the canonical when `should_promote_canonical` allows it.

    Returns:
        2 when RECIPE is unset/empty; the `run_quick` result when the quick lane is taken; 1 when
        no tier ran, any tier failed, the deploy-count check failed, a dep teardown leaked, or the
        SSO claim is unverified; 0 otherwise. Artifact/promote failures never change the verdict.
    """
    recipe = os.environ.get("RECIPE")
    if not recipe:
        print("RECIPE env is required", file=sys.stderr)
        return 2
    ref = os.environ.get("REF") or None
    src = os.environ.get("SRC") or None
    target = os.environ.get("VERSION") or None
    stages = {
        s.strip() for s in os.environ.get("STAGES", ",".join(ALL_STAGES)).split(",") if s.strip()
    }

    print(
        f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}"
    )
    # Concurrent-run safety: runs of the SAME recipe serialise on a per-recipe flock — they share
    # ONE ~/.abra/recipes/<recipe> working tree which fetch_recipe (below) rm-rf's/reclones and the
    # upgrade tier git-checkouts mid-run. Must be taken BEFORE fetch_recipe. Different recipes run
    # in parallel (capacity=2). The reference must stay alive for the whole run: the kernel drops
    # the flock when the fd closes (including on any crash/SIGKILL — no stale-lock failure mode).
    _recipe_lock = lifecycle.acquire_recipe_lock(recipe)  # noqa: F841
    fetch_recipe(recipe, ref, src)
    # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test
    # (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout
    # HEAD (the catalogue current) for a non-PR `!testme`. Captured before any version-tag checkout.
    head_ref = ref or lifecycle.recipe_head_commit(recipe)
    repo_local = snapshot_recipe_tests(recipe)
    meta = _load_meta(recipe)

    # WC4/WC7: opt-in `--quick` fast lane. Requires an existing data-warm canonical; if none, fall
    # back cleanly to the full COLD run below so the PR is still tested (DECISIONS Phase-2w).
    if os.environ.get("CCCI_QUICK") == "1" or os.environ.get("MODE") == "quick":
        if canonical.has_canonical(recipe):
            return run_quick(recipe, ref, head_ref, repo_local, meta)
        print(
            f"MODE=quick requested but no canonical for {recipe} — falling back to COLD run "
            "(no-canonical fallback, WC7)",
            flush=True,
        )

    domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref)

    # Deploy-once base version: previous published version when the upgrade tier will run and one
    # exists (so upgrade goes previous→target in place), else the target (current/$REF). (DECISIONS.)
    # A recipe may override the base via recipe_meta UPGRADE_BASE_VERSION when the harness default
    # (recipe_versions[-2]) is NOT the PR's true predecessor — e.g. a PR that adds a version ABOVE the
    # newest published tag, where the correct base is [-1] (the newest published), not [-2]. The
    # override must be an exact published version tag (deployed as a pinned base). (Adversary §7.1.)
    want_upgrade = "upgrade" in stages
    prev = (
        (meta.get("UPGRADE_BASE_VERSION") or lifecycle.previous_version(recipe))
        if want_upgrade
        else None
    )
    base = prev or target
    backup_cap = generic.backup_capable(recipe, meta)
    hook = discovery.install_steps(recipe, repo_local)

    # Deploy-count guard (DG4.1): exactly one deploy_app() per run.
    countfile = os.path.join(tempfile.gettempdir(), f"ccci-deploys-{domain}")
    with open(countfile, "w") as f:
        f.write("0")
    os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile

    # Phase 3 (R1/R3): per-run artifact dir + JUnit dir. The tiers emit JUnit per file and append a
    # {tier,source,file,rc,junit} record; after the run we assemble results.json (per-stage/per-test +
    # level) into the artifact dir. Best-effort — never changes the verdict (R7).
    run_artifact_dir = os.path.join(results_mod.runs_dir(), results_mod.run_id())
    junit_dir = os.path.join(run_artifact_dir, "junit")
    records: list[dict] = []
    with contextlib.suppress(OSError):
        os.makedirs(junit_dir, exist_ok=True)

    # Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup
    # snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state().
    statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
    with open(statefile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_OP_STATE_FILE"] = statefile
    op_state: dict = {}

    # Run-scoped dep state (Phase 2 Q2.3, refined per operator-2026-05-28 SSO-dep plan §1):
    # deps now deploy AFTER generic tiers (between RESTORE and CUSTOM) so a failed dep deploy
    # cannot break the generic-tier signal. The `setup_custom_tests` step deploys each dep + runs
    # `tests/<recipe>/setup_custom_tests.sh` to wire OIDC env via in-place redeploy.
    # `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable).
    depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
    with open(depsfile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_DEPS_FILE"] = depsfile
    # F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here.
    skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
    with contextlib.suppress(OSError):
        os.remove(skipfile)
    os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
    declared = deps_mod.declared_deps(recipe)
    # Q3.2a: a recipe that tolerates OIDC env at first boot AND whose deps are live-warm wires OIDC
    # at INSTALL time (provision the realm BEFORE the single deploy; install_steps.sh writes the env
    # into it) instead of the post-deploy in-place `--chaos` redeploy — which is flaky on the heavy
    # 12-service lasuite-drive stack (collabora WOPI race; see JOURNAL Step 0). Opt-in per recipe.
    oidc_at_install = bool(meta.get("OIDC_AT_INSTALL")) and bool(declared)
    if declared:
        when = "BEFORE deploy (install-time OIDC)" if oidc_at_install else "AFTER generic tiers"
        print(f"\n===== DEPS declared (provision {when}): {declared} =====", flush=True)
    deps_state: dict[str, dict] = {}  # new shape: recipe→entry dict (sso-dep plan §1)
    deps_ready = True
    deps_not_ready_reason: str = ""

    results: dict[str, str] = {}
    lifecycle.janitor()
    dep_teardown_error: str | None = None
    screenshot_rel: str | None = None  # Phase 3 U1 (R4): set once the app screenshot is captured
    try:
        # ---- (Q3.2a) install-time OIDC: provision the warm-dep realm BEFORE the single deploy so
        # install_steps.sh can read $CCCI_DEPS_FILE and wire the OIDC env into that one deploy. On
        # failure we mark deps-not-ready but STILL deploy the recipe alone (install_steps.sh no-ops
        # on an empty deps file) so the generic tiers run; the OIDC custom test then skips → F2-11. ----
        if oidc_at_install:
            print(
                f"\n===== install-time OIDC: provisioning deps {declared} BEFORE deploy =====",
                flush=True,
            )
            try:
                deps_state = _provision_deps(recipe, domain, ref, declared)
                print(
                    "  install-time OIDC: deps provisioned; install_steps.sh will wire OIDC env",
                    flush=True,
                )
            except Exception as e:  # noqa: BLE001 — isolated; recipe still deploys, OIDC test skips
                deps_ready = False
                deps_not_ready_reason = _scrub(str(e))[:300]
                print(
                    f"!! install-time dep provisioning failed (deps-not-ready): {deps_not_ready_reason}",
                    flush=True,
                )

        # ---- deploy RECIPE FIRST, alone (no deps yet — generic tiers run recipe-only) ----
        try:
            lifecycle.deploy_app(
                recipe,
                domain,
                version=base,
                secrets=True,
                install_steps_hook=hook,
                deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", 900)),
            )
            lifecycle.wait_healthy(
                domain,
                ok_codes=tuple(meta["HEALTH_OK"]),
                path=meta["HEALTH_PATH"],
                deploy_timeout=meta["DEPLOY_TIMEOUT"],
                http_timeout=meta["HTTP_TIMEOUT"],
            )
            # Recipe READY_PROBE (e.g. lasuite-drive collabora WOPI discovery) — readiness beyond
            # replica convergence + app HEALTH_PATH; no-op for recipes without one.
            lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.get("DEPLOY_TIMEOUT", 900)))
            deploy_ok = True
        except Exception as e:  # noqa: BLE001 — a failed deploy is a reported INSTALL failure
            print(f"!! deploy/readiness failed: {e}", flush=True)
            deploy_ok = False

        # ---- Phase 3 U1 (R4): capture a real app screenshot while the app is up, at the cleanest
        # "freshly installed + healthy" moment (before any tier mutates state and before teardown).
        # Placed OUTSIDE the deploy try/except so a screenshot issue can NEVER flip deploy_ok.
        # Secret-safe by default (landing page, never a credentials page; recipes opt into a
        # post-login view via a SCREENSHOT meta hook). Best-effort — capture() swallows all errors and
        # returns None, so this never blocks or fails the run (R7). None → results.json `screenshot`
        # stays null → the card shows the "no screenshot" placeholder (cosmetics never change verdict).
        if deploy_ok:
            # capture() already swallows all errors → None; the extra try/except is defense-in-depth
            # (U5 R7 hardening) so a screenshot can NEVER fail/crash the run even if that internal
            # contract regresses or a recipe SCREENSHOT hook raises. Cosmetics never change the verdict.
            try:
                shot = screenshot_mod.capture(
                    domain, screenshot_mod.screenshot_path(run_artifact_dir), recipe_meta=meta
                )
                screenshot_rel = os.path.basename(shot) if shot else None
            except Exception as e:  # noqa: BLE001 — screenshot is cosmetic; never fail a run on it (R7)
                print(
                    f"!! screenshot capture raised (non-fatal, verdict unaffected): {_scrub(str(e))}",
                    flush=True,
                )

        # ---- INSTALL tier (always; additive generic + overlay, no op) ----
        if "install" in stages:
            results["install"] = (
                run_lifecycle_tier(
                    recipe,
                    "install",
                    repo_local,
                    domain,
                    meta,
                    head_ref,
                    op_state,
                    records=records,
                    junit_dir=junit_dir,
                )
                if deploy_ok
                else "fail"
            )

        if deploy_ok:
            # ---- UPGRADE tier (op once → generic + overlay assert) ----
            if "upgrade" in stages:
                results["upgrade"] = (
                    run_lifecycle_tier(
                        recipe,
                        "upgrade",
                        repo_local,
                        domain,
                        meta,
                        head_ref,
                        op_state,
                        records=records,
                        junit_dir=junit_dir,
                    )
                    if prev
                    else "skip"  # only one published version → nothing to upgrade from
                )
            # ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ----
            if "backup" in stages:
                results["backup"] = (
                    run_lifecycle_tier(
                        recipe,
                        "backup",
                        repo_local,
                        domain,
                        meta,
                        head_ref,
                        op_state,
                        records=records,
                        junit_dir=junit_dir,
                    )
                    if backup_cap
                    else "skip"
                )
            if "restore" in stages:
                results["restore"] = (
                    run_lifecycle_tier(
                        recipe,
                        "restore",
                        repo_local,
                        domain,
                        meta,
                        head_ref,
                        op_state,
                        records=records,
                        junit_dir=junit_dir,
                    )
                    if backup_cap
                    else "skip"
                )
            # ---- setup_custom_tests step (NEW, operator-2026-05-28 SSO-dep plan §3.2) ----
            # Deploy each declared dep + wire OIDC env into the parent app via the per-recipe
            # setup_custom_tests.sh hook + in-place redeploy. Failure here marks deps-not-ready
            # but does NOT abort the run — @pytest.mark.requires_deps tests skip with reason;
            # non-deps custom tests still run normally.
            if declared and not oidc_at_install:
                # LEGACY post-deploy path: provision deps AFTER generic tiers, then wire OIDC env
                # into the parent via the setup_custom_tests.sh hook + an in-place `--chaos` redeploy.
                print("\n===== setup_custom_tests: deps + OIDC wiring =====", flush=True)
                try:
                    deps_state = _provision_deps(recipe, domain, ref, declared)
                    # Run the per-recipe post-deps hook (jq-driven OIDC wiring + in-place redeploy)
                    _run_setup_custom_tests_hook(recipe, domain, depsfile)
                except Exception as e:  # noqa: BLE001 — setup failure is ISOLATED to dep-marked tests
                    deps_ready = False
                    deps_not_ready_reason = _scrub(str(e))[:300]
                    print(
                        f"!! setup_custom_tests failed (deps-not-ready): {deps_not_ready_reason}",
                        flush=True,
                    )
            elif declared and oidc_at_install and deps_ready:
                # INSTALL-TIME path (Q3.2a): deps were provisioned BEFORE the single deploy and the
                # install-tier install_steps.sh hook already wired OIDC env into that one deploy —
                # so NO re-provision, NO reconverge here. Run only the post-deploy setup hook
                # (e.g. lasuite-drive's minio-createbuckets one-shot), which needs the live stack.
                print("\n===== post-deploy setup (OIDC already wired at install) =====", flush=True)
                try:
                    _run_setup_custom_tests_hook(recipe, domain, depsfile)
                except Exception as e:  # noqa: BLE001 — isolated to dep-marked / state-dependent tests
                    deps_ready = False
                    deps_not_ready_reason = _scrub(str(e))[:300]
                    print(
                        f"!! post-deploy setup failed: {deps_not_ready_reason}",
                        flush=True,
                    )

            # ---- CUSTOM tier ----
            if "custom" in stages:
                # Pass deps-ready state via env; conftest.py skips @pytest.mark.requires_deps
                # tests when CCCI_DEPS_READY=0.
                os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0"
                os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason
                results["custom"] = run_custom(
                    recipe, repo_local, domain, records=records, junit_dir=junit_dir
                )
        else:
            # install failed → the shared deployment is dead; remaining tiers cannot run on it.
            for op in ("upgrade", "backup", "restore", "custom"):
                if op in stages:
                    results[op] = "skip"
    finally:
        # Teardown the recipe under test FIRST, then deps in reverse declaration order.
        # Parent verify=False (Phase 1d): keep as-is so a parent residual doesn't mask a tier
        # failure. Dep teardown uses verify=True via teardown_deps (F2-5 fix); failures are
        # captured into dep_teardown_error and surfaced in the run summary + exit code, but
        # we still print the diagnosable summary first.
        lifecycle.teardown_app(domain, verify=False)
        if deps_state:
            print("\n===== DEPS teardown =====", flush=True)
            # Flatten the dict-shape state in declaration order; teardown_deps reverses for cold.
            if isinstance(deps_state, dict):
                ordered = [deps_state[d] for d in declared if d in deps_state]
            else:
                ordered = deps_state
            # WC1: warm deps are NOT undeployed — we only delete the per-run realm on the shared
            # live-warm provider (the app stays up for the next run). Cold deps undeploy as before.
            warm_entries = [e for e in ordered if e.get("warm")]
            cold_entries = [e for e in ordered if not e.get("warm")]
            for e in warm_entries:
                try:
                    from harness import sso

                    sso.delete_keycloak_realm(e["domain"], e["realm"])
                    print(
                        f"  dep: deleted per-run realm {e['realm']} on warm {e['recipe']}",
                        flush=True,
                    )
                except Exception as ex:  # noqa: BLE001 — a leaked realm is a teardown failure (§9)
                    # ACCUMULATE rather than overwrite: every leaked realm must reach the summary,
                    # not just the last one (same ` | `-joined shape run_quick uses).
                    leak_msg = f"warm realm delete failed for {e.get('realm')}: {ex}"
                    dep_teardown_error = (
                        f"{dep_teardown_error} | {leak_msg}" if dep_teardown_error else leak_msg
                    )
                    print(f"!! {leak_msg}", flush=True)
            try:
                deps_mod.teardown_deps(cold_entries)
            except lifecycle.TeardownError as e:
                # Append to (never replace) any warm-realm failure recorded just above.
                leak_msg = str(e)
                dep_teardown_error = (
                    f"{dep_teardown_error} | {leak_msg}" if dep_teardown_error else leak_msg
                )
                print(f"!! {leak_msg}", flush=True)

    # ---- deploy-count assertion (DG4.1) ----
    with open(countfile) as f:
        deploy_count = int(f.read().strip() or "0")
    os.remove(countfile)
    with contextlib.suppress(OSError):
        os.remove(statefile)
    with contextlib.suppress(OSError):
        os.remove(depsfile)
    # F2-11: sum the requires_deps skip counts conftest recorded across the custom files.
    requires_deps_skipped = 0
    try:
        with open(skipfile) as f:
            requires_deps_skipped = sum(int(x) for x in f.read().split() if x.strip())
    except OSError:
        pass
    with contextlib.suppress(OSError):
        os.remove(skipfile)

    # ---- per-op summary (DG6 feed) ----
    # SSO-dep plan §1: DG4.1 generalised — one `abra app new` per app in the run (recipe + each
    # COLD dep). In-place reconfigure-and-redeploy (the setup_custom_tests step's
    # `abra app deploy --force --chaos`) is NOT a fresh `app_new` and does NOT increment the count.
    # WC1: a live-warm dep (keycloak) is NOT deployed by the run — it only gets a per-run realm — so
    # warm deps contribute 0. So expected = 1 + (number of COLD deps that actually got deployed).
    _dep_entries = deps_state.values() if isinstance(deps_state, dict) else (deps_state or [])
    deps_deployed_count = sum(
        1 for e in _dep_entries if not (isinstance(e, dict) and e.get("warm"))
    )
    expected_deploy_count = 1 + deps_deployed_count
    print("\n===== RUN SUMMARY =====", flush=True)
    print(f"deploy-count = {deploy_count} (expect {expected_deploy_count})")
    if deps_state:
        deps_list_for_summary = (
            list(deps_state.keys())
            if isinstance(deps_state, dict)
            else [d.get("recipe", "?") for d in deps_state]
        )
        print(f"  deps deployed: {deps_list_for_summary}")
    if not deps_ready:
        print(f"  deps-not-ready: {deps_not_ready_reason}")
    order = [s for s in ALL_STAGES if s in results]
    for op in order:
        suffix = ""
        # F2-11: annotate the custom tier when requires_deps (SSO) tests were skipped, so a reader
        # of the summary can't mistake a green custom tier for "SSO verified".
        if op == "custom" and requires_deps_skipped:
            suffix = f"  ({requires_deps_skipped} requires_deps SKIPPED: deps-not-ready — SSO UNVERIFIED)"
        print(f"  {op:8s}: {results[op]}{suffix}")

    overall = 0
    if deploy_count != expected_deploy_count:
        print(
            f"!! deploy-count {deploy_count} != {expected_deploy_count} (DG4.1 violation)",
            file=sys.stderr,
        )
        overall = 1
    if dep_teardown_error:
        # F2-5: dep teardown leaks violate §9 (teardown sacred); fail the run loudly.
        print(f"!! dep teardown leaked state: {dep_teardown_error}", file=sys.stderr)
        overall = 1
    if any(v == "fail" for v in results.values()):
        overall = 1
    # F2-11: a deps-declaring recipe whose setup_custom_tests failed has NOT verified its SSO/OIDC
    # claim — its requires_deps tests SKIPPED (a skip-only file exits 0, so without this the run
    # would report GREEN). Fail the run for that recipe; generic-tier results above are untouched.
    if sso_dep_unverified(declared, deps_ready, requires_deps_skipped):
        print(
            f"!! recipe declares DEPS={declared} but setup_custom_tests failed and "
            f"{requires_deps_skipped} requires_deps (SSO) test(s) were SKIPPED — SSO claim NOT "
            f"verified; failing run (F2-11). deps-not-ready: {deps_not_ready_reason}",
            file=sys.stderr,
        )
        overall = 1
    if not results:
        print("no tiers ran", file=sys.stderr)
        return 1

    # ---- Phase 3 (R1/R3): assemble results.json (per-stage/per-test + computed level). Best-effort:
    # a failure here NEVER changes `overall` (R7 — cosmetics never block the pipeline). ----
    data: dict | None = None
    try:
        clean_teardown = (deploy_count == expected_deploy_count) and not dep_teardown_error
        data = results_mod.build_results(
            recipe=recipe,
            version=target or (head_ref[:12] if head_ref else None),
            pr=os.environ.get("PR", "0"),
            ref=ref,
            records=records,
            results=results,
            backup_capable=backup_cap,
            clean_teardown=clean_teardown,
            no_secret_leak=True,  # narrowed below by an actual scan of the serialised artifact
            screenshot=screenshot_rel,  # Phase 3 U1 (R4): relative PNG name iff capture succeeded
            finished_ts=time.time(),
            expected_na=meta.get("EXPECTED_NA"),  # declared intentional-skip map (recipe_meta)
        )
        # Real (if narrow) leak check: no known infra-secret value may appear in the artifact (R7).
        blob = json.dumps(data)
        leaked = any(v in blob for v in _REDACT)
        data["flags"]["no_secret_leak"] = not leaked
        if leaked:
            print(
                "!! results.json leak-scan: a known secret value appeared — scrubbing flag set False",
                file=sys.stderr,
            )
        path = results_mod.write_results(data)
        print(
            f"results.json written: {path} (level={data['level']}"
            f"{' — ' + data['level_cap_reason'] if data['level_cap_reason'] else ''})",
            flush=True,
        )
        # Surface UNINTENTIONAL skips in the CI log (non-blocking, R7): a rung that was skipped (N/A)
        # but is not in the recipe's intentional list — either add the missing coverage or declare it.
        for rung in data.get("skips", {}).get("unintentional", []):
            print(
                f"⚠ coverage: rung '{rung}' was skipped (N/A) but is not declared intentional — add "
                f"the missing test/label, or list it in tests/{recipe}/recipe_meta.py "
                f"EXPECTED_NA = {{'{rung}': '<why>'}}.",
                flush=True,
            )
    except Exception as e:  # noqa: BLE001 — results assembly is cosmetic; never fail a run on it (R7)
        print(
            f"!! results.json assembly failed (non-fatal, verdict unaffected): {_scrub(str(e))}",
            file=sys.stderr,
        )

    # ---- Phase 3 U2 (R3/R6): render the summary CARD (HTML→PNG) + level BADGE (SVG) from the
    # results dict into the run artifact dir, alongside results.json + screenshot.png. The card
    # REPORTS results.json verbatim — it computes nothing, so it can never look greener than the tiers
    # (cardinal invariant, plan §6). Separate best-effort block (results.json is already written by
    # here) — any failure is swallowed and NEVER changes `overall` (R7); a render failure simply means
    # no summary.png, and U3/U4 fall back to text. ----
    if data is not None:
        try:
            html_path = os.path.join(run_artifact_dir, "summary.html")
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot")))
            png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png"))
            capped = data.get("level_cap_rung")
            sk = data.get("skips", {})
            cap_skip = (
                "intentional"
                if capped in (sk.get("intentional") or {})
                else "unintentional"
                if capped in (sk.get("unintentional") or [])
                else ""
            )
            with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f:
                f.write(
                    card_mod.level_badge_svg(
                        data["level"], data.get("level_cap_reason", ""), cap_skip
                    )
                )
            print(
                f"summary card {'rendered ' + png if png else '(PNG render unavailable)'} + "
                f"badge.svg written into {run_artifact_dir}",
                flush=True,
            )
        except Exception as e:  # noqa: BLE001 — card/badge are cosmetic; never fail a run (R7)
            print(f"!! summary card/badge render failed (non-fatal): {_scrub(str(e))}", flush=True)

    # WC5 promote-on-green-cold: a GREEN COLD run on LATEST (no PR head) of an enrolled
    # (WARM_CANONICAL) recipe advances/seeds the canonical. ONLY cold-on-latest advances it (a PR
    # `!testme` carries REF and must NOT promote; `--quick` never promotes — handled in run_quick).
    # Non-fatal: a promote failure leaves the OLD known-good intact (never lose it) and is logged.
    if should_promote_canonical(recipe, ref, overall, quick=False):
        try:
            promote_canonical(recipe, head_ref)
        except Exception as e:  # noqa: BLE001 — promote is a post-green bonus; never fail a green run
            print(
                f"!! WC5 promote failed (non-fatal; known-good unchanged): {_scrub(str(e))}",
                flush=True,
            )

    return overall
|
||
|
||
|
||
# Script entry point: run the orchestrator and exit with its return code.
if __name__ == "__main__":
    sys.exit(main())
|