1379 lines
64 KiB
Python
1379 lines
64 KiB
Python
#!/usr/bin/env python3
|
||
"""Top-level CI orchestrator (plan §4.3 + Phase 1d/1e), invoked by the Drone pipeline (or by hand).
|
||
|
||
Model: deploy the app ONCE, then run lifecycle TIERS against that single shared deployment, then ONE
teardown in `finally`. Per Phase 1e the orchestrator OWNS each mutating op (HC3): for a tier it runs
the optional pre-op seed hook (recipe ops.py `pre_<op>`), performs the op exactly ONCE
(upgrade/backup/restore — install has none), then runs BOTH the generic assertion file (the floor,
unless explicitly opted out) AND the recipe overlay assertion file (if any) against the shared
post-op state — generic and overlay are ADDITIVE, not override (HC3). Op results an assertion needs
(pre-upgrade identity, snapshot_id) pass op→assertion via a run-scoped JSON state file
($CCCI_OP_STATE_FILE). The upgrade op deploys the PR-HEAD code under test via `abra app deploy
--chaos` (HC1). Repo-local (PR-authored) overlays/hooks run only for allowlist-approved recipes (HC2,
gated in harness.discovery). The generic is the default for every op, so ANY recipe is testable with
zero config (DG1–DG4). The lifecycle OPS live in the shared harness (harness.generic), not per-recipe
(DG7 DRY).

Run parameters from env (set by the comment-bridge via Drone build params):
  RECIPE   recipe name (e.g. custom-html)        [required]
  REF      PR head commit sha                    [optional; used for fetch + run-domain hash]
  PR       PR number                             [optional, default 0]
  SRC      head repo full_name on the mirror     [optional]
  VERSION  upgrade target tag (else newest published) [optional]
  STAGES   comma filter of tiers to run          [optional, default install,upgrade,backup,restore,custom]

Run env (python + pytest + playwright) is provided by `cc-ci-run` (nix/modules/harness.nix);
invoke as: cc-ci-run runner/run_recipe_ci.py
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import contextlib
|
||
import glob
|
||
import importlib.util
|
||
import json
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
import time
|
||
|
||
# Two directory levels above this file, i.e. the parent of the directory that holds this script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
# Put ROOT/runner first on the import path; the `harness` imports that follow are expected to
# resolve from there (hence their E402 suppressions).
sys.path.insert(0, os.path.join(ROOT, "runner"))
|
||
from harness import ( # noqa: E402
|
||
abra,
|
||
canonical,
|
||
discovery,
|
||
generic,
|
||
lifecycle,
|
||
lifetime,
|
||
naming,
|
||
warm,
|
||
warmsnap,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
card as card_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
deps as deps_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
lint as lint_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
manifest as manifest_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
meta as meta_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
results as results_mod,
|
||
)
|
||
from harness import ( # noqa: E402
|
||
screenshot as screenshot_mod,
|
||
)
|
||
|
||
# Every tier name; matches the default STAGES filter listed in the module docstring.
ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom")
|
||
|
||
|
||
def sso_dep_unverified(declared, deps_ready: bool, requires_deps_skipped: int) -> bool:
    """F2-11 gate predicate (pure, unit-tested): was the recipe's SSO claim left unverified?

    True only when all three hold: the recipe declares DEPS, dep provisioning failed (deps not
    ready), and that made at least one `requires_deps` (SSO/OIDC) test SKIP. A skip-only pytest
    file exits 0 and leaves every tier 'pass', so without this gate the run would report GREEN
    for a claim that was never checked. Generic-tier results are left as they are; only the
    green SIGNAL is corrected. The skip>0 condition keeps a deps-declaring recipe that has no
    requires_deps tests from being false-failed.
    """
    if not declared:
        # no DEPS declared → nothing SSO-related could have gone unverified
        return False
    if deps_ready:
        # deps came up, so any requires_deps test had what it needed
        return False
    return requires_deps_skipped > 0
|
||
|
||
|
||
def upgrade_base(stages, meta, recipe: str) -> str | None:
    """Decide which version the single deploy starts from when the upgrade tier is in play.

    Returns the version to deploy as the upgrade base, or None — in which case the caller falls
    back to the target / PR head. (DECISIONS.)

    Resolution order:
      1. `upgrade` not in `stages` → None.
      2. The recipe declares the upgrade rung in EXPECTED_NA → None, with a log line carrying the
         declared reason. Published versions may exist yet be genuinely undeployable — e.g.
         bluesky-pds, where every published tag pins the moving image tag `:0.4` that upstream
         republished with incompatible main builds (phase bsky, DECISIONS). Deploying one would
         fail the INSTALL tier before the PR-head code is ever exercised. With no base the single
         deploy is the PR head itself and the upgrade tier records "skip", which derive_rungs
         classifies as the DECLARED intentional skip (visible in results.json
         `skips.intentional`, never reported as a pass).
      3. recipe_meta UPGRADE_BASE_VERSION, when set. This override exists for PRs whose true
         predecessor is not the harness default (recipe_versions[-2]) — e.g. a PR adding a
         version ABOVE the newest published tag, where the right base is [-1]. It must be an
         exact published version tag. (Adversary §7.1.)
      4. Otherwise the previous published version, from lifecycle.previous_version(recipe).
    """
    if "upgrade" not in stages:
        return None

    expected_na = meta.EXPECTED_NA or {}
    if "upgrade" in expected_na:
        reason = expected_na.get("upgrade")
        print(
            "== upgrade tier: declared EXPECTED_NA['upgrade'] — no upgrade base will be "
            "deployed; the single deploy is the target/PR head. Reason: "
            f"{reason}",
            flush=True,
        )
        return None

    override = meta.UPGRADE_BASE_VERSION
    if override:
        return override
    return lifecycle.previous_version(recipe)
|
||
|
||
|
||
def _truthy(v: str | None) -> bool:
    """Interpret an env-style flag: True for 1/true/yes/on (any case, surrounding blanks ignored)."""
    normalized = str(v or "").strip().lower()
    return normalized in ("1", "true", "yes", "on")
|
||
|
||
|
||
def _redact_values(pattern: str = "/run/secrets/*", min_len: int = 8) -> list[str]:
    """Collect the values to scrub from published logs (D6 redaction filter, plan §4.4).

    Reads every file matched by `pattern` (default: the infra secrets materialised at
    /run/secrets/*) and returns their stripped contents, so that a subprocess echoing one can be
    masked.

    Args:
        pattern: glob of secret files to read.
        min_len: shortest value worth masking. Shorter values are dropped so the filter never
            false-positives on short strings / SHAs.

    Returns:
        The distinct values, longest first.

    A file that cannot be read OR cannot be decoded as text is skipped rather than raised: this
    function runs at import time, so one binary secret must not take the whole harness down.
    """
    found: set[str] = set()
    for path in glob.glob(pattern):
        try:
            # Default (locale) encoding on purpose: run_redacted decodes subprocess output the
            # same way, so the strings compared later are decoded consistently.
            with open(path) as fh:
                value = fh.read().strip()
        except (OSError, UnicodeDecodeError):
            # unreadable, a directory, or not text — it could never match a log line anyway
            continue
        if len(value) >= min_len:
            found.add(value)
    return sorted(found, key=len, reverse=True)
|
||
|
||
|
||
# Secret values to mask, collected once when this module is loaded; read by _scrub().
_REDACT = _redact_values()
|
||
|
||
|
||
def _scrub(text: str) -> str:
    """Return `text` with every known infra-secret value masked (D6 redaction, plan §4.4)."""
    masked = text
    for secret in _REDACT:
        # str.replace is already a no-op when the secret does not occur
        masked = masked.replace(secret, "***REDACTED***")
    return masked
|
||
|
||
|
||
def run_redacted(cmd: list[str], env: dict | None = None) -> int:
    """Run `cmd` from ROOT, streaming its output live with known secrets masked.

    stdout and stderr are merged and forwarded line by line (so Drone logs stay tail-able), each
    line passing through _scrub first. Belt-and-suspenders: the harness never prints secrets and
    abra doesn't echo generated ones.

    Args:
        cmd: argv list to execute.
        env: environment for the child, or None to inherit this process's.

    Returns:
        The child's exit code.
    """
    proc = subprocess.Popen(
        cmd,
        cwd=ROOT,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        # A child may emit bytes that are not valid in the locale encoding (binary app logs,
        # truncated multibyte sequences). Substitute them instead of letting a
        # UnicodeDecodeError abort the run mid-tier.
        errors="replace",
        bufsize=1,  # line-buffered, so output is forwarded as each line completes
    )
    assert proc.stdout is not None  # guaranteed by stdout=PIPE; narrows the Optional for checkers
    for line in proc.stdout:
        sys.stdout.write(_scrub(line))
        sys.stdout.flush()
    return proc.wait()
|
||
|
||
|
||
def _gitea_token() -> str | None:
    """Return the Gitea bot token, or None when none is configured.

    $GITEA_TOKEN wins when it is set and non-empty; otherwise the token is read (stripped) from
    /run/secrets/bridge_gitea_token if that file exists. An empty value counts as "no token".
    """
    from_env = os.environ.get("GITEA_TOKEN")
    if from_env:
        return from_env

    secret_path = "/run/secrets/bridge_gitea_token"
    if not os.path.exists(secret_path):
        return None
    with open(secret_path) as fh:
        from_file = fh.read().strip()
    return from_file or None
|
||
|
||
|
||
def _run_state_path(name: str) -> str:
    """Build the path of a run-scoped state file: <tempdir>/ccci-<name>-<run id>-<pid>.

    The key is run id + harness pid — NEVER the app domain. A second run of the SAME domain
    overlaps this process (its main() preamble executes before it blocks at the app lock inside
    deploy_app), so domain-keyed files get reset/removed under the live run: M2(c) double-!testme
    produced a false DG4.1 deploy-count=2 in run 1 and a countfile FileNotFoundError crash in
    run 2. Children never re-derive these paths — they receive them via the CCCI_*_FILE env
    vars, so the key only has to be unique per harness process.
    """
    filename = "ccci-{}-{}-{}".format(name, results_mod.run_id(), os.getpid())
    return os.path.join(tempfile.gettempdir(), filename)
|
||
|
||
|
||
def setup_run_abra_dir() -> str:
    """P3: create this run's PER-RUN ABRA_DIR, export it as $ABRA_DIR, and return its path.

    Layout of `<runs_dir>/<run-id>/abra/`:
      servers/    symlink to the canonical ~/.abra/servers. App .env files therefore land in the
                  shared canonical path, so janitor discovery (`abra app ls`) and env-based
                  teardown work unchanged from any process; per-domain filenames + the app-domain
                  lock prevent write conflicts.
      catalogue/  symlink to the canonical ~/.abra/catalogue (read-mostly).
      recipes/    fresh + empty — THE isolation that matters: each run clones and git-checkouts
                  its own recipe trees, so concurrent runs (same recipe included) can never
                  corrupt each other's deploy tree. Replaces the per-recipe flock.

    $ABRA_DIR is honored by the abra CLI and by every harness path helper (abra.abra_dir()), so
    this must run BEFORE any abra call. The tree rides along the existing run-dir retention.
    """
    shared_home = os.path.expanduser("~/.abra")

    run_id = results_mod.run_id()
    if run_id == "manual":
        # two concurrent hand-runs must not share a tree
        run_id = f"manual-{os.getpid()}"

    run_abra_dir = os.path.join(results_mod.runs_dir(), run_id, "abra")
    os.makedirs(os.path.join(run_abra_dir, "recipes"), exist_ok=True)

    for name in ("servers", "catalogue"):
        link_path = os.path.join(run_abra_dir, name)
        if os.path.islink(link_path):
            # already linked (e.g. the run dir is being reused) — leave it alone
            continue
        os.symlink(os.path.join(shared_home, name), link_path)

    os.environ["ABRA_DIR"] = run_abra_dir
    print(
        f"== per-run ABRA_DIR: {run_abra_dir} (servers/catalogue -> canonical; fresh recipes/) ==",
        flush=True,
    )
    return run_abra_dir
|
||
|
||
|
||
def _run_git_masked(cmd: list[str], secret: str | None) -> None:
    """Run `cmd` with check=True; on failure re-raise with `secret` masked out of the argv.

    CalledProcessError's message embeds the full command line, so a failing git call that
    carries the bot token in a `-c http.extraHeader=...` argument would otherwise put the token
    into whatever log prints the exception.
    """
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as err:
        if not secret:
            raise
        safe_cmd = [arg.replace(secret, "***REDACTED***") for arg in cmd]
        # `from None`: keep the original (token-bearing) exception out of the printed traceback
        raise subprocess.CalledProcessError(err.returncode, safe_cmd) from None


def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None:
    """Make the recipe available, at the code under test, in THIS RUN's recipe tree.

    The destination is abra.recipe_dir(recipe) ($ABRA_DIR/recipes/<recipe>). Because the tree is
    per-run this is a plain clone — no locking, no rm-rf of any shared state (the rm below only
    clears this run's own leftovers, e.g. a janitor-triggered `abra app ls` auto-clone or a
    Drone build-number reuse).

    Args:
        recipe: recipe name.
        ref: PR head commit to check out (used together with `src`).
        src: head repo full_name on the mirror.

    Modes:
        CCCI_SKIP_FETCH=1  copy the locally staged clone from ~/.abra/recipes/<recipe> as-is.
        src and ref set    clone the mirror repo, check out `ref`, then pull release tags from
                           the public upstream.
        otherwise          `abra recipe fetch` the catalogue copy.

    Private mirror repos need the bot token; it is passed via a per-command http.extraHeader
    (not persisted in .git/config) and masked out of any error raised by a failing git call.

    Raises:
        subprocess.CalledProcessError: if the clone, the checkout or `abra recipe fetch` fails.
    """
    dest = abra.recipe_dir(recipe)
    os.makedirs(os.path.dirname(dest), exist_ok=True)

    # CCCI_SKIP_FETCH=1: use the locally STAGED recipe clone as-is (lets a test/Adversary stage a
    # fake/broken ref — e.g. a simulated broken PR head for the --quick rollback proof — without
    # it being clobbered by a re-fetch). Staging happens in the canonical ~/.abra/recipes/<recipe>;
    # copy it into the per-run tree so the rest of the run reads the staged state. Never set in
    # production CI.
    if os.environ.get("CCCI_SKIP_FETCH") == "1":
        # named `staged`, not `canonical`, so the imported `canonical` module is not shadowed
        staged = os.path.expanduser(f"~/.abra/recipes/{recipe}")
        subprocess.run(["rm", "-rf", dest], check=False)
        if os.path.isdir(staged):
            shutil.copytree(staged, dest, symlinks=True)
        print(
            f"[fetch] CCCI_SKIP_FETCH=1 — using staged {recipe} clone as-is "
            f"(copied {staged} -> per-run tree)",
            flush=True,
        )
        return

    if src and ref:
        url = f"https://git.autonomic.zone/{src}.git"
        git = ["git"]
        tok = _gitea_token()
        if tok:
            git += ["-c", f"http.extraHeader=Authorization: token {tok}"]
        subprocess.run(["rm", "-rf", dest], check=False)
        _run_git_masked([*git, "clone", "--quiet", url, dest], tok)
        _run_git_masked([*git, "-C", dest, "checkout", "--quiet", ref], tok)
        # Bring in published version TAGS from the public upstream so the upgrade tier can deploy
        # a previous published version (mirror PR branches carry no release tags). Read-only +
        # plain git (no bot token to a foreign host). Non-fatal: if unreachable, upgrade degrades
        # to a skip.
        upstream = f"https://git.coopcloud.tech/coop-cloud/{recipe}.git"
        subprocess.run(
            ["git", "-C", dest, "fetch", "--quiet", upstream, "refs/tags/*:refs/tags/*"],
            check=False,
        )
    else:
        # Clean re-fetch from the catalogue. rm first so a leftover dir from a prior SRC+REF run
        # (origin → private mirror, maybe lacking tags) can't poison the catalogue fetch.
        subprocess.run(["rm", "-rf", dest], check=False)
        subprocess.run(["abra", "recipe", "fetch", recipe, "-n"], check=True)
|
||
|
||
|
||
def snapshot_recipe_tests(recipe: str) -> str | None:
    """Copy the recipe-shipped tests/ to a stable per-run temp dir and return that path.

    The copy is immune to abra re-checking-out the recipe to a version tag during the run.

    Returns:
        The snapshot directory, or None when the recipe has no tests/ directory or that
        directory holds neither a test_*.py file nor an install_steps.sh.

    The snapshot is keyed per run (via _run_state_path), not by recipe name alone: two
    concurrent runs of the same recipe would otherwise rmtree and overwrite each other's
    snapshot in the shared tempdir while it is in use.
    """
    src = os.path.join(abra.recipe_dir(recipe), "tests")
    if not os.path.isdir(src):
        return None

    has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile(
        os.path.join(src, "install_steps.sh")
    )
    if not has_overlay:
        return None

    # Resolves to <tempdir>/ccci-recipe-tests-<recipe>-<run id>-<pid>.
    # NOTE(review): the directory is now unique per run — confirm whoever consumes the returned
    # path also removes it, otherwise snapshots accumulate in the tempdir.
    dst = _run_state_path(f"recipe-tests-{recipe}")
    shutil.rmtree(dst, ignore_errors=True)
    shutil.copytree(src, dst)
    return dst
|
||
|
||
|
||
def _tier_env(domain: str) -> dict:
    """Return a copy of the current environment with the app domain and its https base URL set."""
    return {
        **os.environ,
        "CCCI_APP_DOMAIN": domain,
        "CCCI_BASE_URL": f"https://{domain}",
    }
|
||
|
||
|
||
def skip_generic_env_overrides() -> list[str]:
    """List, sorted, the CCCI_SKIP_GENERIC* env variables that are currently set to a truthy value.

    rcust P2c: the meta key is deleted; the env form is a documented LOCAL-DEV-ONLY escape hatch.
    The result is meant to be surfaced loudly when set in a CI (drone) run — an override reduces
    generic-floor coverage and must never silently ride a CI verdict.
    """
    active = [
        name
        for name, value in os.environ.items()
        if name.startswith("CCCI_SKIP_GENERIC") and _truthy(value)
    ]
    active.sort()
    return active
|
||
|
||
|
||
def _skip_generic(op: str) -> bool:
    """Tell whether the generic assertion for `op` is opted out (Phase 1e HC3).

    The default is to run it (generic is additive). Opt-out is via env only (dev-only escape
    hatch, P2c): CCCI_SKIP_GENERIC covers all ops, CCCI_SKIP_GENERIC_<OP> a single one. The
    recipe_meta SKIP_GENERIC key is deleted (zero users).
    """
    if not _truthy(os.environ.get("CCCI_SKIP_GENERIC")):
        # no blanket opt-out — fall back to the per-op variable
        per_op_var = f"CCCI_SKIP_GENERIC_{op.upper()}"
        return _truthy(os.environ.get(per_op_var))
    return True
|
||
|
||
|
||
def _run_pre_hook(recipe: str, op: str, repo_local: str | None, domain: str, meta) -> None:
    """Run the optional pre-op seed hook (recipe ops.py `pre_<op>`), if one is discovered.

    This happens BEFORE the harness performs the op (HC3 op/assertion split): overlays seed
    data-continuity markers / the backup→restore mutation here, then assert post-op in
    test_<op>.py. cc-ci's ops.py is trusted; a repo-local ops.py is consulted only for
    allowlist-approved recipes (the HC2 gate is inside discovery.pre_op_hook).

    The hook file is imported in-process. Its directory is put at the front of sys.path for the
    duration of the call so an ops.py can import its sibling helpers, and is removed afterwards
    whether or not the hook succeeded. Any exception from importing or running the hook
    propagates to the caller.
    """
    found = discovery.pre_op_hook(recipe, op, repo_local)
    if not found:
        return

    source, path = found
    hook_dir = os.path.dirname(path)
    hook_name = f"pre_{op}"

    sys.path.insert(0, hook_dir)
    try:
        spec = importlib.util.spec_from_file_location(f"ccci_ops_{recipe}_{op}", path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        label = f"{os.path.relpath(path, ROOT)}::{hook_name}"
        print(f" pre-op seed ({source}): {label}", flush=True)

        hook_fn = getattr(module, hook_name)
        # Uniform ctx convention (rcust P3): pre_<op>(ctx). A legacy (domain, meta) hook fails
        # HERE with a clear migration message, not a TypeError mid-call.
        meta_mod.check_hook_signature(hook_fn, ("ctx",), label)
        hook_fn(meta_mod.hook_ctx(domain, meta, op=op))
    finally:
        if hook_dir in sys.path:
            sys.path.remove(hook_dir)
|
||
|
||
|
||
def _perform_op(
    op: str,
    domain: str,
    recipe: str,
    head_ref: str | None,
    op_state: dict,
    deploy_timeout: int = 900,
    meta=None,
) -> None:
    """Perform the single mutating op ONCE (the harness owns the op, HC3).

    Records what the assertions need into `op_state`: for upgrade the pre-upgrade identity and
    head_ref, for backup the snapshot_id. install has no op and restore records nothing. None of
    these call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos
    upgrade is not a new install (HC1 reconciliation).

    Args:
        op: "upgrade", "backup" or "restore"; any other value (e.g. "install") is a no-op.
        domain: app domain the op acts on.
        recipe: recipe name (used by the upgrade op).
        head_ref: PR head ref the upgrade op redeploys.
        op_state: dict mutated in place with the op's results.
        deploy_timeout: recipe DEPLOY_TIMEOUT, plumbed to the upgrade chaos redeploy so a heavy
            reconverge isn't SIGKILLed by the 900s default mid-wait.
        meta: recipe meta. Lets the upgrade op own a recipe-aware convergence+health wait
            (F2-12, READY_PROBE) and supplies the optional BACKUP_VERIFY hook.
    """
    if op == "upgrade":
        before = generic.perform_upgrade(
            domain, recipe, head_ref, deploy_timeout=deploy_timeout, meta=meta
        )
        op_state["upgrade"] = {"before": before, "head_ref": head_ref}
    elif op == "backup":
        # Backup integrity + retry (F2-14b). A recipe may define BACKUP_VERIFY that confirms the
        # backup actually captured the recipe's critical data AFTER the op. This guards a real
        # race: a DB recipe dumps its data in a backupbot pre-hook, but if the DB container
        # cycles mid-dump (intermittent under host load) the dump is truncated/absent, so restic
        # snapshots an empty path — `abra app backup create` still "succeeds", yet a later
        # restore silently loses the data (ghost: backup.sql.gz never written → restore can't
        # reimport → seeded row gone). When verify fails we re-run the WHOLE backup (fresh restic
        # snapshot), up to 3 attempts in total. Recipes without BACKUP_VERIFY are unaffected
        # (single backup).
        max_attempts = 3
        snap = generic.perform_backup(domain)
        verify = meta.BACKUP_VERIFY if meta else None
        verify_ctx = meta_mod.hook_ctx(domain, meta, op="backup") if meta else None
        if callable(verify):
            # Verify each backup exactly once and remember the verdict: re-running the hook on
            # an already-judged snapshot costs an extra call and can contradict the first
            # answer.
            attempt = 1
            verified = bool(verify(verify_ctx))
            while not verified and attempt < max_attempts:
                print(
                    f" backup-verify FAILED (attempt {attempt}/{max_attempts}) — backup did not capture the "
                    f"recipe's critical data (e.g. DB cycled mid-dump); re-running backup",
                    flush=True,
                )
                attempt += 1
                snap = generic.perform_backup(domain)
                verified = bool(verify(verify_ctx))
            if not verified:
                # Reported, not raised: the tier carries on with the last snapshot.
                print(
                    f" !! backup-verify still FAILED after {attempt} attempts — backup is incomplete",
                    flush=True,
                )
        op_state["backup"] = {"snapshot_id": snap}
    elif op == "restore":
        generic.perform_restore(domain)
    # install: already deployed; no op
|
||
|
||
|
||
def run_lifecycle_tier(
    recipe: str,
    op: str,
    repo_local: str | None,
    domain: str,
    meta,
    head_ref: str | None,
    op_state: dict,
    records: list[dict] | None = None,
    junit_dir: str | None = None,
) -> str:
    """Run one additive lifecycle tier (HC3) and return 'pass' | 'fail' | 'skip'.

    Sequence: seed (pre-op hook) → perform the op ONCE → run the generic assertion file (unless
    opted out) AND the overlay assertion file, each as its own pytest process against the shared
    post-op deployment. The upgrade op redeploys the PR head (`head_ref`) via chaos (HC1).

    Outcomes:
        'skip'  generic is opted out and there is no overlay — nothing would assert, so the
                mutating op is not performed at all.
        'fail'  the pre-op hook, the op, or writing the op-state file raised, or any pytest
                file exited non-zero.
        'pass'  otherwise.

    Phase 3 (R1/R3): when `records`/`junit_dir` are given, each pytest file is run with
    --junitxml and a {tier,source,file,rc,junit} record is appended, so the run can assemble
    per-stage/per-test results.json + the level afterwards. Purely additive — it does not change
    the verdict.
    """
    overlay = discovery.resolve_overlay_op(recipe, op, repo_local)
    generic_opted_out = _skip_generic(op)

    assertion_files: list[tuple[str, str]] = (
        [] if generic_opted_out else [discovery.generic_op(op)]
    )
    if overlay:
        assertion_files.append(overlay)

    if not assertion_files:
        # generic opted out AND no overlay → nothing would assert; don't perform a pointless
        # mutating op
        print(f"\n===== TIER: {op} — SKIP (generic opted out, no overlay) =====", flush=True)
        return "skip"

    overlay_label = f"{overlay[0]}:{os.path.relpath(overlay[1], ROOT)}" if overlay else "none"
    generic_label = "skip" if generic_opted_out else "run"
    print(
        f"\n===== TIER: {op} (generic={generic_label}, overlay={overlay_label}) =====",
        flush=True,
    )

    # 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure →
    #    tier fail. The op state is persisted so the assertion processes can read it.
    try:
        _run_pre_hook(recipe, op, repo_local, domain, meta)
        _perform_op(
            op,
            domain,
            recipe,
            head_ref,
            op_state,
            deploy_timeout=int(meta.DEPLOY_TIMEOUT),
            meta=meta,
        )
        with open(os.environ["CCCI_OP_STATE_FILE"], "w") as state_fh:
            json.dump(op_state, state_fh)
    except Exception as e:  # noqa: BLE001 — a failed op is a reported tier failure, not a crash
        print(f"!! {op} op failed: {_scrub(str(e))}", flush=True)
        return "fail"

    # 3) assertions: generic (unless opted out) + overlay, each its own pytest, all against the
    #    single post-op deployment. Generic runs first so an overlay may assume readiness.
    any_failed = False
    for source, path in assertion_files:
        rel_path = os.path.relpath(path, ROOT)
        print(f" assert ({source}): {rel_path}", flush=True)

        pytest_cmd = [sys.executable, "-m", "pytest", "-v", "-rA", path]
        junit_path = None
        if junit_dir is not None:
            junit_path = results_mod.junit_file(junit_dir, op, source, path)
            pytest_cmd.append(f"--junitxml={junit_path}")

        rc = run_redacted(pytest_cmd, env=_tier_env(domain))

        if records is not None:
            records.append(
                {
                    "tier": op,
                    "source": source,
                    "file": rel_path,
                    "rc": rc,
                    "junit": junit_path,
                }
            )
        if rc != 0:
            # keep going: every file still runs and is recorded, but the tier is now failed
            any_failed = True

    return "fail" if any_failed else "pass"
|
||
|
||
|
||
def _enrich_deps_with_sso(parent_recipe: str, parent_domain: str, deps_list) -> dict[str, dict]:
    """Provision SSO on each dep and return a recipe→entry dict describing the result.

    For every dep a fresh realm/client + test user is set up through the harness's
    provider-specific setup function. The returned entries carry domain + admin +
    realm/client/user info — the shape the `install_steps.sh` hook (and dependent tests) read.
    Entries lacking a recipe or a domain are dropped.

    Provider routing: today only `keycloak` is supported; any other dep is passed through
    unchanged. authentik will need a parallel `setup_authentik_realm` when an authentik-dep
    recipe enrolls (DEFERRED.md #9).
    """
    from harness import sso, warm  # local import — sso may not be needed for dep-less runs

    # creds fields copied verbatim into the entry, in this order
    copied_fields = (
        "realm",
        "client_id",
        "client_secret",
        "user",
        "password",
        "email",
        "discovery_url",
        "token_url",
        "auth_url",
        "userinfo_url",
    )

    enriched: dict[str, dict] = {}
    for dep in deps_list or []:
        dep_recipe = dep.get("recipe")
        dep_domain = dep.get("domain")
        if not (dep_recipe and dep_domain):
            continue

        if dep_recipe != "keycloak":
            # Provider not yet supported — record bare entry; install_steps.sh / tests will
            # raise if they need realm/client info they don't see.
            enriched[dep_recipe] = dep
            continue

        # The realm is the per-run isolation unit on a (possibly shared live-warm) keycloak:
        # name it "<parent>-<6hex>" so concurrent dependents — even two PRs of the SAME recipe —
        # never collide on a realm (WC1). client_id stays the parent recipe name (isolated
        # within the unique realm; predictable for debugging).
        creds = sso.setup_keycloak_realm(
            dep_domain,
            realm=warm.realm_for(parent_recipe, parent_domain),
            client_id=parent_recipe,
            redirect_uris=[f"https://{parent_domain}/*"],
            web_origins=[f"https://{parent_domain}"],
        )

        record: dict = {
            "recipe": dep_recipe,
            "domain": dep_domain,
            "warm": bool(dep.get("warm")),
        }
        for field in copied_fields:
            record[field] = creds[field]
        record["admin_user"] = "admin"
        record["admin_password"] = sso.admin_password_inside(dep_domain)
        enriched[dep_recipe] = record

    return enriched
|
||
|
||
|
||
def _provision_deps(
    recipe: str, domain: str, ref: str | None, declared: list[str]
) -> dict[str, dict]:
    """Provision a run's declared deps, write `$CCCI_DEPS_FILE`, return the recipe→entry state.

    Deps are split into live-warm (a shared provider at a stable domain + a per-run realm) and
    cold (co-deployed per run). A dep that has a warm domain which is not up falls back to cold.
    Each dep's SSO realm/client/user is then provisioned and the enriched dict — the one the
    `install_steps.sh` hooks + dependent tests read — is persisted.

    Raises on any failure (the caller marks deps-not-ready). Install-time wiring is the ONLY
    mode (rcust P2b): provision BEFORE the single deploy so the install-tier `install_steps.sh`
    hook wires OIDC env into that one deploy — no reconverge, no post-deploy
    `setup_custom_tests.sh` machinery.
    """
    warm_deps: list[str] = []
    cold_deps: list[str] = []
    for dep in declared:
        warm_dom = warm.warm_domain(dep)
        if warm_dom and warm.is_warm_up(dep, warm_dom):
            warm_deps.append(dep)
            continue
        if warm_dom:
            print(f" dep: {dep} warm provider {warm_dom} not up — cold fallback", flush=True)
        cold_deps.append(dep)

    dep_metas = {dep: meta_mod.load(dep) for dep in cold_deps}
    if cold_deps:
        deps_list = deps_mod.deploy_deps(
            recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=dep_metas
        )
    else:
        deps_list = []

    for dep in warm_deps:
        warm_dom = warm.warm_domain(dep)
        reaped = warm.reap_orphan_realms(dep, warm_dom)
        if reaped:
            print(
                f" dep: reaped {len(reaped)} orphan realm(s) on warm {dep}: {reaped}",
                flush=True,
            )
        deps_list.append({"recipe": dep, "domain": warm_dom, "warm": True})
        print(f" dep: using live-warm {dep} @ {warm_dom} (per-run realm)", flush=True)

    deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
    deps_mod.write_run_state(deps_state)
    return deps_state
|
||
|
||
|
||
def run_custom(
    recipe: str,
    repo_local: str | None,
    domain: str,
    records: list[dict] | None = None,
    junit_dir: str | None = None,
) -> str:
    """Run every discovered non-lifecycle custom test_*.py (both locations, additive).

    Each file runs as its own pytest process with the tier environment for `domain`.

    Returns:
        'skip' when no custom test is defined, 'fail' when any file exits non-zero, else 'pass'.

    Phase 3: when `junit_dir` is given each file gets a --junitxml report; when `records` is
    given a {tier,source,file,rc,junit} record is appended per file.
    """
    customs = discovery.custom_tests(recipe, repo_local)
    if not customs:
        return "skip"

    print("\n===== TIER: custom =====", flush=True)

    any_failed = False
    for source, path in customs:
        rel = os.path.relpath(path, ROOT)
        print(f" custom ({source}): {rel}", flush=True)

        pytest_cmd = [sys.executable, "-m", "pytest", "-v", "-rA", path]
        junit_path = None
        if junit_dir is not None:
            junit_path = results_mod.junit_file(junit_dir, "custom", source, path)
            pytest_cmd.append(f"--junitxml={junit_path}")

        rc = run_redacted(pytest_cmd, env=_tier_env(domain))

        if records is not None:
            records.append(
                {"tier": "custom", "source": source, "file": rel, "rc": rc, "junit": junit_path}
            )
        if rc != 0:
            # remaining files still run; the tier verdict is already 'fail'
            any_failed = True

    return "fail" if any_failed else "pass"
|
||
|
||
|
||
def _wait_undeployed(domain: str, timeout: int = 120) -> None:
    """Wait, up to `timeout` seconds, for the stack's services to be gone after an undeploy.

    This exists so warmsnap.restore, which requires an undeployed stack, doesn't race a
    half-removed one. The service list is polled every 2 seconds.

    Best-effort by design: when services are still listed at the deadline the function logs a
    warning and returns instead of raising, leaving the decision to the caller.

    Args:
        domain: app domain whose stack is being removed.
        timeout: maximum number of seconds to wait.
    """
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    # monotonic clock: a wall-clock adjustment must neither cut the wait short nor stretch it
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not lifecycle._docker_names("service", stack):  # noqa: SLF001
            return
        time.sleep(2)
    # The last poll preceded a sleep, so look once more before calling it a timeout.
    if lifecycle._docker_names("service", stack):  # noqa: SLF001
        print(
            f"!! stack {stack}: services still present {timeout}s after undeploy — continuing anyway",
            flush=True,
        )
|
||
|
||
|
||
def run_quick(
|
||
recipe: str, ref: str | None, head_ref: str | None, repo_local: str | None, meta
|
||
) -> int:
|
||
"""WC4 `--quick` opt-in fast lane (plan §2). Reattach the data-warm canonical (known-good volume)
|
||
→ upgrade IN PLACE to the PR head (chaos) → assert generic UPGRADE (reconverge+moved+serving) +
|
||
overlay + custom. PASS → undeploy-keep-volume, **known-good UNCHANGED (NEVER promote)**; FAIL →
|
||
restore the last-known-good snapshot + undeploy (roll back, data safe). Lower-confidence; does
|
||
NOT gate merge (WC7). Caller has confirmed a canonical exists.
|
||
|
||
NB: the deps wiring + temp-state scaffolding intentionally mirror main()'s cold path rather than
|
||
refactoring it — keeping the gate-passed cold flow byte-identical (zero regression risk)."""
|
||
import contextlib
|
||
|
||
domain = canonical.canonical_domain(recipe)
|
||
reg = canonical.read_registry(recipe) or {}
|
||
print(
|
||
f"\n== cc-ci run [MODE=quick]: recipe={recipe} canonical={domain} "
|
||
f"known-good={reg.get('version')} ref={ref}\n"
|
||
" quick = LOWER-CONFIDENCE opt-in fast lane; does NOT gate merge; NEVER promotes the canonical",
|
||
flush=True,
|
||
)
|
||
|
||
statefile = _run_state_path("opstate") + ".json"
|
||
with open(statefile, "w") as f:
|
||
json.dump({}, f)
|
||
os.environ["CCCI_OP_STATE_FILE"] = statefile
|
||
depsfile = _run_state_path("deps") + ".json"
|
||
with open(depsfile, "w") as f:
|
||
json.dump({}, f)
|
||
os.environ["CCCI_DEPS_FILE"] = depsfile
|
||
skipfile = _run_state_path("depskip") + ".txt"
|
||
with contextlib.suppress(OSError):
|
||
os.remove(skipfile)
|
||
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
|
||
|
||
op_state: dict = {}
|
||
results: dict[str, str] = {}
|
||
declared = list(meta.DEPS)
|
||
deps_state: dict = {}
|
||
deps_ready = True
|
||
deps_not_ready_reason = ""
|
||
dep_teardown_error: str | None = None
|
||
warm_ok = False
|
||
rolled_back = False
|
||
|
||
lifecycle.janitor()
|
||
try:
|
||
# 1) reattach the canonical (warm boot at the known-good version + retained volume)
|
||
try:
|
||
canonical.deploy_canonical(recipe, timeout=int(meta.DEPLOY_TIMEOUT))
|
||
lifecycle.wait_healthy(
|
||
domain,
|
||
ok_codes=tuple(meta.HEALTH_OK),
|
||
path=meta.HEALTH_PATH,
|
||
deploy_timeout=meta.DEPLOY_TIMEOUT,
|
||
http_timeout=meta.HTTP_TIMEOUT,
|
||
)
|
||
warm_ok = True
|
||
except Exception as e: # noqa: BLE001
|
||
print(f"!! canonical reattach/readiness failed: {_scrub(str(e))}", flush=True)
|
||
|
||
if warm_ok:
|
||
# 2) deps (warm keycloak + per-run realm) — mirrors main()'s warm/cold split. NB
|
||
# (rcust P2b): deps are provisioned (realm/creds in $CCCI_DEPS_FILE) but quick mode
|
||
# cannot do install-time OIDC env wiring — the canonical app pre-exists its per-run
|
||
# realm. No quick-enrolled recipe declares DEPS today; if one ever does, its
|
||
# requires_deps tests will exercise creds-only flows or skip (F2-11 keeps the signal).
|
||
if declared:
|
||
print(f"\n===== deps (quick): {declared} =====", flush=True)
|
||
try:
|
||
warm_deps, cold_deps = [], []
|
||
for d in declared:
|
||
wd = warm.warm_domain(d)
|
||
(warm_deps if (wd and warm.is_warm_up(d, wd)) else cold_deps).append(d)
|
||
dep_metas = {d: meta_mod.load(d) for d in cold_deps}
|
||
deps_list = (
|
||
deps_mod.deploy_deps(
|
||
recipe, os.environ.get("PR", "0"), ref, cold_deps, meta_for=dep_metas
|
||
)
|
||
if cold_deps
|
||
else []
|
||
)
|
||
for d in warm_deps:
|
||
wd = warm.warm_domain(d)
|
||
warm.reap_orphan_realms(d, wd)
|
||
deps_list.append({"recipe": d, "domain": wd, "warm": True})
|
||
print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True)
|
||
deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
|
||
deps_mod.write_run_state(deps_state)
|
||
except Exception as e: # noqa: BLE001
|
||
deps_ready = False
|
||
deps_not_ready_reason = _scrub(str(e))[:300]
|
||
print(
|
||
f"!! dep provisioning failed (deps-not-ready): {deps_not_ready_reason}",
|
||
flush=True,
|
||
)
|
||
|
||
# 3) UPGRADE to PR head (chaos) + assert (generic reconverge+moved+serving + overlay)
|
||
results["upgrade"] = run_lifecycle_tier(
|
||
recipe, "upgrade", repo_local, domain, meta, head_ref, op_state
|
||
)
|
||
# 4) custom tier
|
||
os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0"
|
||
os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason
|
||
results["custom"] = run_custom(recipe, repo_local, domain)
|
||
else:
|
||
results["upgrade"] = "fail"
|
||
results["custom"] = "skip"
|
||
finally:
|
||
# Teardown funnel running: further SIGTERM/SIGALRM are logged + ignored (lifetime.py).
|
||
lifetime.begin_teardown()
|
||
# F2-11 skip count (read before deciding pass/fail)
|
||
requires_deps_skipped = 0
|
||
try:
|
||
with open(skipfile) as f:
|
||
requires_deps_skipped = sum(int(x) for x in f.read().split() if x.strip())
|
||
except OSError:
|
||
pass
|
||
sso_unverified = sso_dep_unverified(declared, deps_ready, requires_deps_skipped)
|
||
passed = (
|
||
warm_ok
|
||
and bool(results)
|
||
and all(v != "fail" for v in results.values())
|
||
and not sso_unverified
|
||
)
|
||
|
||
# dep teardown: delete per-run warm realms; undeploy cold deps (mirrors cold)
|
||
if deps_state:
|
||
ordered = (
|
||
[deps_state[d] for d in declared if d in deps_state]
|
||
if isinstance(deps_state, dict)
|
||
else deps_state
|
||
)
|
||
for e in [x for x in ordered if x.get("warm")]:
|
||
try:
|
||
from harness import sso
|
||
|
||
sso.delete_keycloak_realm(e["domain"], e["realm"])
|
||
print(
|
||
f" dep: deleted per-run realm {e['realm']} on warm {e['recipe']}",
|
||
flush=True,
|
||
)
|
||
except Exception as ex: # noqa: BLE001
|
||
dep_teardown_error = f"warm realm delete failed for {e.get('realm')}: {ex}"
|
||
print(f"!! {dep_teardown_error}", flush=True)
|
||
try:
|
||
deps_mod.teardown_deps([x for x in ordered if not x.get("warm")])
|
||
except lifecycle.TeardownError as e:
|
||
dep_teardown_error = str(e)
|
||
print(f"!! {dep_teardown_error}", flush=True)
|
||
|
||
# canonical teardown — the WC4 contract:
|
||
# PASS → undeploy, KEEP volume, known-good UNCHANGED (never promote)
|
||
# FAIL → restore last-known-good snapshot (data safe) then leave undeployed (idle)
|
||
try:
|
||
if warm_ok and passed:
|
||
canonical.undeploy_keep_volume(recipe)
|
||
print(
|
||
" quick PASS → canonical undeployed, volume retained, known-good UNCHANGED",
|
||
flush=True,
|
||
)
|
||
elif warm_ok:
|
||
print(
|
||
" quick FAIL → rolling back canonical to last-known-good snapshot", flush=True
|
||
)
|
||
abra.undeploy(domain)
|
||
_wait_undeployed(domain)
|
||
warmsnap.restore(recipe, domain)
|
||
# reset recorded version to the known-good (the failed upgrade set TYPE to the broken
|
||
# PR commit) so the idle canonical's .env agrees with the registry + re-warms cleanly.
|
||
if reg.get("version"):
|
||
abra.env_set(domain, "TYPE", f"{recipe}:{reg['version']}")
|
||
canonical._set_status(recipe, "idle") # noqa: SLF001
|
||
rolled_back = True
|
||
print(
|
||
" quick FAIL → restored known-good data; canonical idle (NOT promoted)",
|
||
flush=True,
|
||
)
|
||
except Exception as e: # noqa: BLE001
|
||
dep_teardown_error = (dep_teardown_error or "") + f" | quick teardown/rollback: {e}"
|
||
print(f"!! quick teardown/rollback error: {e}", flush=True)
|
||
|
||
with contextlib.suppress(OSError):
|
||
os.remove(statefile)
|
||
with contextlib.suppress(OSError):
|
||
os.remove(depsfile)
|
||
with contextlib.suppress(OSError):
|
||
os.remove(skipfile)
|
||
|
||
print("\n===== RUN SUMMARY =====", flush=True)
|
||
print("mode = quick (LOWER-CONFIDENCE; opt-in; does not gate merge)")
|
||
print(
|
||
f"canonical = {domain} known-good = {reg.get('version')} (UNCHANGED; quick never promotes)"
|
||
)
|
||
if rolled_back:
|
||
print("rolled-back = yes (restored last-known-good snapshot)")
|
||
for op in ("upgrade", "custom"):
|
||
if op in results:
|
||
suffix = ""
|
||
if op == "custom" and requires_deps_skipped:
|
||
suffix = f" ({requires_deps_skipped} requires_deps SKIPPED — SSO UNVERIFIED)"
|
||
print(f" {op:8s}: {results[op]}{suffix}")
|
||
|
||
overall = 0
|
||
if any(v == "fail" for v in results.values()) or not warm_ok:
|
||
overall = 1
|
||
if sso_unverified:
|
||
print(
|
||
f"!! DEPS={declared} but dep provisioning failed and {requires_deps_skipped} "
|
||
"requires_deps SKIPPED — SSO NOT verified (F2-11)",
|
||
file=sys.stderr,
|
||
)
|
||
overall = 1
|
||
if dep_teardown_error:
|
||
print(f"!! teardown leaked/erred: {dep_teardown_error}", file=sys.stderr)
|
||
overall = 1
|
||
if not results:
|
||
print("no tiers ran", file=sys.stderr)
|
||
return 1
|
||
return overall
|
||
|
||
|
||
def should_promote_canonical(recipe: str, ref: str | None, overall: int, quick: bool) -> bool:
    """WC5 gate: decide whether this run may advance/seed the recipe's canonical.

    Promotion is allowed only when ALL of the following hold:

    * the recipe is enrolled (``canonical.is_enrolled(recipe)``, i.e. WARM_CANONICAL);
    * the run was GREEN (``overall == 0``);
    * the run was COLD (``quick`` is false);
    * the run was on LATEST, meaning no PR head was supplied (``ref`` is empty/None), as for the
      nightly sweep or a manual ``RECIPE=<r>`` run.

    A PR ``!testme`` carries REF=PR-head and therefore never promotes the canonical to a PR's code.
    """
    enrolled = canonical.is_enrolled(recipe)
    if not enrolled:
        # Hand back the falsy enrolment answer itself, exactly as an `and` chain would.
        return enrolled

    run_was_green = overall == 0
    run_was_cold = not quick
    ran_on_latest = not ref
    return run_was_green and run_was_cold and ran_on_latest
|
||
|
||
|
||
def promote_canonical(recipe: str, head_ref: str | None) -> None:
    """WC5: (re)seed the canonical at the green-verified LATEST.

    Deploys ``warm-<recipe>`` at the latest published version (reattaching the retained canonical
    volume if one exists — an in-place version bump — else a fresh install), waits for it to be
    healthy, undeploys it, then snapshots + records the registry via ``canonical.seed_canonical``
    (atomic replace of the last-known-good). The OLD known-good is replaced ONLY here, after
    green, so it is never lost on a red run.

    Args:
        recipe: recipe name whose canonical is being advanced.
        head_ref: commit recorded alongside the new known-good (passed to ``seed_canonical``).

    Returns:
        None. When the recipe has no version tags the function logs and returns without deploying.

    Raises:
        Whatever ``deploy_app`` / ``wait_healthy`` / ``undeploy`` / ``seed_canonical`` raise. On a
        deploy or health failure the canonical app is undeployed (best-effort) before the original
        exception is re-raised, so a failed promote never leaves ``warm-<recipe>`` running.
    """
    import warm_reconcile as wr

    domain = canonical.canonical_domain(recipe)
    wr.fetch_recipe(recipe)
    latest = wr.latest_version(wr.recipe_tags(recipe))
    if not latest:
        print(f"WC5 promote: no version tags for {recipe} — skip", flush=True)
        return
    meta = meta_mod.load(recipe)
    # The cold run's deploy-count was already asserted + the countfile removed; don't perturb it.
    os.environ.pop("CCCI_DEPLOY_COUNT_FILE", None)
    print(
        f"\n===== WC5 promote-on-green-cold: (re)seed canonical {recipe} @ {latest} =====",
        flush=True,
    )
    try:
        lifecycle.deploy_app(
            recipe,
            domain,
            version=latest,
            secrets=True,
            deploy_timeout=int(meta.DEPLOY_TIMEOUT),
            meta=meta,
        )
        lifecycle.wait_healthy(
            domain,
            ok_codes=tuple(meta.HEALTH_OK),
            path=meta.HEALTH_PATH,
            deploy_timeout=meta.DEPLOY_TIMEOUT,
            http_timeout=meta.HTTP_TIMEOUT,
        )
    except Exception:
        # The caller treats a promote failure as non-fatal and does no cleanup of its own, so
        # without this the (possibly half-deployed) canonical app would stay up indefinitely.
        # Undeploy best-effort, then let the ORIGINAL failure propagate unchanged.
        try:
            abra.undeploy(domain)
            _wait_undeployed(domain)
        except Exception as cleanup_err:  # noqa: BLE001 — best-effort; the original error wins
            print(
                f"!! WC5 promote: cleanup undeploy of {domain} failed: {_scrub(str(cleanup_err))}",
                flush=True,
            )
        raise
    abra.undeploy(domain)
    _wait_undeployed(domain)
    # Only reached after a healthy deploy + clean undeploy: now replace the known-good.
    canonical.seed_canonical(recipe, latest, commit=head_ref)
    print(
        f"WC5 promote: canonical {recipe} advanced to known-good {latest} (idle, volume retained)",
        flush=True,
    )
|
||
|
||
|
||
def main() -> int:
    """Run one CI run for ``$RECIPE`` and return the process exit code.

    Reads RECIPE / REF / SRC / VERSION / STAGES / PR (plus CCCI_QUICK / MODE) from the
    environment. When quick mode is requested and a canonical exists, the whole run is delegated
    to ``run_quick``. Otherwise this is the COLD path: provision declared deps, deploy the recipe
    once, run the selected tiers against that shared deployment, tear everything down in
    ``finally``, print the run summary, then (best-effort) write results.json, the summary
    card/badge, and possibly promote the canonical.

    Returns:
        2 when RECIPE is unset; the ``run_quick`` result on the quick path; 1 when no tier ran,
        the deploy count is off, dep teardown failed, any tier reported "fail", or
        ``sso_dep_unverified`` holds; 0 otherwise. The results.json / card / promote steps never
        change the returned value.
    """
    # P1 lock-lifetime hardening: PDEATHSIG + SIGTERM/SIGALRM teardown funnel + 60-min hard
    # deadline, armed before ANY abra call or lock acquisition (see harness/lifetime.py).
    lifetime.install_lifetime_guards()
    recipe = os.environ.get("RECIPE")
    if not recipe:
        print("RECIPE env is required", file=sys.stderr)
        return 2
    # Empty-string env values are normalised to None.
    ref = os.environ.get("REF") or None
    src = os.environ.get("SRC") or None
    target = os.environ.get("VERSION") or None
    # STAGES is a comma list; blanks are dropped. Names not in ALL_STAGES are kept in the set but
    # never match a tier check below.
    stages = {
        s.strip() for s in os.environ.get("STAGES", ",".join(ALL_STAGES)).split(",") if s.strip()
    }

    print(
        f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}"
    )
    # P2c: the CCCI_SKIP_GENERIC* env escape hatch is LOCAL-DEV-ONLY. If it rides a CI (drone)
    # run, shout — generic-floor coverage is reduced and the verdict must not look routine.
    for ov in skip_generic_env_overrides():
        if os.environ.get("DRONE"):
            print(
                f"!! {ov}=1 — dev-only generic-floor override ACTIVE IN A CI RUN; generic "
                "assertions are suppressed for the affected op(s). This must never gate a merge.",
                flush=True,
            )
        else:
            print(f"== {ov}=1 (dev-only generic-floor override active)", flush=True)
    # Concurrent-run safety is structural: this run's recipe trees live in its own ABRA_DIR
    # (exported here, before ANY abra call), so no recipe-tree lock exists; same-DOMAIN runs
    # serialise on the app-domain flock taken in deploy_app (see docs/concurrency.md).
    setup_run_abra_dir()
    fetch_recipe(recipe, ref, src)
    # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test
    # (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout
    # HEAD (the catalogue current) for a non-PR `!testme`. Captured before any version-tag checkout.
    head_ref = ref or lifecycle.recipe_head_commit(recipe)
    repo_local = snapshot_recipe_tests(recipe)
    meta = meta_mod.load(recipe)

    # Customization manifest (rcust P5, R4): ONE block answering "what does this recipe
    # customize?" across all surfaces — printed here and embedded verbatim in results.json under
    # "customization". Pure presentation; never influences a verdict.
    customization = manifest_mod.build(recipe, meta, repo_local)
    print("\n" + manifest_mod.render(recipe, customization) + "\n", flush=True)

    # WC4/WC7: opt-in `--quick` fast lane. Requires an existing data-warm canonical; if none, fall
    # back cleanly to the full COLD run below so the PR is still tested (DECISIONS Phase-2w).
    if os.environ.get("CCCI_QUICK") == "1" or os.environ.get("MODE") == "quick":
        if canonical.has_canonical(recipe):
            return run_quick(recipe, ref, head_ref, repo_local, meta)
        print(
            f"MODE=quick requested but no canonical for {recipe} — falling back to COLD run "
            "(no-canonical fallback, WC7)",
            flush=True,
        )

    domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref)

    # `prev` is the upgrade base (falsy → the upgrade tier is reported "skip"); the initial deploy
    # uses it when present, else the explicit VERSION target.
    prev = upgrade_base(stages, meta, recipe)
    base = prev or target
    backup_cap = generic.backup_capable(recipe, meta)
    hook = discovery.install_steps(recipe, repo_local)

    # Deploy-count guard (DG4.1): exactly one deploy_app() per run.
    countfile = _run_state_path("deploys")
    with open(countfile, "w") as f:
        f.write("0")
    os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile

    # Phase 3 (R1/R3): per-run artifact dir + JUnit dir. The tiers emit JUnit per file and append a
    # {tier,source,file,rc,junit} record; after the run we assemble results.json (per-stage/per-test +
    # level) into the artifact dir. Best-effort — never changes the verdict (R7).
    run_artifact_dir = os.path.join(results_mod.runs_dir(), results_mod.run_id())
    junit_dir = os.path.join(run_artifact_dir, "junit")
    records: list[dict] = []

    # L5 lint rung (phase lvl5): `abra recipe lint` against the EXACT tested ref, in a pristine
    # scratch clone (harness.lint — the per-run tree is still at head_ref here, before any
    # version-pinning checkout). Level rung only — NEVER the verdict: run_lint catches every
    # failure mode into status "unver" (60s hard budget) and this belt-and-braces wrap makes a
    # crashed executor identical to "could not verify".
    lint_result = {"status": "unver", "detail": "lint executor crashed", "rules_failed": []}
    try:
        lint_result = lint_mod.run_lint(recipe, head_ref, run_artifact_dir)
    except Exception as e:  # noqa: BLE001 — lint is a rung, not a gate; never touches the verdict
        print(
            f"!! lint rung executor crashed (non-fatal, rung=unver): {_scrub(str(e))}", flush=True
        )
    print(
        f"lint rung: {lint_result['status']}"
        f"{' — ' + lint_result['detail'] if lint_result.get('detail') else ''}",
        flush=True,
    )
    with contextlib.suppress(OSError):
        os.makedirs(junit_dir, exist_ok=True)

    # Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup
    # snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state().
    statefile = _run_state_path("opstate") + ".json"
    with open(statefile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_OP_STATE_FILE"] = statefile
    op_state: dict = {}

    # Run-scoped dep state (Phase 2 Q2.3; install-time-only since rcust P2b): deps are provisioned
    # BEFORE the single deploy so install_steps.sh wires OIDC env into that one deploy.
    # `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable).
    depsfile = _run_state_path("deps") + ".json"
    with open(depsfile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_DEPS_FILE"] = depsfile
    # F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here.
    skipfile = _run_state_path("depskip") + ".txt"
    # Start from a clean slate: drop any stale report left at the same path.
    with contextlib.suppress(OSError):
        os.remove(skipfile)
    os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
    declared = list(meta.DEPS)
    if declared:
        print(f"\n===== DEPS declared (provision BEFORE deploy): {declared} =====", flush=True)
    deps_state: dict[str, dict] = {}  # new shape: recipe→entry dict (sso-dep plan §1)
    deps_ready = True
    deps_not_ready_reason: str = ""

    # Tier name → "pass"/"fail"/"skip"-style verdict string; drives the summary and exit code.
    results: dict[str, str] = {}
    lifecycle.janitor()
    dep_teardown_error: str | None = None
    screenshot_rel: str | None = None  # Phase 3 U1 (R4): set once the app screenshot is captured
    try:
        # ---- (Q3.2a) install-time OIDC: provision the warm-dep realm BEFORE the single deploy so
        # install_steps.sh can read $CCCI_DEPS_FILE and wire the OIDC env into that one deploy. On
        # failure we mark deps-not-ready but STILL deploy the recipe alone (install_steps.sh no-ops
        # on an empty deps file) so the generic tiers run; the OIDC custom test then skips → F2-11. ----
        if declared:
            print(
                f"\n===== install-time OIDC: provisioning deps {declared} BEFORE deploy =====",
                flush=True,
            )
            try:
                deps_state = _provision_deps(recipe, domain, ref, declared)
                print(
                    "  install-time OIDC: deps provisioned; install_steps.sh will wire OIDC env",
                    flush=True,
                )
            except Exception as e:  # noqa: BLE001 — isolated; recipe still deploys, OIDC test skips
                deps_ready = False
                deps_not_ready_reason = _scrub(str(e))[:300]
                print(
                    f"!! install-time dep provisioning failed (deps-not-ready): {deps_not_ready_reason}",
                    flush=True,
                )

        # ---- the single deploy of the recipe under test (declared deps, if any, were provisioned
        # just above; on a provisioning failure the recipe deploys alone) ----
        try:
            lifecycle.deploy_app(
                recipe,
                domain,
                version=base,
                secrets=True,
                install_steps_hook=hook,
                deploy_timeout=int(meta.DEPLOY_TIMEOUT),
                meta=meta,
            )
            lifecycle.wait_healthy(
                domain,
                ok_codes=tuple(meta.HEALTH_OK),
                path=meta.HEALTH_PATH,
                deploy_timeout=meta.DEPLOY_TIMEOUT,
                http_timeout=meta.HTTP_TIMEOUT,
            )
            # Recipe READY_PROBE (e.g. lasuite-drive collabora WOPI discovery) — readiness beyond
            # replica convergence + app HEALTH_PATH; no-op for recipes without one.
            lifecycle.wait_ready_probes(
                meta, domain, timeout=int(meta.DEPLOY_TIMEOUT), op="install"
            )
            deploy_ok = True
        except Exception as e:  # noqa: BLE001 — a failed deploy is a reported INSTALL failure
            # NOTE(review): unlike the other handlers here, this message is printed without _scrub.
            print(f"!! deploy/readiness failed: {e}", flush=True)
            deploy_ok = False

        # ---- Phase 3 U1 (R4): capture a real app screenshot while the app is up, at the cleanest
        # "freshly installed + healthy" moment (before any tier mutates state and before teardown).
        # Placed OUTSIDE the deploy try/except so a screenshot issue can NEVER flip deploy_ok.
        # Secret-safe by default (landing page, never a credentials page; recipes opt into a
        # post-login view via a SCREENSHOT meta hook). Best-effort — capture() swallows all errors and
        # returns None, so this never blocks or fails the run (R7). None → results.json `screenshot`
        # stays null → the card shows the "no screenshot" placeholder (cosmetics never change verdict).
        if deploy_ok:
            # capture() already swallows all errors → None; the extra try/except is defense-in-depth
            # (U5 R7 hardening) so a screenshot can NEVER fail/crash the run even if that internal
            # contract regresses or a recipe SCREENSHOT hook raises. Cosmetics never change the verdict.
            try:
                shot = screenshot_mod.capture(
                    domain, screenshot_mod.screenshot_path(run_artifact_dir), recipe_meta=meta
                )
                screenshot_rel = os.path.basename(shot) if shot else None
            except Exception as e:  # noqa: BLE001 — screenshot is cosmetic; never fail a run on it (R7)
                print(
                    f"!! screenshot capture raised (non-fatal, verdict unaffected): {_scrub(str(e))}",
                    flush=True,
                )

        # ---- INSTALL tier (always; additive generic + overlay, no op) ----
        # A failed deploy is recorded as an install "fail" without running the assertions.
        if "install" in stages:
            results["install"] = (
                run_lifecycle_tier(
                    recipe,
                    "install",
                    repo_local,
                    domain,
                    meta,
                    head_ref,
                    op_state,
                    records=records,
                    junit_dir=junit_dir,
                )
                if deploy_ok
                else "fail"
            )

        if deploy_ok:
            # ---- UPGRADE tier (op once → generic + overlay assert) ----
            if "upgrade" in stages:
                results["upgrade"] = (
                    run_lifecycle_tier(
                        recipe,
                        "upgrade",
                        repo_local,
                        domain,
                        meta,
                        head_ref,
                        op_state,
                        records=records,
                        junit_dir=junit_dir,
                    )
                    if prev
                    else "skip"  # no upgrade base: single published version, or declared EXPECTED_NA
                )
            # ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ----
            if "backup" in stages:
                results["backup"] = (
                    run_lifecycle_tier(
                        recipe,
                        "backup",
                        repo_local,
                        domain,
                        meta,
                        head_ref,
                        op_state,
                        records=records,
                        junit_dir=junit_dir,
                    )
                    if backup_cap
                    else "skip"
                )
            if "restore" in stages:
                results["restore"] = (
                    run_lifecycle_tier(
                        recipe,
                        "restore",
                        repo_local,
                        domain,
                        meta,
                        head_ref,
                        op_state,
                        records=records,
                        junit_dir=junit_dir,
                    )
                    if backup_cap
                    else "skip"
                )
            # (rcust P2b: install-time deps wiring is the ONLY mode — deps were provisioned BEFORE
            # the single deploy and install_steps.sh wired the OIDC env into it. The legacy
            # post-deploy provisioning + setup_custom_tests.sh redeploy machinery is deleted; a
            # recipe's post-deploy seeding belongs in ops.py pre_install, e.g. lasuite-drive's
            # MinIO bucket one-shot.)

            # ---- CUSTOM tier ----
            if "custom" in stages:
                # Pass deps-ready state via env; conftest.py skips @pytest.mark.requires_deps
                # tests when CCCI_DEPS_READY=0.
                os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0"
                os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason
                results["custom"] = run_custom(
                    recipe, repo_local, domain, records=records, junit_dir=junit_dir
                )
        else:
            # install failed → the shared deployment is dead; remaining tiers cannot run on it.
            for op in ("upgrade", "backup", "restore", "custom"):
                if op in stages:
                    results[op] = "skip"
    finally:
        # From here the teardown funnel runs: a SIGTERM/SIGALRM landing now is logged + ignored
        # (lifetime.py) so a second signal can't abort the cleanup the first one asked for.
        lifetime.begin_teardown()
        # Teardown the recipe under test FIRST, then deps in reverse declaration order.
        # Parent verify=False (Phase 1d): keep as-is so a parent residual doesn't mask a tier
        # failure. Dep teardown uses verify=True via teardown_deps (F2-5 fix); failures are
        # captured into dep_teardown_error and surfaced in the run summary + exit code, but
        # we still print the diagnosable summary first.
        lifecycle.teardown_app(domain, verify=False)
        if deps_state:
            print("\n===== DEPS teardown =====", flush=True)
            # Flatten the dict-shape state in declaration order; teardown_deps reverses for cold.
            if isinstance(deps_state, dict):
                ordered = [deps_state[d] for d in declared if d in deps_state]
            else:
                ordered = deps_state
            # WC1: warm deps are NOT undeployed — we only delete the per-run realm on the shared
            # live-warm provider (the app stays up for the next run). Cold deps undeploy as before.
            warm_entries = [e for e in ordered if e.get("warm")]
            cold_entries = [e for e in ordered if not e.get("warm")]
            # NOTE(review): dep_teardown_error is ASSIGNED (not appended) at each failure below,
            # so with several failures only the last message reaches the summary; the exit code
            # is still 1 either way.
            for e in warm_entries:
                try:
                    from harness import sso

                    sso.delete_keycloak_realm(e["domain"], e["realm"])
                    print(
                        f"  dep: deleted per-run realm {e['realm']} on warm {e['recipe']}",
                        flush=True,
                    )
                except Exception as ex:  # noqa: BLE001 — a leaked realm is a teardown failure (§9)
                    dep_teardown_error = f"warm realm delete failed for {e.get('realm')}: {ex}"
                    print(f"!! {dep_teardown_error}", flush=True)
            try:
                deps_mod.teardown_deps(cold_entries)
            except lifecycle.TeardownError as e:
                dep_teardown_error = str(e)
                print(f"!! {dep_teardown_error}", flush=True)

    # ---- deploy-count assertion (DG4.1) ----
    # NOTE(review): unlike the state/deps/skip files below, the count file is opened and removed
    # without an OSError guard, so a missing file raises here.
    with open(countfile) as f:
        deploy_count = int(f.read().strip() or "0")
    os.remove(countfile)
    with contextlib.suppress(OSError):
        os.remove(statefile)
    with contextlib.suppress(OSError):
        os.remove(depsfile)
    # F2-11: sum the requires_deps skip counts conftest recorded across the custom files.
    requires_deps_skipped = 0
    try:
        with open(skipfile) as f:
            requires_deps_skipped = sum(int(x) for x in f.read().split() if x.strip())
    except OSError:
        # No report file means nothing was recorded as skipped.
        pass
    with contextlib.suppress(OSError):
        os.remove(skipfile)

    # ---- per-op summary (DG6 feed) ----
    # SSO-dep plan §1: DG4.1 generalised — one `abra app new` per app in the run (recipe + each
    # COLD dep). Chaos redeploys are NOT a fresh `app_new` and do NOT increment the count.
    # WC1: a live-warm dep (keycloak) is NOT deployed by the run — it only gets a per-run realm — so
    # warm deps contribute 0. So expected = 1 + (number of COLD deps that actually got deployed).
    _dep_entries = deps_state.values() if isinstance(deps_state, dict) else (deps_state or [])
    deps_deployed_count = sum(
        1 for e in _dep_entries if not (isinstance(e, dict) and e.get("warm"))
    )
    expected_deploy_count = 1 + deps_deployed_count
    print("\n===== RUN SUMMARY =====", flush=True)
    print(f"deploy-count = {deploy_count}  (expect {expected_deploy_count})")
    if deps_state:
        deps_list_for_summary = (
            list(deps_state.keys())
            if isinstance(deps_state, dict)
            else [d.get("recipe", "?") for d in deps_state]
        )
        print(f"  deps deployed: {deps_list_for_summary}")
    if not deps_ready:
        print(f"  deps-not-ready: {deps_not_ready_reason}")
    # Print tiers in the canonical ALL_STAGES order, only those that produced a result.
    order = [s for s in ALL_STAGES if s in results]
    for op in order:
        suffix = ""
        # F2-11: annotate the custom tier when requires_deps (SSO) tests were skipped, so a reader
        # of the summary can't mistake a green custom tier for "SSO verified".
        if op == "custom" and requires_deps_skipped:
            suffix = f"  ({requires_deps_skipped} requires_deps SKIPPED: deps-not-ready — SSO UNVERIFIED)"
        print(f"  {op:8s}: {results[op]}{suffix}")

    # Exit-code computation: any one of the checks below turns the run red.
    overall = 0
    if deploy_count != expected_deploy_count:
        print(
            f"!! deploy-count {deploy_count} != {expected_deploy_count} (DG4.1 violation)",
            file=sys.stderr,
        )
        overall = 1
    if dep_teardown_error:
        # F2-5: dep teardown leaks violate §9 (teardown sacred); fail the run loudly.
        print(f"!! dep teardown leaked state: {dep_teardown_error}", file=sys.stderr)
        overall = 1
    if any(v == "fail" for v in results.values()):
        overall = 1
    # F2-11: a deps-declaring recipe whose dep provisioning failed has NOT verified its SSO/OIDC
    # claim — its requires_deps tests SKIPPED (a skip-only file exits 0, so without this the run
    # would report GREEN). Fail the run for that recipe; generic-tier results above are untouched.
    if sso_dep_unverified(declared, deps_ready, requires_deps_skipped):
        print(
            f"!! recipe declares DEPS={declared} but dep provisioning failed and "
            f"{requires_deps_skipped} requires_deps (SSO) test(s) were SKIPPED — SSO claim NOT "
            f"verified; failing run (F2-11). deps-not-ready: {deps_not_ready_reason}",
            file=sys.stderr,
        )
        overall = 1
    if not results:
        # Early exit: no results.json / card / promote is produced for an empty run.
        print("no tiers ran", file=sys.stderr)
        return 1

    # ---- Phase 3 (R1/R3): assemble results.json (per-stage/per-test + computed level). Best-effort:
    # a failure here NEVER changes `overall` (R7 — cosmetics never block the pipeline). ----
    data: dict | None = None
    try:
        clean_teardown = (deploy_count == expected_deploy_count) and not dep_teardown_error
        data = results_mod.build_results(
            recipe=recipe,
            version=target or (head_ref[:12] if head_ref else None),
            pr=os.environ.get("PR", "0"),
            ref=ref,
            records=records,
            results=results,
            backup_capable=backup_cap,
            has_upgrade_target=prev is not None,  # structural: a deployable upgrade base exists
            lint=lint_result,  # L5 rung (phase lvl5)
            clean_teardown=clean_teardown,
            no_secret_leak=True,  # narrowed below by an actual scan of the serialised artifact
            screenshot=screenshot_rel,  # Phase 3 U1 (R4): relative PNG name iff capture succeeded
            finished_ts=time.time(),
            expected_na=meta.EXPECTED_NA,  # declared intentional-skip map (recipe_meta)
            customization=customization,  # rcust P5: the run-start manifest, verbatim
        )
        # Real (if narrow) leak check: no known infra-secret value may appear in the artifact (R7).
        blob = json.dumps(data)
        leaked = any(v in blob for v in _REDACT)
        data["flags"]["no_secret_leak"] = not leaked
        if leaked:
            print(
                "!! results.json leak-scan: a known secret value appeared — scrubbing flag set False",
                file=sys.stderr,
            )
        path = results_mod.write_results(data)
        print(f"results.json written: {path} (level={data['level']} of 5)", flush=True)
        # Surface UNVERIFIED rungs in the CI log (non-blocking, R7): a rung that should have run
        # and wasn't verified blocks the level above it — fill the coverage, or (where a
        # declared/structural reason genuinely applies) declare it in EXPECTED_NA.
        for rung in data.get("skips", {}).get("unintentional", []):
            print(
                f"⚠ coverage: rung '{rung}' is UNVERIFIED (did not run / could not be checked) — "
                f"the level cannot rise above it. Add the missing test/coverage, or declare a "
                f"genuine inapplicability in tests/{recipe}/recipe_meta.py "
                f"EXPECTED_NA = {{'{rung}': '<why>'}}.",
                flush=True,
            )
    except Exception as e:  # noqa: BLE001 — results assembly is cosmetic; never fail a run on it (R7)
        print(
            f"!! results.json assembly failed (non-fatal, verdict unaffected): {_scrub(str(e))}",
            file=sys.stderr,
        )

    # ---- Phase 3 U2 (R3/R6): render the summary CARD (HTML→PNG) + level BADGE (SVG) from the
    # results dict into the run artifact dir, alongside results.json + screenshot.png. The card
    # REPORTS results.json verbatim — it computes nothing, so it can never look greener than the tiers
    # (cardinal invariant, plan §6). Separate best-effort block (results.json is already written by
    # here) — any failure is swallowed and NEVER changes `overall` (R7); a render failure simply means
    # no summary.png, and U3/U4 fall back to text. ----
    if data is not None:
        try:
            html_path = os.path.join(run_artifact_dir, "summary.html")
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot")))
            png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png"))
            # Badge = level only (number + colour) — the per-rung table on the card is the sole
            # carrier of "why isn't this higher" (operator-specified, phase lvl5).
            with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f:
                f.write(card_mod.level_badge_svg(data["level"]))
            print(
                f"summary card {'rendered ' + png if png else '(PNG render unavailable)'} + "
                f"badge.svg written into {run_artifact_dir}",
                flush=True,
            )
        except Exception as e:  # noqa: BLE001 — card/badge are cosmetic; never fail a run (R7)
            print(f"!! summary card/badge render failed (non-fatal): {_scrub(str(e))}", flush=True)

    # WC5 promote-on-green-cold: a GREEN COLD run on LATEST (no PR head) of an enrolled
    # (WARM_CANONICAL) recipe advances/seeds the canonical. ONLY cold-on-latest advances it (a PR
    # `!testme` carries REF and must NOT promote; `--quick` never promotes — handled in run_quick).
    # Non-fatal: a promote failure leaves the OLD known-good intact (never lose it) and is logged.
    if should_promote_canonical(recipe, ref, overall, quick=False):
        try:
            promote_canonical(recipe, head_ref)
        except Exception as e:  # noqa: BLE001 — promote is a post-green bonus; never fail a green run
            print(
                f"!! WC5 promote failed (non-fatal; known-good unchanged): {_scrub(str(e))}",
                flush=True,
            )

    return overall
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: hand main()'s integer verdict to the interpreter as the exit status.
    sys.exit(main())
|