diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py
index 4125915..c8b24b8 100644
--- a/runner/run_recipe_ci.py
+++ b/runner/run_recipe_ci.py
@@ -40,7 +40,17 @@ import tempfile
 
 ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, os.path.join(ROOT, "runner"))
-from harness import deps as deps_mod, discovery, generic, lifecycle, naming, warm  # noqa: E402
+from harness import (  # noqa: E402
+    abra,
+    canonical,
+    deps as deps_mod,
+    discovery,
+    generic,
+    lifecycle,
+    naming,
+    warm,
+    warmsnap,
+)
 
 ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom")
 
@@ -386,6 +396,231 @@ def run_custom(recipe: str, repo_local: str | None, domain: str) -> str:
     return "pass" if rc_all == 0 else "fail"
 
 
+def _wait_undeployed(domain: str, timeout: int = 120) -> None:
+    """Block until the stack's services are gone after an undeploy.
+
+    warmsnap.restore requires an undeployed stack, so the rollback must not race a half-removed
+    one. Polls the stack's service list every 2 seconds.
+
+    Raises:
+        TimeoutError: services are still listed once ``timeout`` seconds have elapsed.
+    """
+    stack = lifecycle._stack_name(domain)  # noqa: SLF001
+    # Monotonic clock: a wall-clock adjustment must not stretch or cut the wait.
+    deadline = time.monotonic() + timeout
+    while True:
+        if not lifecycle._docker_names("service", stack):  # noqa: SLF001
+            return
+        if time.monotonic() >= deadline:
+            # Returning here would let the caller restore into a stack that is still up.
+            raise TimeoutError(f"stack {stack} still has services {timeout}s after undeploy")
+        time.sleep(2)
+
+
+def run_quick(recipe: str, ref: str | None, head_ref: str | None, repo_local: str | None,
+              meta: dict) -> int:
+    """WC4 `--quick` opt-in fast lane (plan §2).
+
+    Reattach the data-warm canonical (known-good volume) → upgrade IN PLACE to the PR head
+    (chaos) → assert generic UPGRADE (reconverge+moved+serving) + overlay + custom.
+    PASS → undeploy-keep-volume, **known-good UNCHANGED (NEVER promote)**; FAIL → restore the
+    last-known-good snapshot + undeploy (roll back, data safe). Lower-confidence; does NOT gate
+    merge (WC7). Caller has confirmed a canonical exists.
+
+    Returns the status main() passes through: 0 only if the canonical reattached, no tier failed,
+    SSO was not left unverified, the skip report was readable and teardown was clean; else 1.
+
+    NB: the deps wiring + temp-state scaffolding intentionally mirror main()'s cold path rather
+    than refactoring it — keeping the gate-passed cold flow byte-identical (zero regression
+    risk).
+    """
+    import contextlib
+
+    domain = canonical.canonical_domain(recipe)
+    reg = canonical.read_registry(recipe) or {}
+    print(
+        f"\n== cc-ci run [MODE=quick]: recipe={recipe} canonical={domain} "
+        f"known-good={reg.get('version')} ref={ref}\n"
+        " quick = LOWER-CONFIDENCE opt-in fast lane; does NOT gate merge; NEVER promotes the canonical",
+        flush=True,
+    )
+
+    statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
+    with open(statefile, "w") as f:
+        json.dump({}, f)
+    os.environ["CCCI_OP_STATE_FILE"] = statefile
+    depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
+    with open(depsfile, "w") as f:
+        json.dump({}, f)
+    os.environ["CCCI_DEPS_FILE"] = depsfile
+    skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
+    with contextlib.suppress(OSError):
+        os.remove(skipfile)
+    os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
+
+    op_state: dict = {}
+    results: dict[str, str] = {}
+    declared = deps_mod.declared_deps(recipe)
+    deps_state: dict = {}
+    deps_ready = True
+    deps_not_ready_reason = ""
+    teardown_errors: list[str] = []  # every teardown/rollback failure, in order
+    skip_report_error = ""
+    warm_ok = False
+    rolled_back = False
+
+    lifecycle.janitor()
+    try:
+        # 1) reattach the canonical (warm boot at the known-good version + retained volume)
+        try:
+            canonical.deploy_canonical(recipe, timeout=int(meta.get("DEPLOY_TIMEOUT", 900)))
+            lifecycle.wait_healthy(
+                domain, ok_codes=tuple(meta["HEALTH_OK"]), path=meta["HEALTH_PATH"],
+                deploy_timeout=meta["DEPLOY_TIMEOUT"], http_timeout=meta["HTTP_TIMEOUT"],
+            )
+            warm_ok = True
+        except Exception as e:  # noqa: BLE001
+            print(f"!! canonical reattach/readiness failed: {_scrub(str(e))}", flush=True)
+
+        if warm_ok:
+            # 2) deps (warm keycloak + per-run realm) — mirrors main()'s warm/cold split
+            if declared:
+                print(f"\n===== setup_custom_tests (quick): deps {declared} =====", flush=True)
+                try:
+                    warm_deps, cold_deps = [], []
+                    for d in declared:
+                        wd = warm.warm_domain(d)
+                        (warm_deps if (wd and warm.is_warm_up(d, wd)) else cold_deps).append(d)
+                    dep_metas = {d: _load_meta(d) for d in cold_deps}
+                    deps_list = (
+                        deps_mod.deploy_deps(recipe, os.environ.get("PR", "0"), ref, cold_deps,
+                                             meta_for=dep_metas)
+                        if cold_deps else []
+                    )
+                    for d in warm_deps:
+                        wd = warm.warm_domain(d)
+                        warm.reap_orphan_realms(d, wd)
+                        deps_list.append({"recipe": d, "domain": wd, "warm": True})
+                        print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True)
+                    deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
+                    deps_mod.write_run_state(deps_state)
+                    _run_setup_custom_tests_hook(recipe, domain, depsfile)
+                except Exception as e:  # noqa: BLE001
+                    deps_ready = False
+                    deps_not_ready_reason = _scrub(str(e))[:300]
+                    print(f"!! setup_custom_tests failed (deps-not-ready): {deps_not_ready_reason}",
+                          flush=True)
+
+            # 3) UPGRADE to PR head (chaos) + assert (generic reconverge+moved+serving + overlay)
+            results["upgrade"] = run_lifecycle_tier(
+                recipe, "upgrade", repo_local, domain, meta, head_ref, op_state
+            )
+            # 4) custom tier
+            os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0"
+            os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason
+            results["custom"] = run_custom(recipe, repo_local, domain)
+        else:
+            results["upgrade"] = "fail"
+            results["custom"] = "skip"
+    finally:
+        # F2-11 skip count (read before deciding pass/fail)
+        requires_deps_skipped = 0
+        try:
+            with open(skipfile) as f:
+                requires_deps_skipped = sum(int(x) for x in f.read().split())
+        except OSError:
+            pass  # no readable report: the count stays 0
+        except ValueError as e:
+            # A malformed report must not raise out of this finally block before teardown runs;
+            # record it so the run is treated as failed instead.
+            skip_report_error = f"unreadable requires_deps skip report {skipfile}: {e}"
+            print(f"!! {skip_report_error}", flush=True)
+        sso_unverified = sso_dep_unverified(declared, deps_ready, requires_deps_skipped)
+        passed = (
+            warm_ok and bool(results) and all(v != "fail" for v in results.values())
+            and not sso_unverified and not skip_report_error
+        )
+
+        # dep teardown: delete per-run warm realms; undeploy cold deps (mirrors cold)
+        if deps_state:
+            ordered = ([deps_state[d] for d in declared if d in deps_state]
+                       if isinstance(deps_state, dict) else deps_state)
+            for dep in [x for x in ordered if x.get("warm")]:
+                try:
+                    from harness import sso
+                    sso.delete_keycloak_realm(dep["domain"], dep["realm"])
+                    print(f" dep: deleted per-run realm {dep['realm']} on warm {dep['recipe']}",
+                          flush=True)
+                except Exception as ex:  # noqa: BLE001
+                    msg = _scrub(f"warm realm delete failed for {dep.get('realm')}: {ex}")
+                    teardown_errors.append(msg)
+                    print(f"!! {msg}", flush=True)
+            try:
+                deps_mod.teardown_deps([x for x in ordered if not x.get("warm")])
+            except lifecycle.TeardownError as e:
+                msg = _scrub(str(e))
+                teardown_errors.append(msg)
+                print(f"!! {msg}", flush=True)
+
+        # canonical teardown — the WC4 contract:
+        # PASS → undeploy, KEEP volume, known-good UNCHANGED (never promote)
+        # FAIL → restore last-known-good snapshot (data safe) then leave undeployed (idle)
+        # NOTE(review): when the reattach failed (warm_ok False) nothing below undeploys the
+        # canonical — confirm a half-started stack is cleaned up elsewhere.
+        try:
+            if warm_ok and passed:
+                canonical.undeploy_keep_volume(recipe)
+                print(" quick PASS → canonical undeployed, volume retained, known-good UNCHANGED",
+                      flush=True)
+            elif warm_ok:
+                print(" quick FAIL → rolling back canonical to last-known-good snapshot", flush=True)
+                abra.undeploy(domain)
+                _wait_undeployed(domain)
+                warmsnap.restore(recipe, domain)
+                canonical._set_status(recipe, "idle")  # noqa: SLF001
+                rolled_back = True
+                print(" quick FAIL → restored known-good data; canonical idle (NOT promoted)",
+                      flush=True)
+        except Exception as e:  # noqa: BLE001
+            msg = _scrub(str(e))
+            teardown_errors.append(f"quick teardown/rollback: {msg}")
+            print(f"!! quick teardown/rollback error: {msg}", flush=True)
+
+        for path in (statefile, depsfile, skipfile):
+            with contextlib.suppress(OSError):
+                os.remove(path)
+
+    print("\n===== RUN SUMMARY =====", flush=True)
+    print("mode = quick (LOWER-CONFIDENCE; opt-in; does not gate merge)")
+    print(f"canonical = {domain} known-good = {reg.get('version')} (UNCHANGED; quick never promotes)")
+    if rolled_back:
+        print("rolled-back = yes (restored last-known-good snapshot)")
+    for op in ("upgrade", "custom"):
+        if op in results:
+            suffix = ""
+            if op == "custom" and requires_deps_skipped:
+                suffix = f" ({requires_deps_skipped} requires_deps SKIPPED — SSO UNVERIFIED)"
+            print(f" {op:8s}: {results[op]}{suffix}")
+
+    overall = 0
+    if any(v == "fail" for v in results.values()) or not warm_ok:
+        overall = 1
+    if sso_unverified:
+        print(f"!! DEPS={declared} but setup_custom_tests failed and {requires_deps_skipped} "
+              "requires_deps SKIPPED — SSO NOT verified (F2-11)", file=sys.stderr)
+        overall = 1
+    if skip_report_error:
+        print(f"!! {skip_report_error}", file=sys.stderr)
+        overall = 1
+    if teardown_errors:
+        print(f"!! teardown leaked/erred: {' | '.join(teardown_errors)}", file=sys.stderr)
+        overall = 1
+    if not results:
+        print("no tiers ran", file=sys.stderr)
+        return 1
+    return overall
+
+
 def main() -> int:
     recipe = os.environ.get("RECIPE")
     if not recipe:
@@ -408,6 +643,18 @@ def main() -> int:
     head_ref = ref or lifecycle.recipe_head_commit(recipe)
     repo_local = snapshot_recipe_tests(recipe)
     meta = _load_meta(recipe)
+
+    # WC4/WC7: opt-in `--quick` fast lane. Requires an existing data-warm canonical; if none, fall
+    # back cleanly to the full COLD run below so the PR is still tested (DECISIONS Phase-2w).
+    if os.environ.get("CCCI_QUICK") == "1" or os.environ.get("MODE") == "quick":
+        if canonical.has_canonical(recipe):
+            return run_quick(recipe, ref, head_ref, repo_local, meta)
+        print(
+            f"MODE=quick requested but no canonical for {recipe} — falling back to COLD run "
+            "(no-canonical fallback, WC7)",
+            flush=True,
+        )
+
     domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref)
 
     # Deploy-once base version: previous published version when the upgrade tier will run and one