feat(2w): W2 --quick mode in run_recipe_ci.py (WC4+WC7)

run_quick(): opt-in fast lane (CCCI_QUICK=1 / MODE=quick) — reattach the
data-warm canonical (canonical.deploy_canonical, known-good volume) → deps wiring
(warm keycloak + per-run realm) → UPGRADE to PR head (chaos, run_lifecycle_tier
'upgrade': reconverge+moved+serving + overlay) → custom tier. PASS →
undeploy_keep_volume, known-good UNCHANGED (NEVER promote); FAIL → warmsnap.restore
last-known-good + undeploy (roll back, data safe). Always deletes per-run warm
realm. mode=quick labelled lower-confidence (WC7); skips install/backup/restore;
no deploy-count guard (no deploy_app). main() dispatches to run_quick when a
canonical exists, else clean no-canonical fallback to COLD. Cold path byte-identical
(deps wiring intentionally mirrored, not refactored). 61 unit pass; cold untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-29 02:45:44 +01:00
parent 307269b5c6
commit f68e9d463f

View File

@ -40,7 +40,17 @@ import tempfile
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(ROOT, "runner"))
from harness import deps as deps_mod, discovery, generic, lifecycle, naming, warm # noqa: E402
from harness import ( # noqa: E402
abra,
canonical,
deps as deps_mod,
discovery,
generic,
lifecycle,
naming,
warm,
warmsnap,
)
ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom")
@ -386,6 +396,202 @@ def run_custom(recipe: str, repo_local: str | None, domain: str) -> str:
return "pass" if rc_all == 0 else "fail"
def _wait_undeployed(domain: str, timeout: int = 120) -> None:
    """Wait for the stack's services to disappear after an undeploy.

    Polls every 2 seconds until ``lifecycle._docker_names("service", stack)`` is empty, so that
    warmsnap.restore (which requires the stack to be undeployed) doesn't race a half-removed
    stack.

    Args:
        domain: app domain; the stack name is derived from it via ``lifecycle._stack_name``.
        timeout: maximum number of seconds to keep polling.

    Returns:
        None. The wait stays best-effort: if services are still listed once ``timeout`` has
        elapsed, a warning is printed and the function returns anyway (the caller decides what
        to do next) instead of giving up silently.
    """
    # Local import keeps this helper self-contained regardless of the module's top-level imports.
    import time

    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    # Monotonic clock: the deadline must not move if the wall clock is adjusted mid-wait.
    deadline = time.monotonic() + timeout
    while True:
        # Always check at least once, even when timeout <= 0.
        if not lifecycle._docker_names("service", stack):  # noqa: SLF001
            return
        if time.monotonic() >= deadline:
            break
        time.sleep(2)
    print(f"!! services of stack {stack} still present after {timeout}s; continuing anyway",
          flush=True)
def run_quick(recipe: str, ref: str | None, head_ref: str | None, repo_local: str | None,
              meta: dict) -> int:
    """WC4 `--quick` opt-in fast lane (plan §2).

    Reattach the data-warm canonical (known-good volume) → upgrade IN PLACE to the PR head (chaos)
    → assert generic UPGRADE (reconverge+moved+serving) + overlay + custom. PASS →
    undeploy-keep-volume, **known-good UNCHANGED (NEVER promote)**; FAIL → restore the
    last-known-good snapshot + undeploy (roll back, data safe). Lower-confidence; does NOT gate
    merge (WC7). Caller has confirmed a canonical exists.

    NB: the deps wiring + temp-state scaffolding intentionally mirror main()'s cold path rather
    than refactoring it — keeping the gate-passed cold flow byte-identical (zero regression risk).

    Args:
        recipe: recipe name; the canonical domain and registry entry are looked up from it.
        ref: PR ref, passed to cold dependency deployment and shown in the banner.
        head_ref: ref handed to the "upgrade" lifecycle tier.
        repo_local: local test snapshot path handed to the upgrade and custom tiers.
        meta: recipe metadata; reads DEPLOY_TIMEOUT, HEALTH_OK, HEALTH_PATH and HTTP_TIMEOUT.

    Returns:
        0 when the canonical reattached, no tier reported "fail", SSO was not left unverified and
        teardown/rollback raised nothing; 1 otherwise.
    """
    import contextlib

    domain = canonical.canonical_domain(recipe)
    reg = canonical.read_registry(recipe) or {}
    print(
        f"\n== cc-ci run [MODE=quick]: recipe={recipe} canonical={domain} "
        f"known-good={reg.get('version')} ref={ref}\n"
        " quick = LOWER-CONFIDENCE opt-in fast lane; does NOT gate merge; NEVER promotes the canonical",
        flush=True,
    )

    # Per-run scratch files; their paths are exported through the environment for the tiers.
    statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
    with open(statefile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_OP_STATE_FILE"] = statefile
    depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
    with open(depsfile, "w") as f:
        json.dump({}, f)
    os.environ["CCCI_DEPS_FILE"] = depsfile
    skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
    with contextlib.suppress(OSError):
        os.remove(skipfile)  # start from a clean skip report
    os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile

    op_state: dict = {}
    results: dict[str, str] = {}
    declared = deps_mod.declared_deps(recipe)
    deps_state: dict = {}
    deps_ready = True
    deps_not_ready_reason = ""
    # Every teardown/rollback failure is collected (none overwrites another) and joined at the end.
    teardown_errors: list[str] = []
    warm_ok = False
    rolled_back = False
    lifecycle.janitor()
    try:
        # 1) reattach the canonical (warm boot at the known-good version + retained volume)
        try:
            canonical.deploy_canonical(recipe, timeout=int(meta.get("DEPLOY_TIMEOUT", 900)))
            lifecycle.wait_healthy(
                domain, ok_codes=tuple(meta["HEALTH_OK"]), path=meta["HEALTH_PATH"],
                deploy_timeout=meta["DEPLOY_TIMEOUT"], http_timeout=meta["HTTP_TIMEOUT"],
            )
            warm_ok = True
        except Exception as e:  # noqa: BLE001
            print(f"!! canonical reattach/readiness failed: {_scrub(str(e))}", flush=True)
        if warm_ok:
            # 2) deps (warm keycloak + per-run realm) — mirrors main()'s warm/cold split
            if declared:
                print(f"\n===== setup_custom_tests (quick): deps {declared} =====", flush=True)
                try:
                    warm_deps, cold_deps = [], []
                    for d in declared:
                        wd = warm.warm_domain(d)
                        (warm_deps if (wd and warm.is_warm_up(d, wd)) else cold_deps).append(d)
                    dep_metas = {d: _load_meta(d) for d in cold_deps}
                    deps_list = (
                        deps_mod.deploy_deps(recipe, os.environ.get("PR", "0"), ref, cold_deps,
                                             meta_for=dep_metas)
                        if cold_deps else []
                    )
                    for d in warm_deps:
                        wd = warm.warm_domain(d)
                        warm.reap_orphan_realms(d, wd)
                        deps_list.append({"recipe": d, "domain": wd, "warm": True})
                        print(f" dep: using live-warm {d} @ {wd} (per-run realm)", flush=True)
                    deps_state = _enrich_deps_with_sso(recipe, domain, deps_list)
                    deps_mod.write_run_state(deps_state)
                    _run_setup_custom_tests_hook(recipe, domain, depsfile)
                except Exception as e:  # noqa: BLE001
                    deps_ready = False
                    deps_not_ready_reason = _scrub(str(e))[:300]
                    print(f"!! setup_custom_tests failed (deps-not-ready): {deps_not_ready_reason}",
                          flush=True)
            # 3) UPGRADE to PR head (chaos) + assert (generic reconverge+moved+serving + overlay)
            results["upgrade"] = run_lifecycle_tier(
                recipe, "upgrade", repo_local, domain, meta, head_ref, op_state
            )
            # 4) custom tier
            os.environ["CCCI_DEPS_READY"] = "1" if deps_ready else "0"
            os.environ["CCCI_DEPS_NOT_READY_REASON"] = deps_not_ready_reason
            results["custom"] = run_custom(recipe, repo_local, domain)
        else:
            results["upgrade"] = "fail"
            results["custom"] = "skip"
    finally:
        # F2-11 skip count (read before deciding pass/fail)
        requires_deps_skipped = 0
        try:
            with open(skipfile) as f:
                requires_deps_skipped = sum(int(x) for x in f.read().split() if x.strip())
        except OSError:
            pass
        except ValueError as e:
            # A malformed report must not abort this finally block: the dep teardown and the
            # canonical rollback below still have to run. The count stays 0.
            print(f"!! unreadable requires_deps skip report {skipfile}: {e}", flush=True)
        sso_unverified = sso_dep_unverified(declared, deps_ready, requires_deps_skipped)
        passed = (
            warm_ok and bool(results) and all(v != "fail" for v in results.values())
            and not sso_unverified
        )
        # dep teardown: delete per-run warm realms; undeploy cold deps (mirrors cold)
        if deps_state:
            ordered = ([deps_state[d] for d in declared if d in deps_state]
                       if isinstance(deps_state, dict) else deps_state)
            for entry in [x for x in ordered if x.get("warm")]:
                try:
                    from harness import sso
                    sso.delete_keycloak_realm(entry["domain"], entry["realm"])
                    print(f" dep: deleted per-run realm {entry['realm']} on warm {entry['recipe']}",
                          flush=True)
                except Exception as ex:  # noqa: BLE001
                    # Scrubbed like the other exception output of this function.
                    msg = f"warm realm delete failed for {entry.get('realm')}: {_scrub(str(ex))}"
                    teardown_errors.append(msg)
                    print(f"!! {msg}", flush=True)
            try:
                deps_mod.teardown_deps([x for x in ordered if not x.get("warm")])
            except lifecycle.TeardownError as e:
                msg = _scrub(str(e))
                teardown_errors.append(msg)
                print(f"!! {msg}", flush=True)
        # canonical teardown — the WC4 contract:
        #   PASS → undeploy, KEEP volume, known-good UNCHANGED (never promote)
        #   FAIL → restore last-known-good snapshot (data safe) then leave undeployed (idle)
        # NOTE(review): when the reattach itself failed (warm_ok False) nothing is undeployed
        # here — confirm deploy_canonical / janitor cleans up a half-deployed canonical.
        try:
            if warm_ok and passed:
                canonical.undeploy_keep_volume(recipe)
                print(" quick PASS → canonical undeployed, volume retained, known-good UNCHANGED",
                      flush=True)
            elif warm_ok:
                print(" quick FAIL → rolling back canonical to last-known-good snapshot", flush=True)
                abra.undeploy(domain)
                _wait_undeployed(domain)
                warmsnap.restore(recipe, domain)
                canonical._set_status(recipe, "idle")  # noqa: SLF001
                rolled_back = True
                print(" quick FAIL → restored known-good data; canonical idle (NOT promoted)",
                      flush=True)
        except Exception as e:  # noqa: BLE001
            msg = _scrub(str(e))
            teardown_errors.append(f"quick teardown/rollback: {msg}")
            print(f"!! quick teardown/rollback error: {msg}", flush=True)
        # Remove the per-run scratch files; a missing file is fine.
        for scratch in (statefile, depsfile, skipfile):
            with contextlib.suppress(OSError):
                os.remove(scratch)

    dep_teardown_error = " | ".join(teardown_errors)

    print("\n===== RUN SUMMARY =====", flush=True)
    print("mode = quick (LOWER-CONFIDENCE; opt-in; does not gate merge)")
    print(f"canonical = {domain} known-good = {reg.get('version')} (UNCHANGED; quick never promotes)")
    if rolled_back:
        print("rolled-back = yes (restored last-known-good snapshot)")
    for op in ("upgrade", "custom"):
        if op in results:
            suffix = ""
            if op == "custom" and requires_deps_skipped:
                suffix = f" ({requires_deps_skipped} requires_deps SKIPPED — SSO UNVERIFIED)"
            print(f" {op:8s}: {results[op]}{suffix}")
    overall = 0
    if any(v == "fail" for v in results.values()) or not warm_ok:
        overall = 1
    if sso_unverified:
        print(f"!! DEPS={declared} but setup_custom_tests failed and {requires_deps_skipped} "
              "requires_deps SKIPPED — SSO NOT verified (F2-11)", file=sys.stderr)
        overall = 1
    if dep_teardown_error:
        print(f"!! teardown leaked/erred: {dep_teardown_error}", file=sys.stderr)
        overall = 1
    if not results:
        print("no tiers ran", file=sys.stderr)
        return 1
    return overall
def main() -> int:
recipe = os.environ.get("RECIPE")
if not recipe:
@ -408,6 +614,18 @@ def main() -> int:
head_ref = ref or lifecycle.recipe_head_commit(recipe)
repo_local = snapshot_recipe_tests(recipe)
meta = _load_meta(recipe)
# WC4/WC7: opt-in `--quick` fast lane. Requires an existing data-warm canonical; if none, fall
# back cleanly to the full COLD run below so the PR is still tested (DECISIONS Phase-2w).
if os.environ.get("CCCI_QUICK") == "1" or os.environ.get("MODE") == "quick":
if canonical.has_canonical(recipe):
return run_quick(recipe, ref, head_ref, repo_local, meta)
print(
f"MODE=quick requested but no canonical for {recipe} — falling back to COLD run "
"(no-canonical fallback, WC7)",
flush=True,
)
domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref)
# Deploy-once base version: previous published version when the upgrade tier will run and one