#!/usr/bin/env python3 """Nightly full-cold sweep (Phase 2w / WC6). Invoked by the `nightly-sweep` systemd timer (nix/modules/nightly-sweep.nix). Order (plan WC6): 1. Roll warm/infra to latest, HEALTH-GATED (WC1.1): re-run the keycloak + traefik reconcilers (warm_reconcile.py — fetch latest recipe → deploy → health-gate → commit/rollback+alert). This is the health-gated "warm/infra → latest" step; a full operator `nixos-rebuild switch` is the config-deploy path, not the autonomous nightly's job (DECISIONS Phase-2w WC6). 2. FULL-COLD sweep across enrolled (WARM_CANONICAL) recipes, SERIAL (MAX_TESTS honored — one at a time), each `RECIPE= run_recipe_ci.py` on LATEST (no REF) → a green run promotes/refreshes that recipe's canonical (WC5). Serves as the daily authoritative regression. MUST NOT run while a test/Drone build is in flight: if a `run_recipe_ci.py` is already active, skip this nightly (defer to the next) rather than pile on the single node. Bounded + serial. Exit 0 even if some recipes fail (logs per-recipe results; a red recipe just doesn't advance its canonical). """ from __future__ import annotations import os import subprocess import sys # The sweep drives the recipe RUNS (run_recipe_ci) + reads enrollment (tests//recipe_meta.py), # which live in the cc-ci CHECKOUT (the nix store packages only runner/, not tests/). So operate # against $CCCI_REPO (default /root/cc-ci) — the same checkout run_recipe_ci already runs from. REPO = os.environ.get("CCCI_REPO", "/root/cc-ci") sys.path.insert(0, os.path.join(REPO, "runner")) from harness import canonical # noqa: E402 WARM_APPS = ["keycloak", "traefik"] # the live-warm/infra reconcilers to roll first (health-gated) def _here() -> str: return os.path.join(REPO, "runner") def _another_run_active() -> bool: """True if a run_recipe_ci.py is already executing (don't pile onto the single node).""" r = subprocess.run(["pgrep", "-f", "run_recipe_ci.py"], capture_output=True, text=True) mine = str(os.getpid()) pids = [p for p in r.stdout.split() if p and p != mine] return bool(pids) def roll_warm_infra() -> None: """Re-run the health-gated reconcilers so keycloak + traefik roll to latest (WC1.1).""" for app in WARM_APPS: print(f"\n===== nightly: roll warm/infra {app} (health-gated) =====", flush=True) rc = subprocess.run( [sys.executable, os.path.join(_here(), "warm_reconcile.py"), app] ).returncode print(f"nightly: reconcile {app} rc={rc}", flush=True) def sweep() -> int: recipes = canonical.enrolled_recipes() print(f"\n===== nightly cold sweep: enrolled canonicals = {recipes} =====", flush=True) results: dict[str, int] = {} for r in recipes: print(f"\n===== nightly: full-cold {r} (latest) =====", flush=True) env = dict(os.environ, RECIPE=r) env.pop("REF", None) # latest, not a PR head env.pop("CCCI_QUICK", None) env.pop("MODE", None) rc = subprocess.run( [sys.executable, os.path.join(_here(), "run_recipe_ci.py")], env=env ).returncode results[r] = rc print(f"nightly: {r} rc={rc} ({'green→canonical refreshed' if rc == 0 else 'red'})", flush=True) # WC8 disk hygiene: drop warm data for de-enrolled canonicals; log the disk budget. pruned = canonical.prune_stale() if pruned: print(f"nightly: pruned stale warm data for de-enrolled canonicals: {pruned}", flush=True) df = subprocess.run(["df", "-h", "/"], capture_output=True, text=True) print(f"nightly: disk / →\n{df.stdout.strip()}", flush=True) print("\n===== nightly sweep summary =====", flush=True) for r, rc in results.items(): print(f" {r}: {'PASS' if rc == 0 else 'FAIL'}", flush=True) return 0 # the sweep itself succeeds; per-recipe reds are reported, not fatal def main() -> int: if _another_run_active(): print("nightly: a run_recipe_ci.py is active — skipping this nightly (defer)", flush=True) return 0 roll_warm_infra() return sweep() if __name__ == "__main__": raise SystemExit(main())