Files
cc-ci/runner/nightly_sweep.py
autonomic-bot 9a7772563a style: repo-wide lint pass — make the lint gate green again
Push builds have been RED on the lint step since ~build 209 from accumulated
formatting drift. This is the mechanical cleanup: ruff format + ruff --fix
(UP038 isinstance unions, SIM105 contextlib.suppress, UP031 f-strings, SIM115
tempfile context manager), shfmt -i 2 -ci, nixpkgs-fmt/statix/deadnix (merged
attrsets, dropped unused lib args), yamllint, and shell quoting fixes in
tests/lasuite-docs/setup_custom_tests.sh. No behaviour changes intended;
lint: PASS, unit tests: 138 passed.
2026-06-09 21:56:15 +00:00

96 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""Nightly full-cold sweep (Phase 2w / WC6).
Invoked by the `nightly-sweep` systemd timer (nix/modules/nightly-sweep.nix). Order (plan WC6):
1. Roll warm/infra to latest, HEALTH-GATED (WC1.1): re-run the keycloak + traefik reconcilers
(warm_reconcile.py <app> — fetch latest recipe → deploy → health-gate → commit/rollback+alert).
This is the health-gated "warm/infra → latest" step; a full operator `nixos-rebuild switch` is
the config-deploy path, not the autonomous nightly's job (DECISIONS Phase-2w WC6).
2. FULL-COLD sweep across enrolled (WARM_CANONICAL) recipes, SERIAL (MAX_TESTS honored — one at a
time), each `RECIPE=<r> run_recipe_ci.py` on LATEST (no REF) → a green run promotes/refreshes
that recipe's canonical (WC5). Serves as the daily authoritative regression.
MUST NOT run while a test/Drone build is in flight: if a `run_recipe_ci.py` is already active, skip
this nightly (defer to the next) rather than pile on the single node. Bounded + serial. Exit 0 even
if some recipes fail (logs per-recipe results; a red recipe just doesn't advance its canonical).
"""
from __future__ import annotations
import os
import subprocess
import sys
# The sweep drives the recipe RUNS (run_recipe_ci) + reads enrollment (tests/<r>/recipe_meta.py),
# which live in the cc-ci CHECKOUT (the nix store packages only runner/, not tests/). So operate
# against $CCCI_REPO (default /root/cc-ci) — the same checkout run_recipe_ci already runs from.
REPO = os.environ.get("CCCI_REPO", "/root/cc-ci")
# Put <REPO>/runner at the front of the module search path. This must happen before the
# `harness` import below, which is why that import sits after executable code (noqa: E402).
sys.path.insert(0, os.path.join(REPO, "runner"))
from harness import canonical # noqa: E402
# Passed one at a time, in list order, to warm_reconcile.py by roll_warm_infra().
WARM_APPS = ["keycloak", "traefik"] # the live-warm/infra reconcilers to roll first (health-gated)
def _here() -> str:
    """Return the path of the ``runner`` directory inside the checkout at ``REPO``."""
    runner_dir = os.path.join(REPO, "runner")
    return runner_dir
def _another_run_active() -> bool:
    """Report whether some other process matching ``run_recipe_ci.py`` is running.

    Asks ``pgrep -f run_recipe_ci.py`` for matching PIDs and ignores this
    process's own PID. Used so the nightly does not pile onto the single node.

    Returns:
        True if at least one PID other than ours was printed, else False
        (including when ``pgrep`` prints nothing).
    """
    own_pid = str(os.getpid())
    probe = subprocess.run(
        ["pgrep", "-f", "run_recipe_ci.py"], capture_output=True, text=True
    )
    # str.split() with no separator never yields empty tokens, so a plain
    # inequality test against our own PID is enough.
    return any(pid != own_pid for pid in probe.stdout.split())
def roll_warm_infra() -> None:
    """Run ``warm_reconcile.py`` once for each app in ``WARM_APPS``, in order.

    Each reconciler is launched with the current interpreter and waited on
    before the next one starts. Its exit status is logged and otherwise
    ignored, so a failing reconcile does not stop the nightly here.
    """
    reconciler = os.path.join(_here(), "warm_reconcile.py")
    for app in WARM_APPS:
        print(f"\n===== nightly: roll warm/infra {app} (health-gated) =====", flush=True)
        finished = subprocess.run([sys.executable, reconciler, app])
        print(f"nightly: reconcile {app} rc={finished.returncode}", flush=True)
def sweep() -> int:
    """Run ``run_recipe_ci.py`` serially for every enrolled recipe, then report.

    For each recipe from ``canonical.enrolled_recipes()`` a child process is
    started with ``RECIPE`` set to that recipe and with ``REF``,
    ``CCCI_QUICK`` and ``MODE`` removed from its environment. Afterwards
    ``canonical.prune_stale()`` is called, ``df -h /`` is logged, and a
    PASS/FAIL line is printed per recipe.

    Returns:
        Always 0: per-recipe failures are reported in the log, not propagated.
    """
    enrolled = canonical.enrolled_recipes()
    print(f"\n===== nightly cold sweep: enrolled canonicals = {enrolled} =====", flush=True)

    runner_script = os.path.join(_here(), "run_recipe_ci.py")
    dropped_vars = ("REF", "CCCI_QUICK", "MODE")  # latest, not a PR head; no quick/mode override
    outcomes: dict[str, int] = {}
    for recipe in enrolled:
        print(f"\n===== nightly: full-cold {recipe} (latest) =====", flush=True)
        child_env = {
            key: value for key, value in os.environ.items() if key not in dropped_vars
        }
        child_env["RECIPE"] = recipe
        status = subprocess.run([sys.executable, runner_script], env=child_env).returncode
        outcomes[recipe] = status
        verdict = "green→canonical refreshed" if status == 0 else "red"
        print(f"nightly: {recipe} rc={status} ({verdict})", flush=True)

    # WC8 disk hygiene: drop warm data for de-enrolled canonicals; log the disk budget.
    stale = canonical.prune_stale()
    if stale:
        print(f"nightly: pruned stale warm data for de-enrolled canonicals: {stale}", flush=True)
    disk = subprocess.run(["df", "-h", "/"], capture_output=True, text=True)
    print(f"nightly: disk / →\n{disk.stdout.strip()}", flush=True)

    print("\n===== nightly sweep summary =====", flush=True)
    for recipe, status in outcomes.items():
        label = "PASS" if status == 0 else "FAIL"
        print(f" {recipe}: {label}", flush=True)

    # The sweep itself succeeds; red recipes are visible in the summary above.
    return 0
def main() -> int:
    """Entry point: skip if a recipe run is active, else roll warm apps and sweep.

    Returns:
        0 when the nightly is skipped; otherwise whatever ``sweep()`` returns.
    """
    if not _another_run_active():
        roll_warm_infra()
        return sweep()

    # Something is already using the node: defer to the next scheduled nightly.
    print("nightly: a run_recipe_ci.py is active — skipping this nightly (defer)", flush=True)
    return 0
# Script entry: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main())