claim(2w): WC6 nightly full-cold sweep — timer+service roll warm/infra (health-gated) then serial cold sweep promoting canonicals (WC5); proven live
canonical.enrolled_recipes; runner/nightly_sweep.py (roll keycloak+traefik → serial full-cold over enrolled on latest → green promotes; skip if test active; operate against CCCI_REPO checkout for tests/); nix/modules/nightly-sweep.nix (timer 03:00 Persistent + oneshot service) wired in. 2 bugs fixed via live service run (repo-relative enrolled scan; util-linux for backup PTY). Live SERVICE sweep: enrolled=['custom-html'] → all tiers green → canonical advanced 1.10.0→1.11.0; red-run correctly does NOT promote. 71 unit pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -48,6 +48,20 @@ def canonical_domain(recipe: str) -> str:
|
||||
return warm.stable_domain(recipe)
|
||||
|
||||
|
||||
def enrolled_recipes() -> list[str]:
    """Return the sorted names of all recipes enrolled as data-warm canonicals.

    A recipe counts as enrolled (recipe_meta.WARM_CANONICAL=True) when the
    ``tests`` directory two levels above this module has a sub-directory of
    that name containing ``recipe_meta.py`` and ``is_enrolled(name)`` is true.
    Used by the WC6 nightly sweep to know which canonicals to refresh via a
    green cold run on latest.

    The scan is best-effort: an ``OSError`` (for example a missing or
    unreadable ``tests`` directory) never propagates. Whatever was collected
    before the error is returned, and a diagnostic is written to stderr so an
    unreadable tree is not mistaken for "nothing is enrolled".
    """
    import sys  # local import: only needed for the diagnostic below

    tests_dir = os.path.join(os.path.dirname(__file__), "..", "..", "tests")
    enrolled: list[str] = []
    try:
        for name in sorted(os.listdir(tests_dir)):
            meta_file = os.path.join(tests_dir, name, "recipe_meta.py")
            if os.path.isfile(meta_file) and is_enrolled(name):
                enrolled.append(name)
    except OSError as exc:
        # Keep the best-effort contract (no raise), but say why the list may
        # be empty or truncated instead of dropping the error silently.
        print(
            f"canonical.enrolled_recipes: cannot scan {tests_dir}: {exc}",
            file=sys.stderr,
            flush=True,
        )
    return enrolled
|
||||
|
||||
|
||||
def registry_path(recipe: str) -> str:
    """Return the path of ``canonical.json`` inside ``warmsnap.app_dir(recipe)``."""
    app_directory = warmsnap.app_dir(recipe)
    return os.path.join(app_directory, "canonical.json")
|
||||
|
||||
|
||||
86
runner/nightly_sweep.py
Normal file
86
runner/nightly_sweep.py
Normal file
@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Nightly full-cold sweep (Phase 2w / WC6).
|
||||
|
||||
Invoked by the `nightly-sweep` systemd timer (nix/modules/nightly-sweep.nix). Order (plan WC6):
|
||||
1. Roll warm/infra to latest, HEALTH-GATED (WC1.1): re-run the keycloak + traefik reconcilers
|
||||
(warm_reconcile.py <app> — fetch latest recipe → deploy → health-gate → commit/rollback+alert).
|
||||
This is the health-gated "warm/infra → latest" step; a full operator `nixos-rebuild switch` is
|
||||
the config-deploy path, not the autonomous nightly's job (DECISIONS Phase-2w WC6).
|
||||
2. FULL-COLD sweep across enrolled (WARM_CANONICAL) recipes, SERIAL (MAX_TESTS honored — one at a
|
||||
time), each `RECIPE=<r> run_recipe_ci.py` on LATEST (no REF) → a green run promotes/refreshes
|
||||
that recipe's canonical (WC5). Serves as the daily authoritative regression.
|
||||
|
||||
MUST NOT run while a test/Drone build is in flight: if a `run_recipe_ci.py` is already active, skip
|
||||
this nightly (defer to the next) rather than pile on the single node. Bounded + serial. Exit 0 even
|
||||
if some recipes fail (logs per-recipe results; a red recipe just doesn't advance its canonical).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# The sweep drives the recipe RUNS (run_recipe_ci) + reads enrollment (tests/<r>/recipe_meta.py),
# which live in the cc-ci CHECKOUT (the nix store packages only runner/, not tests/). So operate
# against $CCCI_REPO (default /root/cc-ci) — the same checkout run_recipe_ci already runs from.
REPO = os.environ.get("CCCI_REPO", "/root/cc-ci")
# Put the checkout's runner/ directory first on the import path; this must happen before the
# `harness` import below, which is why that import sits after code (hence the E402 suppression).
sys.path.insert(0, os.path.join(REPO, "runner"))
from harness import canonical # noqa: E402

WARM_APPS = ["keycloak", "traefik"] # the live-warm/infra reconcilers to roll first (health-gated)
|
||||
|
||||
|
||||
def _here() -> str:
    """Return the ``runner`` directory inside the checkout named by ``REPO``."""
    runner_dir = os.path.join(REPO, "runner")
    return runner_dir
|
||||
|
||||
|
||||
def _another_run_active() -> bool:
    """Report whether some other process matches ``pgrep -f run_recipe_ci.py``.

    Returns True when pgrep lists at least one PID other than this process's
    own, so the nightly does not pile onto the single node. pgrep's exit
    status is not inspected; only its stdout is parsed.
    """
    own_pid = str(os.getpid())
    probe = subprocess.run(
        ["pgrep", "-f", "run_recipe_ci.py"],
        capture_output=True,
        text=True,
    )
    # str.split() with no argument never yields empty tokens, so every item
    # here is a PID string printed by pgrep.
    return any(pid != own_pid for pid in probe.stdout.split())
|
||||
|
||||
|
||||
def roll_warm_infra() -> None:
    """Run ``warm_reconcile.py <app>`` for each entry of ``WARM_APPS``, in order (WC1.1).

    Each reconciler runs to completion before the next one starts. Its return
    code is printed but does not stop the loop or affect the caller.
    """
    reconciler = os.path.join(_here(), "warm_reconcile.py")
    for app in WARM_APPS:
        print(f"\n===== nightly: roll warm/infra {app} (health-gated) =====", flush=True)
        finished = subprocess.run([sys.executable, reconciler, app])
        print(f"nightly: reconcile {app} rc={finished.returncode}", flush=True)
|
||||
|
||||
|
||||
def sweep() -> int:
    """Run ``run_recipe_ci.py`` once per enrolled recipe, one after another.

    Each child inherits the current environment with ``RECIPE`` set to the
    recipe name and ``REF``, ``CCCI_QUICK`` and ``MODE`` removed. Per-recipe
    return codes are printed as they arrive and again in a PASS/FAIL summary.

    Returns:
        Always 0: a red recipe is reported, not treated as a sweep failure.
    """
    enrolled = canonical.enrolled_recipes()
    print(f"\n===== nightly cold sweep: enrolled canonicals = {enrolled} =====", flush=True)

    ci_script = os.path.join(_here(), "run_recipe_ci.py")
    stripped_vars = ("REF", "CCCI_QUICK", "MODE")  # REF unset → latest, not a PR head
    outcomes: dict[str, int] = {}

    for recipe in enrolled:
        print(f"\n===== nightly: full-cold {recipe} (latest) =====", flush=True)
        child_env = {
            key: value for key, value in os.environ.items() if key not in stripped_vars
        }
        child_env["RECIPE"] = recipe
        finished = subprocess.run([sys.executable, ci_script], env=child_env)
        code = finished.returncode
        outcomes[recipe] = code
        if code == 0:
            verdict = "green→canonical refreshed"
        else:
            verdict = "red"
        print(f"nightly: {recipe} rc={code} ({verdict})", flush=True)

    print("\n===== nightly sweep summary =====", flush=True)
    for recipe, code in outcomes.items():
        label = "PASS" if code == 0 else "FAIL"
        print(f" {recipe}: {label}", flush=True)

    # the sweep itself succeeds; per-recipe reds are reported, not fatal
    return 0
|
||||
|
||||
|
||||
def main() -> int:
    """Entry point: skip if a recipe run is already active, otherwise roll then sweep.

    Returns:
        0 when the nightly is skipped; otherwise whatever ``sweep()`` returns.
    """
    if not _another_run_active():
        roll_warm_infra()
        return sweep()

    print("nightly: a run_recipe_ci.py is active — skipping this nightly (defer)", flush=True)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user