All checks were successful
continuous-integration/drone/push Build is passing
M2 finding (Adversary-flagged): promote_canonical did a bare `abra app deploy` that lacked the cold install's wiring, so recipes that passed the cold test still failed to promote: - ghost: `abra app new` FATA 'locally unstaged changes' — the CCCI_SKIP_FETCH per-run tree was left dirty by the tier suite. Fix: force re-checkout the tag + `git clean -fd` before deploy. - bluesky-pds: missing pds_plc_rotation_key (install_steps inserts it, #generate=false). - custom-html-tiny: 404 (install_steps seeds index.html). Fix: run install_steps_hook in promote. - OIDC recipes would miss their realm. Fix: provision DEPS in promote like the cold install. promote_canonical now: clean tree → provision deps → deploy_app with install_steps_hook + overlay + ready-probes, then snapshot. Also: sweep result label now derives from whether the canonical was actually written (promote is non-fatal; rc==0 did not imply promoted) — fixes the misleading 'PASS (promoted)'. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
153 lines
7.5 KiB
Python
153 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Weekly canonical sweep (Phase 2w / WC6 + phase canon).
|
|
|
|
Invoked by the `nightly-sweep` systemd timer (nix/modules/nightly-sweep.nix), weekly. Order:
|
|
1. Roll warm/infra to latest, HEALTH-GATED (WC1.1): re-run the keycloak + traefik reconcilers
|
|
(warm_reconcile.py <app> — fetch latest recipe → deploy → health-gate → commit/rollback+alert).
|
|
This is the health-gated "warm/infra → latest" step; a full operator `nixos-rebuild switch` is
|
|
the config-deploy path, not the autonomous sweep's job (DECISIONS Phase-2w WC6).
|
|
2. Per ENROLLED (WARM_CANONICAL) recipe, SERIAL (one at a time):
|
|
(C) faithfully mirror-sync the recipe to coopcloud upstream (main+tags, close merged-upstream
|
|
PRs) via scripts/recipe-mirror-sync.sh — so the sweep measures TRUE upstream tags/latest.
|
|
(D) NEW-RELEASE-TAG trigger (canon §2.D): compare the latest release tag to the recipe's
|
|
canonical version (NOT commit). No new tag → SKIP (even if `main` has new untagged commits).
|
|
New tag → cold-test that TAGGED version (run_on_tag) and, on green, promote the canonical to
|
|
it (run_recipe_ci.promote_canonical, gated on green+cold+latest+enrolled+TAGGED, canon §2.A).
|
|
Run-twice determinism (canon M2): a second immediate sweep finds latest tag == canonical for every
|
|
recipe → SKIPs all (clean no-op, no CI rerun).
|
|
|
|
MUST NOT run while a test/Drone build is in flight: if a `run_recipe_ci.py` is already active, skip
|
|
this sweep (defer) rather than pile on the single node. Bounded + serial. Exit 0 even if some recipes
|
|
fail (logs per-recipe results; a red recipe just doesn't advance its canonical). NO AI at runtime —
|
|
pure script + systemd timer.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
|
|
# The sweep drives the recipe RUNS (run_recipe_ci) + reads enrollment (tests/<r>/recipe_meta.py),
|
|
# which live in the cc-ci CHECKOUT (the nix store packages only runner/, not tests/). So operate
|
|
# against $CCCI_REPO (default /root/cc-ci) — the same checkout run_recipe_ci already runs from.
|
|
REPO = os.environ.get("CCCI_REPO", "/root/cc-ci")
|
|
sys.path.insert(0, os.path.join(REPO, "runner"))
|
|
import warm_reconcile as wr # noqa: E402
|
|
from harness import abra, canonical # noqa: E402
|
|
|
|
WARM_APPS = ["keycloak", "traefik"] # the live-warm/infra reconcilers to roll first (health-gated)
|
|
MIRROR_SYNC = os.path.join(REPO, "scripts", "recipe-mirror-sync.sh")
|
|
|
|
|
|
def _here() -> str:
    """Return the path of the `runner` directory inside the cc-ci checkout (REPO)."""
    runner_dir = os.path.join(REPO, "runner")
    return runner_dir
|
|
|
|
|
|
def _another_run_active() -> bool:
    """Report whether some other process matching `run_recipe_ci.py` is currently running.

    Asks `pgrep -f` for every PID whose command line matches `run_recipe_ci.py` and ignores
    this process's own PID, so the sweep can defer instead of piling onto the single node.
    """
    own_pid = str(os.getpid())
    listing = subprocess.run(
        ["pgrep", "-f", "run_recipe_ci.py"], capture_output=True, text=True
    )
    # str.split() with no separator never yields empty tokens, so every token is a PID.
    return any(pid != own_pid for pid in listing.stdout.split())
|
|
|
|
|
|
def roll_warm_infra() -> None:
    """Run warm_reconcile.py once per WARM_APPS entry so keycloak + traefik roll to latest (WC1.1).

    Each reconciler runs as a child of the current interpreter; its return code is only
    logged, never acted upon here.
    """
    reconciler = os.path.join(_here(), "warm_reconcile.py")
    for app in WARM_APPS:
        print(f"\n===== nightly: roll warm/infra {app} (health-gated) =====", flush=True)
        finished = subprocess.run([sys.executable, reconciler, app])
        print(f"nightly: reconcile {app} rc={finished.returncode}", flush=True)
|
|
|
|
|
|
def mirror_sync(recipe: str) -> int:
    """canon §2.C: reconcile the recipe MIRROR to coopcloud upstream via the mirror-sync script.

    Best-effort: a missing script is logged and reported as success (0); a non-zero script
    exit is logged as non-fatal and handed back to the caller unchanged.

    Returns the script's return code, or 0 when the script does not exist.
    """
    if os.path.isfile(MIRROR_SYNC):
        status = subprocess.run(["bash", MIRROR_SYNC, recipe]).returncode
        if status:
            print(f"sweep: mirror-sync {recipe} rc={status} (non-fatal — continuing)", flush=True)
        return status

    print(
        f"sweep: mirror-sync script missing ({MIRROR_SYNC}) — skipping sync for {recipe}",
        flush=True,
    )
    return 0
|
|
|
|
|
|
def run_on_tag(recipe: str, tag: str) -> int:
    """Cold-test `recipe` at the published RELEASE TAG `tag` (canon §2.D) and return the CI rc.

    The tag is checked out in the recipe clone first, then run_recipe_ci.py is launched with
    CCCI_SKIP_FETCH=1 so the head under test IS that tag checkout. REF, CCCI_QUICK, MODE and
    VERSION are removed from the child's environment: cold (no PR head), full mode, head = the
    staged tag checkout. Per the sweep's design a green run lets run_recipe_ci promote the
    canonical to the tagged version.
    """
    abra.recipe_checkout(recipe, tag)

    dropped = ("REF", "CCCI_QUICK", "MODE", "VERSION")
    child_env = {key: value for key, value in os.environ.items() if key not in dropped}
    child_env["RECIPE"] = recipe
    child_env["CCCI_SKIP_FETCH"] = "1"

    ci_script = os.path.join(_here(), "run_recipe_ci.py")
    completed = subprocess.run([sys.executable, ci_script], env=child_env)
    return completed.returncode
|
|
|
|
|
|
def _sweep_recipe(recipe: str) -> str:
    """Mirror-sync, evaluate the new-release-tag trigger and, if triggered, cold-test one recipe.

    Returns the status label that the sweep summary prints for this recipe.
    """
    # C. faithful mirror-sync to upstream (best-effort) so we measure true upstream tags/latest.
    mirror_sync(recipe)
    # Ensure the local recipe clone reflects upstream tags for the trigger computation.
    try:
        wr.fetch_recipe(recipe)
    except Exception as e:  # noqa: BLE001 — a fetch failure is logged; trigger uses what's local
        print(f"sweep: {recipe} fetch_recipe failed (non-fatal): {e}", flush=True)

    # D. new-release-tag trigger: latest release tag vs canonical version (NOT commit).
    latest = wr.latest_version(wr.recipe_tags(recipe))
    canon = (canonical.read_registry(recipe) or {}).get("version")
    action, reason = wr.sweep_decision(latest, canon)
    if action == "skip":
        print(f"sweep: {recipe} SKIP — {reason}", flush=True)
        return f"SKIP ({reason})"

    print(f"sweep: {recipe} RUN — {reason}; cold-testing tagged release {latest}", flush=True)
    rc = run_on_tag(recipe, latest)
    # Trustworthy label (canon M2): promote_canonical is non-fatal, so rc==0 does NOT imply a
    # canonical was written. Derive the result from whether the registry now records `latest`.
    rec = canonical.read_registry(recipe) or {}
    if rc != 0:
        status = "FAIL (red; canonical unchanged)"
    elif rec.get("version") == latest:
        status = f"PASS (promoted {latest})"
    else:
        status = (
            f"GREEN-BUT-PROMOTE-FAILED (canonical={rec.get('version') or 'none'}, expected {latest})"
        )
    print(f"sweep: {recipe} rc={rc} ({status})", flush=True)
    return status


def sweep() -> int:
    """Sweep every enrolled recipe serially, then prune stale warm data and print a summary.

    Each recipe is processed in isolation: an unexpected exception while handling one recipe
    (tag listing, tag checkout, registry read, ...) is logged and recorded as an ERROR entry
    instead of aborting the sweep, so the remaining recipes, the disk-hygiene step and the
    summary still run.

    Returns 0 always — the sweep itself succeeds; per-recipe reds are reported, not fatal.
    """
    recipes = canonical.enrolled_recipes()
    print(f"\n===== weekly canonical sweep: enrolled = {recipes} =====", flush=True)
    results: dict[str, str] = {}
    for r in recipes:
        print(f"\n===== sweep: {r} =====", flush=True)
        try:
            results[r] = _sweep_recipe(r)
        except Exception as e:  # noqa: BLE001 — one broken recipe must not stop the others
            results[r] = f"ERROR ({type(e).__name__}: {e})"
            print(f"sweep: {r} aborted by unexpected error (non-fatal): {e!r}", flush=True)

    # WC8 disk hygiene: drop warm data for de-enrolled canonicals; log the disk budget.
    pruned = canonical.prune_stale()
    if pruned:
        print(f"sweep: pruned stale warm data for de-enrolled canonicals: {pruned}", flush=True)
    df = subprocess.run(["df", "-h", "/"], capture_output=True, text=True)
    print(f"sweep: disk / →\n{df.stdout.strip()}", flush=True)

    print("\n===== weekly sweep summary =====", flush=True)
    for r, status in results.items():
        print(f" {r}: {status}", flush=True)
    return 0  # the sweep itself succeeds; per-recipe reds are reported, not fatal
|
|
|
|
|
|
def main() -> int:
    """Entry point: defer if a run_recipe_ci.py is active, else roll warm/infra and sweep.

    Returns 0 when deferring; otherwise returns whatever sweep() returns.
    """
    if not _another_run_active():
        roll_warm_infra()
        return sweep()

    print("nightly: a run_recipe_ci.py is active — skipping this nightly (defer)", flush=True)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
|