diff --git a/runner/nightly_sweep.py b/runner/nightly_sweep.py index a1f6448..ecfbbc4 100644 --- a/runner/nightly_sweep.py +++ b/runner/nightly_sweep.py @@ -116,7 +116,17 @@ def sweep() -> int: continue print(f"sweep: {r} RUN — {reason}; cold-testing tagged release {latest}", flush=True) rc = run_on_tag(r, latest) - results[r] = "PASS (promoted)" if rc == 0 else "FAIL (canonical unchanged)" + # Trustworthy label (canon M2): promote_canonical is non-fatal, so rc==0 does NOT imply a + # canonical was written. Derive the result from whether the registry now records `latest`. + rec = canonical.read_registry(r) or {} + if rc != 0: + results[r] = "FAIL (red; canonical unchanged)" + elif rec.get("version") == latest: + results[r] = f"PASS (promoted {latest})" + else: + results[r] = ( + f"GREEN-BUT-PROMOTE-FAILED (canonical={rec.get('version') or 'none'}, expected {latest})" + ) print(f"sweep: {r} rc={rc} ({results[r]})", flush=True) # WC8 disk hygiene: drop warm data for de-enrolled canonicals; log the disk budget. pruned = canonical.prune_stale() diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index 745e9e8..0599dbd 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -921,37 +921,64 @@ def should_promote_canonical( return canonical.is_enrolled(recipe) and overall == 0 and not quick and not ref and tagged -def promote_canonical(recipe: str, head_ref: str | None, version: str | None) -> None: +def promote_canonical( + recipe: str, head_ref: str | None, version: str | None, repo_local: str | None = None +) -> None: """canon §2.A / WC5: (re)seed the canonical at the GREEN-VERIFIED TESTED RELEASE `version` — the exact version under test (head_version), which the should_promote tagged-gate guarantees is a - published release tag. Deploy `warm-` at that version (reattaching the retained canonical - volume if one exists — an in-place version bump — else a fresh install), wait healthy, undeploy, + published release tag. Deploy `warm-` at that version as a FAITHFUL install (the same + wiring the cold install used — deps + install_steps + overlay + secrets), wait healthy, undeploy, snapshot + record the registry (atomic replace of the last-known-good). - Promotes EXACTLY the tested version — it no longer re-derives `latest_version(recipe_tags)`, which - could differ from the version actually exercised by the run (e.g. a manual `RECIPE=` run whose - `main` checkout sits on a tag older than the newest published tag): the canonical must record the - version the tier suite proved green, not a never-tested newer tag. The OLD known-good is replaced - ONLY here, after green (never lost on a red run).""" - import warm_reconcile as wr - + The warm deploy must reproduce the cold install, not a bare `abra app deploy` (canon M2 finding): + - CLEAN the recipe tree first. The sweep's run_on_tag sets CCCI_SKIP_FETCH=1 so the cold run + stages the tag; by promote time that per-run tree was mutated by the tier suite (chaos head + checkout + the untracked compose.ccci.yml overlay), which makes `abra app new` FATA "locally + unstaged changes". A forced re-checkout of the tag + `git clean -fd` restores a pristine tree. + - PROVISION DEPS (OIDC realms) + run INSTALL_STEPS, exactly like the cold install. Without these, + recipes whose healthy state depends on them fail the warm deploy though the cold test was green + — e.g. bluesky-pds (install_steps inserts the non-generatable pds_plc_rotation_key), + custom-html-tiny (install_steps seeds index.html), and any DEPS recipe (OIDC env). + Promotes EXACTLY the tested version (never re-derives `latest_version`). The OLD known-good is + replaced ONLY here, after green (never lost on a red run).""" domain = canonical.canonical_domain(recipe) if not version: print(f"WC5 promote: no tested release version for {recipe} — skip", flush=True) return - wr.fetch_recipe(recipe) # ensure the release tag is present locally for the pinned checkout + warm_reconcile.fetch_recipe( + recipe + ) # no-op under CCCI_SKIP_FETCH; real fetch on the manual path meta = meta_mod.load(recipe) # The cold run's deploy-count was already asserted + the countfile removed; don't perturb it. os.environ.pop("CCCI_DEPLOY_COUNT_FILE", None) + # Pristine tree at the tag: discard the cold run's tier mutations + untracked overlay so the + # pinned `abra app new` clean-tree gate passes (deploy_app re-applies the overlay + auto-chaos). + abra.recipe_checkout(recipe, version) + subprocess.run( + ["git", "-C", abra.recipe_dir(recipe), "clean", "-fd"], capture_output=True, text=True + ) print( f"\n===== WC5 promote-on-green-cold: (re)seed canonical {recipe} @ {version} =====", flush=True, ) + # Faithful install wiring: deps (OIDC) then install_steps (via deploy_app's hook), same as cold. + declared = list(meta.DEPS) + if declared: + try: + _provision_deps(recipe, domain, None, declared) + print(f" WC5 promote: provisioned deps {declared} for warm {domain}", flush=True) + except Exception as e: # noqa: BLE001 — log; deploy may still come up for non-blocking deps + print( + f" WC5 promote: dep provisioning failed ({_scrub(str(e))}) — deploying anyway", + flush=True, + ) + hook = discovery.install_steps(recipe, repo_local) lifecycle.deploy_app( recipe, domain, version=version, secrets=True, + install_steps_hook=hook, deploy_timeout=int(meta.DEPLOY_TIMEOUT), meta=meta, ) @@ -962,6 +989,7 @@ def promote_canonical(recipe: str, head_ref: str | None, version: str | None) -> deploy_timeout=meta.DEPLOY_TIMEOUT, http_timeout=meta.HTTP_TIMEOUT, ) + lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.DEPLOY_TIMEOUT), op="install") abra.undeploy(domain) _wait_undeployed(domain) canonical.seed_canonical(recipe, version, commit=head_ref) @@ -1500,7 +1528,7 @@ def main() -> int: tagged = warm_reconcile.is_released_version(recipe, head_version) if should_promote_canonical(recipe, ref, overall, quick=False, tagged=tagged): try: - promote_canonical(recipe, head_ref, head_version) + promote_canonical(recipe, head_ref, head_version, repo_local) except Exception as e: # noqa: BLE001 — promote is a post-green bonus; never fail a green run print( f"!! WC5 promote failed (non-fatal; known-good unchanged): {_scrub(str(e))}",