From bb2e3c6b2cd2e72fe9f015293b355d3f1ba34773 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 17 Jun 2026 00:14:53 +0000 Subject: [PATCH] =?UTF-8?q?feat(prevb):=20dynamic=20upgrade=20base=20(last?= =?UTF-8?q?-green=E2=86=92main=E2=86=92skip)=20+=20per-recipe=20previous/?= =?UTF-8?q?=20overlay;=20migrate=20discourse=20off=20static=20base=20+=20l?= =?UTF-8?q?eaky=20overlay?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - resolve_upgrade_base: BasePlan(kind=version|ref|skip); last-green (warm canonical) primary, main-tip fallback, declared skip else. UPGRADE_BASE_VERSION retained as optional override. - deploy_app: base_ref path (chaos-deploy a main-tip/last-green commit) + apply_previous wiring. - lifecycle: previous/ surface (has_previous, previous_target_version, previous_status decision, provide/remove overlay, compose_file add/remove, recipe_branch_commit, stack_service_names). - generic.perform_upgrade: strip previous/ overlay + COMPOSE_FILE entry before head redeploy. - discourse: compose.ccci.yml now environmental-only (order: stop-first); removed bitnamilegacy pins + sidekiq + UPGRADE_BASE_VERSION; test_upgrade.py asserts head image == official 3.5.3 + no sidekiq. - unit tests: resolve_upgrade_base matrix + previous/ apply/skip/stale + COMPOSE_FILE layering. --- runner/harness/generic.py | 10 ++ runner/harness/lifecycle.py | 185 ++++++++++++++++++++++++++++++- runner/run_recipe_ci.py | 104 ++++++++++++----- tests/discourse/compose.ccci.yml | 68 ++++-------- tests/discourse/recipe_meta.py | 26 ++--- tests/discourse/test_upgrade.py | 40 +++++++ tests/unit/test_previous.py | 118 ++++++++++++++++++++ tests/unit/test_upgrade_base.py | 118 ++++++++++++-------- 8 files changed, 532 insertions(+), 137 deletions(-) create mode 100644 tests/discourse/test_upgrade.py create mode 100644 tests/unit/test_previous.py diff --git a/runner/harness/generic.py b/runner/harness/generic.py index 3ee70c3..700ae5c 100644 --- a/runner/harness/generic.py +++ b/runner/harness/generic.py @@ -251,6 +251,16 @@ def perform_upgrade( before = lifecycle.deployed_identity(domain) if head_ref: lifecycle.recipe_checkout_ref(recipe, head_ref) + # Phase prevb: strip the base-only `previous/` overlay before the head redeploy so the PR head + # runs UNMODIFIED (the version-specific repair must never leak onto the head). Delete the copied + # compose.previous.yml and drop it from COMPOSE_FILE in the app .env. No-op when no previous/ was + # applied (the file/entry are absent). The environmental compose.ccci.yml stays (all deploys). + lifecycle.remove_previous_overlay(recipe) + cf_now = abra.env_get(domain, "COMPOSE_FILE") + if cf_now and lifecycle.PREVIOUS_COMPOSE in cf_now: + cf_stripped = lifecycle.compose_file_remove(cf_now, lifecycle.PREVIOUS_COMPOSE) + abra.env_set(domain, "COMPOSE_FILE", cf_stripped) + print(f" previous-overlay: COMPOSE_FILE for head redeploy = {cf_stripped}", flush=True) # UPGRADE_EXTRA_ENV (F2-14c): a recipe may need different app .env for the upgrade-TARGET deploy # than for the base — e.g. mumble's `compose.host-ports.yml` overlay exists ONLY in the newer # (target) version, so the base deploys minimally WITHOUT it and the upgrade adds it to COMPOSE_FILE diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index c131a6c..e063e5e 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -154,6 +154,145 @@ def provide_ccci_overlay(recipe: str) -> None: ) +# --------------------------------------------------------------------------------------------- +# Phase prevb: dynamic upgrade base + per-recipe `previous/` overlay. +# +# `previous/` holds the MINIMAL config needed to deploy the *previous (last-green) version* when it +# can't deploy as-published (e.g. an image relocation). It is applied ONLY to the base deploy and +# ONLY when the resolved base is that exact published version; NEVER to the PR head; on a main-tip +# base or version mismatch it is skipped + flagged stale. Mechanism mirrors the environmental +# compose.ccci.yml overlay (copied untracked into the checkout, referenced via COMPOSE_FILE), but is +# stripped before the head redeploy so the head runs unmodified. (plan-phase-prevb §2.) +# --------------------------------------------------------------------------------------------- + +PREVIOUS_COMPOSE = "compose.previous.yml" + + +def previous_dir(recipe: str) -> str: + return os.path.join(meta_mod.TESTS_DIR, recipe, "previous") + + +def has_previous(recipe: str) -> bool: + """True iff the recipe ships a `tests//previous/` folder with a compose.previous.yml.""" + return os.path.isfile(os.path.join(previous_dir(recipe), PREVIOUS_COMPOSE)) + + +def previous_target_version(recipe: str) -> str | None: + """The published version the recipe's `previous/` folder targets — declared in a one-line + `previous/VERSION` marker (first non-blank, non-`#` line). None if no marker. The harness applies + `previous/` ONLY when the resolved base equals this; otherwise the folder is stale and skipped.""" + marker = os.path.join(previous_dir(recipe), "VERSION") + try: + with open(marker) as f: + for line in f: + s = line.strip() + if s and not s.startswith("#"): + return s + except OSError: + return None + return None + + +def previous_status(recipe: str, base_kind: str, base_version: str | None) -> dict: + """Decide whether `previous/` applies to this base, as a pure decision (unit-tested). + + Returns {apply, stale, reason}: + - no `previous/` folder → apply=False, stale=False (nothing to do). + - base is not a pinned published → apply=False, stale=True (main-tip / no base): previous/ can + version (kind != "version") only repair a published base; flag for review. + - no `previous/VERSION` marker → apply=False, stale=True (undeclared: cannot version-guard). + - marker != resolved base version → apply=False, stale=True (stale: targets X, base is Y). + - marker == resolved base version → apply=True, stale=False. + """ + if not has_previous(recipe): + return {"apply": False, "stale": False, "reason": ""} + if base_kind != "version" or not base_version: + return { + "apply": False, + "stale": True, + "reason": ( + f"previous/ present but the resolved base is not a pinned published version " + f"(base_kind={base_kind}) — previous/ only repairs a published base; not applied" + ), + } + target = previous_target_version(recipe) + if not target: + return { + "apply": False, + "stale": True, + "reason": "previous/ has no VERSION marker — cannot version-guard; not applied", + } + if target != base_version: + return { + "apply": False, + "stale": True, + "reason": f"previous/ targets {target}, base is {base_version} — stale, remove it", + } + return {"apply": True, "stale": False, "reason": ""} + + +def provide_previous_overlay(recipe: str) -> None: + """Copy `tests//previous/compose.previous.yml` into THIS run's recipe checkout so a + COMPOSE_FILE reference to it resolves (base deploy only). No-op if absent.""" + src = os.path.join(previous_dir(recipe), PREVIOUS_COMPOSE) + if not os.path.isfile(src): + return + dest_dir = abra.recipe_dir(recipe) + if not os.path.isdir(dest_dir): + raise RuntimeError(f"recipe checkout missing for {recipe}: {dest_dir}") + shutil.copy(src, os.path.join(dest_dir, PREVIOUS_COMPOSE)) + print( + f" previous-overlay: provided {PREVIOUS_COMPOSE} to the {recipe} base checkout " + "(base-only; stripped before the head redeploy)", + flush=True, + ) + + +def remove_previous_overlay(recipe: str) -> None: + """Delete compose.previous.yml from this run's recipe checkout (called before the head redeploy so + the PR head NEVER sees the previous-version repair). No-op if absent.""" + p = os.path.join(abra.recipe_dir(recipe), PREVIOUS_COMPOSE) + with contextlib.suppress(OSError): + if os.path.isfile(p): + os.remove(p) + print( + f" previous-overlay: removed {PREVIOUS_COMPOSE} from the head checkout", flush=True + ) + + +def compose_file_add(compose_file: str, overlay: str) -> str: + """Append `overlay` to a ':'-separated COMPOSE_FILE value if absent (pure). Defaults a missing + value to `compose.yml` first, matching abra.""" + parts = [p for p in (compose_file or "").split(":") if p] or ["compose.yml"] + if overlay not in parts: + parts.append(overlay) + return ":".join(parts) + + +def compose_file_remove(compose_file: str, overlay: str) -> str: + """Remove `overlay` from a ':'-separated COMPOSE_FILE value (pure). Keeps order; defaults a now- + empty value to `compose.yml`.""" + parts = [p for p in (compose_file or "").split(":") if p and p != overlay] + return ":".join(parts or ["compose.yml"]) + + +def recipe_branch_commit(recipe: str, branch: str = "main") -> str | None: + """Resolve the recipe repo's target-branch tip (the predecessor the PR merges onto) to a commit + SHA, for the dynamic upgrade-base main-tip fallback. The per-run tree is a full clone of the + mirror, so `origin/` is present. Tries origin/, then origin/master. None if + neither resolves (new recipe / detached state).""" + path = abra.recipe_dir(recipe) + for ref in (f"origin/{branch}", "origin/master"): + proc = subprocess.run( + ["git", "-C", path, "rev-parse", "--verify", "--quiet", ref], + capture_output=True, + text=True, + ) + if proc.returncode == 0 and proc.stdout.strip(): + return proc.stdout.strip() + return None + + def _run_install_steps(hook: tuple[str, str], recipe: str, domain: str) -> None: """Run a recipe's custom install-steps hook (install_steps.sh) during the install tier — after `abra app new` + env defaults + secret generate, before deploy (Phase 1d DG5). The hook gets the @@ -234,6 +373,8 @@ def deploy_app( recipe: str, domain: str, version: str | None = None, + base_ref: str | None = None, + apply_previous: bool = False, secrets: bool = True, install_steps_hook: tuple[str, str] | None = None, deploy_timeout: int = 900, @@ -273,7 +414,18 @@ def deploy_app( # Adversary F1d-2). Chaos is correct ONLY for the version=None case (deploy the current PR-head # checkout). Order matters: checkout before secret_generate (-C) so secrets match the pinned tree. chaos = version is None - if version: + if base_ref: + # Dynamic upgrade base = target-branch (main) tip, or a last-green commit (phase prevb): an + # arbitrary git ref, not a published tag. Check it out and deploy via chaos — same mechanism + # as the head deploy (a non-tag ref would FATA abra's pinned-deploy lint/clean-tree gate). + recipe_checkout_ref(recipe, base_ref) + chaos = True + print( + f" deploy_app({recipe}): base = main-tip/ref {base_ref[:12]} → chaos deploy of the " + "checked-out ref (the PR's true predecessor; not a published pin)", + flush=True, + ) + elif version: abra.recipe_checkout(recipe, version) # A pinned (non-chaos) deploy runs `abra recipe lint`, which FATAs R014 ('only annotated # tags') if the upstream recipe ships a stray lightweight version tag (e.g. lasuite-meet's @@ -309,8 +461,19 @@ def deploy_app( # it ourselves is recipe-agnostic and canonical (the run domain IS the app's domain). abra.env_set(domain, "DOMAIN", domain) abra.env_set(domain, "LETS_ENCRYPT_ENV", "") - for k, v in meta_mod.extra_env(meta, meta_mod.hook_ctx(domain, meta)).items(): + extra = meta_mod.extra_env(meta, meta_mod.hook_ctx(domain, meta)) + for k, v in extra.items(): abra.env_set(domain, k, v) + # Per-recipe `previous/` overlay (phase prevb): version-specific repair to deploy the *previous + # (last-green) version*, applied to the BASE deploy ONLY (the caller resolved apply=True only when + # the resolved base equals previous/VERSION). Appended on top of the recipe's COMPOSE_FILE (which + # may already include the environmental compose.ccci.yml). Stripped before the head redeploy + # (generic.perform_upgrade) so the PR head runs unmodified. + if apply_previous: + cf = compose_file_add(extra.get("COMPOSE_FILE", "compose.yml"), PREVIOUS_COMPOSE) + abra.env_set(domain, "COMPOSE_FILE", cf) + provide_previous_overlay(recipe) + print(f" previous-overlay: COMPOSE_FILE for base deploy = {cf}", flush=True) if secrets: abra.secret_generate(domain) if install_steps_hook: @@ -332,6 +495,24 @@ def _stack_name(domain: str) -> str: return domain.replace(".", "_") +def stack_service_names(domain: str) -> list[str]: + """Short service names in this app's swarm stack (stack prefix stripped). Used by recipe overlay + assertions to prove a service was added/removed by an upgrade (e.g. discourse drops `sidekiq`).""" + stack = _stack_name(domain) + proc = subprocess.run( + ["docker", "stack", "services", stack, "--format", "{{.Name}}"], + capture_output=True, + text=True, + ) + names = [] + for ln in proc.stdout.split("\n"): + n = ln.strip() + if not n: + continue + names.append(n[len(stack) + 1 :] if n.startswith(stack + "_") else n) + return names + + def services_converged(domain: str) -> bool: """True when every service in the stack reports replicas N/N (N>0) AND no service is mid-rolling-update (swarm UpdateStatus settled).""" diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index e0cca2b..b37f57c 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -38,6 +38,7 @@ import subprocess import sys import tempfile import time +from typing import NamedTuple ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(ROOT, "runner")) @@ -88,36 +89,66 @@ def sso_dep_unverified(declared, deps_ready: bool, requires_deps_skipped: int) - return bool(declared) and not deps_ready and requires_deps_skipped > 0 -def upgrade_base(stages, meta, recipe: str) -> str | None: - """Deploy-once base version decision (pure given meta + the published-version lookup): - previous published version when the upgrade tier will run and one exists (so upgrade goes - previous→target in place), else None (the caller falls back to the target / PR head). - (DECISIONS.) +class BasePlan(NamedTuple): + """Resolved upgrade-base decision (phase prevb). `kind`: + - "version" → deploy a pinned published version (`version`): an explicit UPGRADE_BASE_VERSION + override, or last-green (warm-canonical) version. `previous/` may apply (version-guarded). + - "ref" → deploy the target-branch (main) tip at commit `ref` (chaos): the true predecessor + the PR merges onto, used when there is no last-green. `previous/` never applies to a ref base. + - "skip" → no upgrade base; the single deploy is the PR head and the upgrade tier records a + declared skip with `reason` (upgrade∉stages / EXPECTED_NA / new recipe / head==main tip).""" - A recipe may override the base via recipe_meta UPGRADE_BASE_VERSION when the harness default - (recipe_versions[-2]) is NOT the PR's true predecessor — e.g. a PR that adds a version ABOVE the - newest published tag, where the correct base is [-1] (the newest published), not [-2]. The - override must be an exact published version tag (deployed as a pinned base). (Adversary §7.1.) + kind: str + version: str | None + ref: str | None + reason: str - A recipe that declares the upgrade rung in EXPECTED_NA gets NO base: published versions may - exist yet be genuinely undeployable — e.g. bluesky-pds, where every published tag pins the - moving image tag `:0.4` that upstream republished with incompatible main builds, so no - published version can come up as an upgrade base (phase bsky, DECISIONS). Deploying one would - fail the INSTALL tier before the PR-head code is ever exercised. With no base, the single - deploy is the PR head itself and the upgrade tier records "skip", which derive_rungs - classifies as the DECLARED intentional skip (reason from EXPECTED_NA — visible in - results.json `skips.intentional`, never reported as a pass).""" + @property + def runs(self) -> bool: + return self.kind in ("version", "ref") + + +def resolve_upgrade_base(stages, meta, recipe: str, head_ref: str | None = None) -> BasePlan: + """Dynamic upgrade-base resolution (phase prevb, replaces the static `recipe_versions[-2]` + default). Order: explicit override → last-green (warm canonical) → target-branch (main) tip → + skip. EXPECTED_NA[upgrade] / upgrade∉stages short-circuit to a declared skip first. + + last-green is the PRIMARY base — the version cc-ci last recorded green for this recipe (the + warm-canonical registry record). main-tip is the FALLBACK: the recipe repo's `main` HEAD, the + real predecessor the PR merges on top of, used when there is no last-green. Else the tier is + skipped with a recorded reason (structural, declared — not a silent pass). + + `UPGRADE_BASE_VERSION` is RETAINED as an optional explicit override (wins when set) for the rare + PR-adds-a-version-above-the-newest-tag case; it is no longer the default (DECISIONS prevb).""" if "upgrade" not in stages: - return None - if "upgrade" in (meta.EXPECTED_NA or {}): + return BasePlan("skip", None, None, "upgrade tier not in requested stages") + declared = (meta.EXPECTED_NA or {}).get("upgrade") + if declared: print( - "== upgrade tier: declared EXPECTED_NA['upgrade'] — no upgrade base will be " - f"deployed; the single deploy is the target/PR head. Reason: " - f"{(meta.EXPECTED_NA or {}).get('upgrade')}", + f"== upgrade tier: declared EXPECTED_NA['upgrade'] — single deploy is the PR head. " + f"Reason: {declared}", flush=True, ) - return None - return meta.UPGRADE_BASE_VERSION or lifecycle.previous_version(recipe) + return BasePlan("skip", None, None, f"declared EXPECTED_NA[upgrade]: {declared}") + override = getattr(meta, "UPGRADE_BASE_VERSION", None) + if override: + return BasePlan("version", override, None, "explicit UPGRADE_BASE_VERSION override") + rec = canonical.read_registry(recipe) + if rec and rec.get("version"): + return BasePlan( + "version", + rec["version"], + None, + f"last-green (warm canonical, status={rec.get('status')})", + ) + main_tip = lifecycle.recipe_branch_commit(recipe, "main") + if main_tip and main_tip != head_ref: + return BasePlan("ref", None, main_tip, "target-branch (main) tip") + if main_tip and main_tip == head_ref: + return BasePlan("skip", None, None, "head == main tip (no predecessor delta)") + return BasePlan( + "skip", None, None, "no last-green and no main tip (new recipe / no predecessor)" + ) def _truthy(v: str | None) -> bool: @@ -952,8 +983,25 @@ def main() -> int: domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref) - prev = upgrade_base(stages, meta, recipe) - base = prev or target + base_plan = resolve_upgrade_base(stages, meta, recipe, head_ref=head_ref) + prev = base_plan.runs # gates the upgrade tier + # base deploy target: a pinned published version (kind=version) or main-tip commit (kind=ref); + # on skip fall back to the run's VERSION/head (target=None → chaos head deploy, as before). + base = base_plan.version or target + base_ref = base_plan.ref + prev_status = lifecycle.previous_status(recipe, base_plan.kind, base_plan.version) + print( + f"== upgrade base: kind={base_plan.kind} " + f"{('version=' + base) if base_plan.kind == 'version' else ''}" + f"{('ref=' + (base_ref or '')[:12]) if base_plan.kind == 'ref' else ''}" + f"{(' SKIP: ' + base_plan.reason) if base_plan.kind == 'skip' else ''} " + f"({base_plan.reason if base_plan.kind != 'skip' else ''})", + flush=True, + ) + if prev_status["stale"]: + print(f"!! previous/ STALE — {prev_status['reason']}", flush=True) + elif prev_status["apply"]: + print(f"== previous/ applies to the base deploy (targets {base})", flush=True) backup_cap = generic.backup_capable(recipe, meta) hook = discovery.install_steps(recipe, repo_local) @@ -1051,6 +1099,8 @@ def main() -> int: recipe, domain, version=base, + base_ref=base_ref, + apply_previous=prev_status["apply"], secrets=True, install_steps_hook=hook, deploy_timeout=int(meta.DEPLOY_TIMEOUT), @@ -1129,7 +1179,7 @@ def main() -> int: junit_dir=junit_dir, ) if prev - else "skip" # no upgrade base: single published version, or declared EXPECTED_NA + else "skip" # base_plan.kind == "skip": no predecessor / EXPECTED_NA / head==main ) # ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ---- if "backup" in stages: diff --git a/tests/discourse/compose.ccci.yml b/tests/discourse/compose.ccci.yml index f06774f..3a7e5d1 100644 --- a/tests/discourse/compose.ccci.yml +++ b/tests/discourse/compose.ccci.yml @@ -1,57 +1,29 @@ --- version: "3.8" -# cc-ci overlay (Phase 2 Q4.6) — minimal, single-purpose: make the UPGRADE-tier BASE deploy (the -# previous published discourse version) deployable so upgrade-to-latest can run. +# cc-ci ENVIRONMENTAL overlay (phase prevb) — applies to ALL deploys (base + PR head). Node-reality +# tweaks ONLY; NO version-specific image pins or service add/drop. (Version-specific repairs, when a +# previous base can't deploy as-published, live in tests//previous/ — base-only. discourse +# needs none: the dynamic base = last-green / main tip = bitnamilegacy/discourse:3.5.0 deploys clean.) # -# WHY THIS OVERLAY EXISTS (plan-ccci-compose-overlay-policy.md §1 "minimal justified fallback" + -# the §1 mandate that upgrade-to-latest must ALWAYS run): the harness base-deploys the from-version -# (UPGRADE_BASE_VERSION = 0.7.0+3.3.1), then `deploy --chaos` to the recipe-PR head. Two blockers on -# that published base, both resolved here, NEITHER weakening any test: -# 1. RE-PIN: every published discourse tag pins `bitnami/discourse:3.3.1` (and 0.6.3 → 3.1.2), -# but Docker Hub REMOVED the bitnami/discourse namespace (404). The recipe-PR (recipe-maintainers/ -# discourse#1) re-pins app+sidekiq to `bitnamilegacy/discourse:3.3.1` (the legit upstream -# relocation of the identical image). This overlay applies the SAME namespace-only re-pin to the -# BASE 0.7.0 (identical version 3.3.1, identical image content) so the from-version pulls — exactly -# the policy-blessed "minimal bitnami→bitnamilegacy re-pin overlay on the 0.7.0 from-version". -# 2. GRACE: discourse's Rails cold first boot (DB migrate + asset precompile) is 15-25min on cc-ci, -# exceeding the published 5m start_period → swarm kills the still-booting app. start_period CANNOT -# be an env var (abra validates the literal 'duration' BEFORE substitution → FATA; Adversary- -# reproduced, REVIEW-2 4b862f6), so we widen it to a literal 20m on the BASE. The PR head already -# ships 20m, so this overlay is idempotent on the head (it persists untracked across the checkout). -# Both changes are namespace/grace-only: identical image content, a healthy check still marks healthy -# immediately → NO assertion is weakened and no defect is masked. +# Fusing version-specific config into this all-deploys overlay was the prevb bug: the old overlay +# re-pinned app+sidekiq to bitnamilegacy/discourse:3.3.1 and re-added the sidekiq service on EVERY +# deploy, so the PR head (official discourse/discourse:3.5.3, sidekiq dropped) was silently reverted +# to the old image and its migration never tested. Removed entirely; only the environmental tweak below +# remains. # -# NOTE (prepull): the published recipe ships `sidekiq.depends_on: [discourse]` but the main service is -# named `app` (`discourse` is undefined), so `abra app config --images` returns invalid-compose (rc=15) -# and the harness prepull is SKIPPED. This overlay does NOT try to override depends_on — compose -# normalizes short-form depends_on to a map and map-merge is additive, so an override can't REMOVE the -# bad `discourse` key. Instead the 2.4GB `bitnamilegacy/discourse:3.3.1` image is kept warm in the node -# image cache, so the inline pull during deploy is a no-op and convergence isn't pull-bound. (swarm -# ignores depends_on, so the dangling ref has zero runtime effect — a recipe lint nit, not a defect.) -# -# 3. UPGRADE ROLLOUT (dstamp 2026-06-11, direct-evidence attribution in JOURNAL-dstamp): the -# published app service sets `deploy.update_config: { failure_action: rollback, order: -# start-first }`. On the upgrade chaos redeploy (base 0.7.0 → PR head), start-first runs the OLD -# and NEW precompile/Rails-heavy discourse tasks CO-RESIDENT (~2x memory); under host memory -# pressure the NEW task intermittently OOMs/fails swarm's update monitor → `failure_action: -# rollback` reverts the app service to its PREVIOUS spec, INCLUDING the -# `coop-cloud..chaos-version` label (head → base). Because start-first keeps the OLD task -# serving, wait_healthy still passes, and HC1 then reads the reverted BASE commit (eb96de9+U) and -# misreports it as 'the re-checkout failed' — the dstamp drift, reproduced solo (runs -# dstamp-repro1/4) with `.Spec.chaos-version=7ae7b0f7+U` (head applied) flipping to -# `.PreviousSpec=eb96de94+U` after the rollback. FIX: `order: stop-first` so the NEW task boots -# with the full host memory (no 2x co-residency) and genuinely becomes healthy → no spurious -# rollback. This is a CI deploy-rollout tweak only: the upgrade still really deploys + asserts the -# PR-head code under test, and `failure_action: rollback` is LEFT intact, so a genuinely broken -# head still rolls back and is caught (lifecycle.assert_upgrade_converged) — NO test is weakened. -# Trade-off: brief real downtime during the CI upgrade (covered by DEPLOY_TIMEOUT 3600). +# WHY (environmental — depends on the cc-ci node, must apply to the head too): the upgrade chaos +# redeploy (base bitnamilegacy:3.5.0 → PR-head official 3.5.3) runs the OLD and NEW Rails-heavy +# (DB-migrate + asset-precompile) tasks. The published recipe sets +# app.deploy.update_config.order: start-first, which keeps both co-resident (~2x memory); under the +# single CI node's memory pressure the NEW task intermittently OOMs swarm's update monitor → +# failure_action: rollback reverts the app spec (incl. the coop-cloud..chaos-version label) +# while the old task keeps serving, and the upgrade is misreported (dstamp finding, JOURNAL-dstamp). +# FIX: order: stop-first so the NEW task boots with the full host memory (no 2x co-residency) and +# genuinely becomes healthy. failure_action: rollback is LEFT intact → a genuinely broken head still +# rolls back and is caught (lifecycle.assert_upgrade_converged) — NO test/assertion is weakened. +# Trade-off: brief real downtime during the CI upgrade (covered by DEPLOY_TIMEOUT 3600). services: app: - image: bitnamilegacy/discourse:3.3.1 - healthcheck: - start_period: 20m deploy: update_config: order: stop-first - sidekiq: - image: bitnamilegacy/discourse:3.3.1 diff --git a/tests/discourse/recipe_meta.py b/tests/discourse/recipe_meta.py index 32b7c9f..3d3903b 100644 --- a/tests/discourse/recipe_meta.py +++ b/tests/discourse/recipe_meta.py @@ -21,20 +21,18 @@ HTTP_TIMEOUT = 1200 # still marks healthy immediately → fast hosts unaffected). Precedent: lasuite-drive collabora PR. # TIMEOUT (abra's internal convergence wait) is raised to outlast the boot. # -# UPGRADE-tier BASE (compose.ccci.yml + UPGRADE_BASE_VERSION): upgrade-to-latest must ALWAYS run -# (plan-ccci-compose-overlay-policy.md §1). The from-version is the latest published 0.7.0+3.3.1 -# (UPGRADE_BASE_VERSION below; the PR head is 0.7.0-based, so 0.7.0 is the true predecessor — not the -# default [-2]=0.6.3). The published 0.7.0 has TWO blockers, both resolved by the policy-blessed -# minimal base overlay compose.ccci.yml (see its header), neither weakening a test: -# (1) it pins the Docker-Hub-removed `bitnami/discourse:3.3.1` (404) → overlay re-pins app+sidekiq to -# `bitnamilegacy/discourse:3.3.1` (namespace-only, identical image), the same re-pin the PR makes; -# (2) its 5m start_period is too tight for the 15-25min Rails boot → overlay widens it to 20m (grace). -# The harness auto-provides the overlay to the checkout and auto-chaoses the base deploy -# (first-class compose.ccci.yml, rcust P2a); it persists across the head checkout (idempotent — the -# PR head already re-pins + ships 20m). -# Upgrade crossover: 0.7.0 (re-pinned base) → PR head; full assertions run on the HEAD. The 0.7.0 -# *custom* tests are not separately run (custom tier runs once, on the head — policy §1 allows skip+record). -UPGRADE_BASE_VERSION = "0.7.0+3.3.1" +# UPGRADE-tier BASE (phase prevb — DYNAMIC, no hardcoded UPGRADE_BASE_VERSION): the base the head +# upgrades from is resolved at run time — last-green (warm canonical) → fallback target-branch (`main`) +# tip → else skip (run_recipe_ci.resolve_upgrade_base). discourse has no warm canonical, so the base is +# the `main` tip = bitnamilegacy/discourse:3.5.0, which deploys clean (bitnamilegacy exists) with NO +# `previous/` repair needed. The PR head (recipe-maintainers/discourse#4) switches app to the official +# `discourse/discourse:3.5.3` and drops the sidekiq service, so the upgrade tier now exercises the REAL +# bitnamilegacy→official image migration the PR claims to support. +# +# compose.ccci.yml is now the ENVIRONMENTAL overlay (all deploys): only app.deploy.update_config.order: +# stop-first (node memory reality on the upgrade crossover — see its header). The version-specific +# bitnamilegacy re-pin + sidekiq block were REMOVED (they leaked onto the head and masked the migration +# — the prevb bug). No assertion weakened: the head runs unmodified and full assertions run on it. EXTRA_ENV = { "TIMEOUT": "3600", # abra's internal convergence wait; matches DEPLOY_TIMEOUT (slow Rails boot headroom) "COMPOSE_FILE": "compose.yml:compose.ccci.yml", diff --git a/tests/discourse/test_upgrade.py b/tests/discourse/test_upgrade.py new file mode 100644 index 0000000..b5bc690 --- /dev/null +++ b/tests/discourse/test_upgrade.py @@ -0,0 +1,40 @@ +"""discourse — UPGRADE overlay (phase prevb): FAITHFULNESS assertion that the PR head genuinely ran. + +The whole point of phase prevb: the old all-deploys overlay re-pinned the head back to +bitnamilegacy/discourse:3.3.1 and re-added the sidekiq service, so the head's official-image +migration was never tested. With the version-specific config removed from the all-deploys overlay +and the dynamic base (last-green/main = bitnamilegacy:3.5.0) deployed only as the *base*, the upgrade +chaos redeploy must land the PR head UNMODIFIED. This overlay asserts exactly that, post-upgrade: + + 1. the running `app` service image IS the official discourse/discourse:3.5.3 — NOT bitnamilegacy; + 2. the `sidekiq` service the PR deletes is GONE from the deployed stack. + +If either fails, the head did not really run (the overlay leaked onto it) → RED. Assertion-only, +additive to the generic upgrade tier (which already proves reconverge/serving/moved + HC1 commit stamp). +""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +from harness import lifecycle # noqa: E402 + + +def test_head_runs_official_image_not_bitnamilegacy(live_app): + image = lifecycle.deployed_identity(live_app, service="app").get("image") or "" + assert "bitnamilegacy" not in image, ( + f"app image is {image!r} — the bitnamilegacy base leaked onto the PR head " + "(the version-specific overlay was applied to the head, the prevb bug)" + ) + assert image.startswith("discourse/discourse:3.5.3"), ( + f"app image is {image!r}, expected the PR head's official discourse/discourse:3.5.3 " + "— the head's image migration was not exercised" + ) + + +def test_sidekiq_service_dropped_by_head(live_app): + services = lifecycle.stack_service_names(live_app) + assert "sidekiq" not in services, ( + f"sidekiq service still present after the upgrade to the PR head: {services} — the head " + "(which deletes sidekiq) did not really deploy" + ) diff --git a/tests/unit/test_previous.py b/tests/unit/test_previous.py new file mode 100644 index 0000000..1203aeb --- /dev/null +++ b/tests/unit/test_previous.py @@ -0,0 +1,118 @@ +"""Unit tests for the phase-prevb `previous/` overlay surface + COMPOSE_FILE layering (lifecycle). + +Covers: the version-guarded apply/skip/stale decision (`previous_status`), the VERSION-marker reader +(`previous_target_version`), folder discovery (`has_previous`), and the pure COMPOSE_FILE add/remove +helpers used to layer compose.previous.yml onto the base deploy and strip it before the head redeploy. +""" + +from __future__ import annotations + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +from harness import lifecycle # noqa: E402 +from harness import meta as meta_mod # noqa: E402 + +# ---- pure COMPOSE_FILE helpers -------------------------------------------------------------------- + + +def test_compose_file_add_appends_when_absent(): + assert ( + lifecycle.compose_file_add("compose.yml:compose.ccci.yml", "compose.previous.yml") + == "compose.yml:compose.ccci.yml:compose.previous.yml" + ) + + +def test_compose_file_add_idempotent(): + cf = "compose.yml:compose.previous.yml" + assert lifecycle.compose_file_add(cf, "compose.previous.yml") == cf + + +def test_compose_file_add_defaults_missing_base(): + assert ( + lifecycle.compose_file_add("", "compose.previous.yml") == "compose.yml:compose.previous.yml" + ) + + +def test_compose_file_remove_strips_only_overlay(): + assert ( + lifecycle.compose_file_remove( + "compose.yml:compose.ccci.yml:compose.previous.yml", "compose.previous.yml" + ) + == "compose.yml:compose.ccci.yml" + ) + + +def test_compose_file_remove_keeps_compose_yml_when_emptied(): + assert ( + lifecycle.compose_file_remove("compose.previous.yml", "compose.previous.yml") + == "compose.yml" + ) + + +# ---- marker reader + discovery (filesystem, via a temp TESTS_DIR) --------------------------------- + + +def _stage(monkeypatch, tmp_path, recipe, *, compose=True, marker=None): + monkeypatch.setattr(meta_mod, "TESTS_DIR", str(tmp_path)) + prev = tmp_path / recipe / "previous" + prev.mkdir(parents=True) + if compose: + (prev / "compose.previous.yml").write_text("services: {}\n") + if marker is not None: + (prev / "VERSION").write_text(marker) + return prev + + +def test_has_previous_true_only_with_compose(monkeypatch, tmp_path): + _stage(monkeypatch, tmp_path, "rx", compose=True) + assert lifecycle.has_previous("rx") is True + assert lifecycle.has_previous("other") is False + + +def test_previous_target_version_reads_first_real_line(monkeypatch, tmp_path): + _stage(monkeypatch, tmp_path, "rx", marker="# the previous published version\n\n0.7.0+3.3.1\n") + assert lifecycle.previous_target_version("rx") == "0.7.0+3.3.1" + + +def test_previous_target_version_none_without_marker(monkeypatch, tmp_path): + _stage(monkeypatch, tmp_path, "rx", marker=None) + assert lifecycle.previous_target_version("rx") is None + + +# ---- the apply/skip/stale decision matrix --------------------------------------------------------- + + +def test_previous_status_no_folder(monkeypatch): + monkeypatch.setattr(lifecycle, "has_previous", lambda r: False) + st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1") + assert st == {"apply": False, "stale": False, "reason": ""} + + +def test_previous_status_applies_on_version_match(monkeypatch): + monkeypatch.setattr(lifecycle, "has_previous", lambda r: True) + monkeypatch.setattr(lifecycle, "previous_target_version", lambda r: "0.7.0+3.3.1") + st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1") + assert st["apply"] is True and st["stale"] is False + + +def test_previous_status_stale_on_version_mismatch(monkeypatch): + monkeypatch.setattr(lifecycle, "has_previous", lambda r: True) + monkeypatch.setattr(lifecycle, "previous_target_version", lambda r: "0.6.0+3.1.1") + st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1") + assert st["apply"] is False and st["stale"] is True and "stale" in st["reason"] + + +def test_previous_status_stale_on_main_tip_base(monkeypatch): + # previous/ can only repair a pinned published base; a main-tip (ref) base flags it for review. + monkeypatch.setattr(lifecycle, "has_previous", lambda r: True) + st = lifecycle.previous_status("rx", "ref", None) + assert st["apply"] is False and st["stale"] is True + + +def test_previous_status_stale_without_marker(monkeypatch): + monkeypatch.setattr(lifecycle, "has_previous", lambda r: True) + monkeypatch.setattr(lifecycle, "previous_target_version", lambda r: None) + st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1") + assert st["apply"] is False and st["stale"] is True and "marker" in st["reason"] diff --git a/tests/unit/test_upgrade_base.py b/tests/unit/test_upgrade_base.py index b4b5c98..43dcb1f 100644 --- a/tests/unit/test_upgrade_base.py +++ b/tests/unit/test_upgrade_base.py @@ -1,12 +1,9 @@ -"""Unit tests for `run_recipe_ci.upgrade_base` — the deploy-once base-version decision. +"""Unit tests for `run_recipe_ci.resolve_upgrade_base` — the DYNAMIC upgrade-base decision (phase prevb). -Phase bsky: a recipe whose published versions ALL pin a moving image tag that upstream -republished with incompatible builds (bluesky-pds) has no deployable upgrade base — deploying -one fails the INSTALL tier before the PR head is ever exercised. The sanctioned escape hatch is -the EXISTING declared-intentional-skip mechanism: EXPECTED_NA["upgrade"] now also suppresses -the base deploy (single deploy = PR head; the tier records "skip"; derive_rungs classifies it -intentional with the declared reason). These tests lock the decision matrix; derive_rungs' -classification of the resulting skip is already covered in test_results.py. +Resolution order: upgrade∉stages / EXPECTED_NA[upgrade] (declared skip) → explicit UPGRADE_BASE_VERSION +override → last-green (warm canonical) → target-branch (`main`) tip → skip (no predecessor). The result +is a `BasePlan(kind, version, ref, reason)`: kind ∈ {"version", "ref", "skip"}; `.runs` is True for +version/ref (the upgrade tier runs). Replaces the old static `recipe_versions[-2]` default. """ from __future__ import annotations @@ -17,63 +14,92 @@ from types import SimpleNamespace sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) import run_recipe_ci # noqa: E402 -from harness import lifecycle # noqa: E402 +from harness import canonical, lifecycle # noqa: E402 ALL = {"install", "upgrade", "backup", "restore", "custom"} +HEAD = "aaaa1111head" +MAIN = "bbbb2222main" def _meta(expected_na=None, upgrade_base_version=None): return SimpleNamespace(EXPECTED_NA=expected_na, UPGRADE_BASE_VERSION=upgrade_base_version) -def test_default_prev_published(monkeypatch): - # upgrade in stages, ≥2 published versions, nothing declared → recipe_versions[-2] - monkeypatch.setattr(lifecycle, "previous_version", lambda r: "0.1.0+v1") - assert run_recipe_ci.upgrade_base(ALL, _meta(), "somerecipe") == "0.1.0+v1" +def _no_canonical(monkeypatch): + monkeypatch.setattr(canonical, "read_registry", lambda r: None) -def test_single_published_version_no_base(monkeypatch): - monkeypatch.setattr(lifecycle, "previous_version", lambda r: None) - assert run_recipe_ci.upgrade_base(ALL, _meta(), "somerecipe") is None +def _no_main(monkeypatch): + monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": None) -def test_upgrade_not_in_stages_no_base(monkeypatch): +def test_upgrade_not_in_stages_skip(monkeypatch): + # never consults canonical/main when upgrade isn't requested monkeypatch.setattr( - lifecycle, - "previous_version", - lambda r: (_ for _ in ()).throw(AssertionError("not consulted")), + canonical, "read_registry", lambda r: (_ for _ in ()).throw(AssertionError("not consulted")) ) - assert run_recipe_ci.upgrade_base(ALL - {"upgrade"}, _meta(), "somerecipe") is None + plan = run_recipe_ci.resolve_upgrade_base(ALL - {"upgrade"}, _meta(), "somerecipe") + assert plan.kind == "skip" and not plan.runs -def test_upgrade_base_version_override_wins(monkeypatch): +def test_expected_na_upgrade_skip_even_with_canonical_and_override(monkeypatch): + # EXPECTED_NA[upgrade] is the strongest, declared fact — short-circuits before override/canonical. monkeypatch.setattr( - lifecycle, - "previous_version", - lambda r: (_ for _ in ()).throw(AssertionError("not consulted")), + canonical, "read_registry", lambda r: {"version": "9.9.9", "status": "warm"} ) - meta = _meta(upgrade_base_version="0.7.0+3.3.1") - assert run_recipe_ci.upgrade_base(ALL, meta, "discourse") == "0.7.0+3.3.1" - - -def test_expected_na_upgrade_suppresses_base(monkeypatch): - # bluesky-pds shape: published versions exist but are undeployable — declared EXPECTED_NA - # upgrade → NO base (single deploy is the PR head), even though previous_version would - # return one and even if UPGRADE_BASE_VERSION is set (the declaration is the stronger, - # documented fact). - monkeypatch.setattr(lifecycle, "previous_version", lambda r: "0.1.1+v0.4") declared = {"upgrade": "no deployable upgrade base (moving tag republished)"} - assert run_recipe_ci.upgrade_base(ALL, _meta(expected_na=declared), "bluesky-pds") is None - assert ( - run_recipe_ci.upgrade_base( - ALL, _meta(expected_na=declared, upgrade_base_version="0.2.0+v0.4"), "bluesky-pds" - ) - is None + plan = run_recipe_ci.resolve_upgrade_base( + ALL, _meta(expected_na=declared, upgrade_base_version="0.2.0+v0.4"), "bluesky-pds" ) + assert plan.kind == "skip" and "EXPECTED_NA" in plan.reason -def test_expected_na_other_rung_does_not_suppress(monkeypatch): - # an EXPECTED_NA for backup_restore (custom-html-tiny shape) must NOT touch the upgrade base - monkeypatch.setattr(lifecycle, "previous_version", lambda r: "0.1.0+v1") - meta = _meta(expected_na={"backup_restore": "stateless"}) - assert run_recipe_ci.upgrade_base(ALL, meta, "custom-html-tiny") == "0.1.0+v1" +def test_explicit_override_wins_over_canonical(monkeypatch): + monkeypatch.setattr( + canonical, "read_registry", lambda r: {"version": "9.9.9", "status": "warm"} + ) + monkeypatch.setattr( + lifecycle, + "recipe_branch_commit", + lambda r, b="main": (_ for _ in ()).throw(AssertionError("not consulted")), + ) + plan = run_recipe_ci.resolve_upgrade_base( + ALL, _meta(upgrade_base_version="0.7.0+3.3.1"), "discourse" + ) + assert plan.kind == "version" and plan.version == "0.7.0+3.3.1" and plan.runs + + +def test_last_green_warm_canonical_is_primary(monkeypatch): + # no override → last-green (warm canonical version) is the PRIMARY base; main is not consulted. + monkeypatch.setattr( + canonical, "read_registry", lambda r: {"version": "0.6.0+3.1.1", "status": "idle"} + ) + monkeypatch.setattr( + lifecycle, + "recipe_branch_commit", + lambda r, b="main": (_ for _ in ()).throw(AssertionError("not consulted")), + ) + plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "keycloak", head_ref=HEAD) + assert plan.kind == "version" and plan.version == "0.6.0+3.1.1" and "last-green" in plan.reason + + +def test_main_tip_fallback_when_no_last_green(monkeypatch): + _no_canonical(monkeypatch) + monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": MAIN) + plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "discourse", head_ref=HEAD) + assert plan.kind == "ref" and plan.ref == MAIN and plan.version is None and plan.runs + + +def test_head_equals_main_tip_skip(monkeypatch): + # the PR head IS the main tip (empty PR / LATEST run) → no real predecessor → declared skip. + _no_canonical(monkeypatch) + monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": HEAD) + plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "discourse", head_ref=HEAD) + assert plan.kind == "skip" and "main tip" in plan.reason and not plan.runs + + +def test_no_canonical_no_main_skip(monkeypatch): + _no_canonical(monkeypatch) + _no_main(monkeypatch) + plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "brandnew", head_ref=HEAD) + assert plan.kind == "skip" and "no predecessor" in plan.reason