feat(prevb): dynamic upgrade base (last-green→main→skip) + per-recipe previous/ overlay; migrate discourse off static base + leaky overlay
All checks were successful
continuous-integration/drone/push Build is passing

- resolve_upgrade_base: BasePlan(kind=version|ref|skip); last-green (warm canonical) primary,
  main-tip fallback, declared skip else. UPGRADE_BASE_VERSION retained as optional override.
- deploy_app: base_ref path (chaos-deploy a main-tip/last-green commit) + apply_previous wiring.
- lifecycle: previous/ surface (has_previous, previous_target_version, previous_status decision,
  provide/remove overlay, compose_file add/remove, recipe_branch_commit, stack_service_names).
- generic.perform_upgrade: strip previous/ overlay + COMPOSE_FILE entry before head redeploy.
- discourse: compose.ccci.yml now environmental-only (order: stop-first); removed bitnamilegacy
  pins + sidekiq + UPGRADE_BASE_VERSION; test_upgrade.py asserts head image == official 3.5.3 + no sidekiq.
- unit tests: resolve_upgrade_base matrix + previous/ apply/skip/stale + COMPOSE_FILE layering.
This commit is contained in:
autonomic-bot
2026-06-17 00:14:53 +00:00
parent 1090abb97a
commit bb2e3c6b2c
8 changed files with 532 additions and 137 deletions

View File

@ -251,6 +251,16 @@ def perform_upgrade(
before = lifecycle.deployed_identity(domain)
if head_ref:
lifecycle.recipe_checkout_ref(recipe, head_ref)
# Phase prevb: strip the base-only `previous/` overlay before the head redeploy so the PR head
# runs UNMODIFIED (the version-specific repair must never leak onto the head). Delete the copied
# compose.previous.yml and drop it from COMPOSE_FILE in the app .env. No-op when no previous/ was
# applied (the file/entry are absent). The environmental compose.ccci.yml stays (all deploys).
lifecycle.remove_previous_overlay(recipe)
cf_now = abra.env_get(domain, "COMPOSE_FILE")
if cf_now and lifecycle.PREVIOUS_COMPOSE in cf_now:
cf_stripped = lifecycle.compose_file_remove(cf_now, lifecycle.PREVIOUS_COMPOSE)
abra.env_set(domain, "COMPOSE_FILE", cf_stripped)
print(f" previous-overlay: COMPOSE_FILE for head redeploy = {cf_stripped}", flush=True)
# UPGRADE_EXTRA_ENV (F2-14c): a recipe may need different app .env for the upgrade-TARGET deploy
# than for the base — e.g. mumble's `compose.host-ports.yml` overlay exists ONLY in the newer
# (target) version, so the base deploys minimally WITHOUT it and the upgrade adds it to COMPOSE_FILE

View File

@ -154,6 +154,145 @@ def provide_ccci_overlay(recipe: str) -> None:
)
# ---------------------------------------------------------------------------------------------
# Phase prevb: dynamic upgrade base + per-recipe `previous/` overlay.
#
# `previous/` holds the MINIMAL config needed to deploy the *previous (last-green) version* when it
# can't deploy as-published (e.g. an image relocation). It is applied ONLY to the base deploy and
# ONLY when the resolved base is that exact published version; NEVER to the PR head; on a main-tip
# base or version mismatch it is skipped + flagged stale. Mechanism mirrors the environmental
# compose.ccci.yml overlay (copied untracked into the checkout, referenced via COMPOSE_FILE), but is
# stripped before the head redeploy so the head runs unmodified. (plan-phase-prevb §2.)
# ---------------------------------------------------------------------------------------------
PREVIOUS_COMPOSE = "compose.previous.yml"
def previous_dir(recipe: str) -> str:
return os.path.join(meta_mod.TESTS_DIR, recipe, "previous")
def has_previous(recipe: str) -> bool:
"""True iff the recipe ships a `tests/<recipe>/previous/` folder with a compose.previous.yml."""
return os.path.isfile(os.path.join(previous_dir(recipe), PREVIOUS_COMPOSE))
def previous_target_version(recipe: str) -> str | None:
"""The published version the recipe's `previous/` folder targets — declared in a one-line
`previous/VERSION` marker (first non-blank, non-`#` line). None if no marker. The harness applies
`previous/` ONLY when the resolved base equals this; otherwise the folder is stale and skipped."""
marker = os.path.join(previous_dir(recipe), "VERSION")
try:
with open(marker) as f:
for line in f:
s = line.strip()
if s and not s.startswith("#"):
return s
except OSError:
return None
return None
def previous_status(recipe: str, base_kind: str, base_version: str | None) -> dict:
"""Decide whether `previous/` applies to this base, as a pure decision (unit-tested).
Returns {apply, stale, reason}:
- no `previous/` folder → apply=False, stale=False (nothing to do).
- base is not a pinned published → apply=False, stale=True (main-tip / no base): previous/ can
version (kind != "version") only repair a published base; flag for review.
- no `previous/VERSION` marker → apply=False, stale=True (undeclared: cannot version-guard).
- marker != resolved base version → apply=False, stale=True (stale: targets X, base is Y).
- marker == resolved base version → apply=True, stale=False.
"""
if not has_previous(recipe):
return {"apply": False, "stale": False, "reason": ""}
if base_kind != "version" or not base_version:
return {
"apply": False,
"stale": True,
"reason": (
f"previous/ present but the resolved base is not a pinned published version "
f"(base_kind={base_kind}) — previous/ only repairs a published base; not applied"
),
}
target = previous_target_version(recipe)
if not target:
return {
"apply": False,
"stale": True,
"reason": "previous/ has no VERSION marker — cannot version-guard; not applied",
}
if target != base_version:
return {
"apply": False,
"stale": True,
"reason": f"previous/ targets {target}, base is {base_version} — stale, remove it",
}
return {"apply": True, "stale": False, "reason": ""}
def provide_previous_overlay(recipe: str) -> None:
"""Copy `tests/<recipe>/previous/compose.previous.yml` into THIS run's recipe checkout so a
COMPOSE_FILE reference to it resolves (base deploy only). No-op if absent."""
src = os.path.join(previous_dir(recipe), PREVIOUS_COMPOSE)
if not os.path.isfile(src):
return
dest_dir = abra.recipe_dir(recipe)
if not os.path.isdir(dest_dir):
raise RuntimeError(f"recipe checkout missing for {recipe}: {dest_dir}")
shutil.copy(src, os.path.join(dest_dir, PREVIOUS_COMPOSE))
print(
f" previous-overlay: provided {PREVIOUS_COMPOSE} to the {recipe} base checkout "
"(base-only; stripped before the head redeploy)",
flush=True,
)
def remove_previous_overlay(recipe: str) -> None:
"""Delete compose.previous.yml from this run's recipe checkout (called before the head redeploy so
the PR head NEVER sees the previous-version repair). No-op if absent."""
p = os.path.join(abra.recipe_dir(recipe), PREVIOUS_COMPOSE)
with contextlib.suppress(OSError):
if os.path.isfile(p):
os.remove(p)
print(
f" previous-overlay: removed {PREVIOUS_COMPOSE} from the head checkout", flush=True
)
def compose_file_add(compose_file: str, overlay: str) -> str:
"""Append `overlay` to a ':'-separated COMPOSE_FILE value if absent (pure). Defaults a missing
value to `compose.yml` first, matching abra."""
parts = [p for p in (compose_file or "").split(":") if p] or ["compose.yml"]
if overlay not in parts:
parts.append(overlay)
return ":".join(parts)
def compose_file_remove(compose_file: str, overlay: str) -> str:
"""Remove `overlay` from a ':'-separated COMPOSE_FILE value (pure). Keeps order; defaults a now-
empty value to `compose.yml`."""
parts = [p for p in (compose_file or "").split(":") if p and p != overlay]
return ":".join(parts or ["compose.yml"])
def recipe_branch_commit(recipe: str, branch: str = "main") -> str | None:
"""Resolve the recipe repo's target-branch tip (the predecessor the PR merges onto) to a commit
SHA, for the dynamic upgrade-base main-tip fallback. The per-run tree is a full clone of the
mirror, so `origin/<branch>` is present. Tries origin/<branch>, then origin/master. None if
neither resolves (new recipe / detached state)."""
path = abra.recipe_dir(recipe)
for ref in (f"origin/{branch}", "origin/master"):
proc = subprocess.run(
["git", "-C", path, "rev-parse", "--verify", "--quiet", ref],
capture_output=True,
text=True,
)
if proc.returncode == 0 and proc.stdout.strip():
return proc.stdout.strip()
return None
def _run_install_steps(hook: tuple[str, str], recipe: str, domain: str) -> None:
"""Run a recipe's custom install-steps hook (install_steps.sh) during the install tier — after
`abra app new` + env defaults + secret generate, before deploy (Phase 1d DG5). The hook gets the
@ -234,6 +373,8 @@ def deploy_app(
recipe: str,
domain: str,
version: str | None = None,
base_ref: str | None = None,
apply_previous: bool = False,
secrets: bool = True,
install_steps_hook: tuple[str, str] | None = None,
deploy_timeout: int = 900,
@ -273,7 +414,18 @@ def deploy_app(
# Adversary F1d-2). Chaos is correct ONLY for the version=None case (deploy the current PR-head
# checkout). Order matters: checkout before secret_generate (-C) so secrets match the pinned tree.
chaos = version is None
if version:
if base_ref:
# Dynamic upgrade base = target-branch (main) tip, or a last-green commit (phase prevb): an
# arbitrary git ref, not a published tag. Check it out and deploy via chaos — same mechanism
# as the head deploy (a non-tag ref would FATA abra's pinned-deploy lint/clean-tree gate).
recipe_checkout_ref(recipe, base_ref)
chaos = True
print(
f" deploy_app({recipe}): base = main-tip/ref {base_ref[:12]} → chaos deploy of the "
"checked-out ref (the PR's true predecessor; not a published pin)",
flush=True,
)
elif version:
abra.recipe_checkout(recipe, version)
# A pinned (non-chaos) deploy runs `abra recipe lint`, which FATAs R014 ('only annotated
# tags') if the upstream recipe ships a stray lightweight version tag (e.g. lasuite-meet's
@ -309,8 +461,19 @@ def deploy_app(
# it ourselves is recipe-agnostic and canonical (the run domain IS the app's domain).
abra.env_set(domain, "DOMAIN", domain)
abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
for k, v in meta_mod.extra_env(meta, meta_mod.hook_ctx(domain, meta)).items():
extra = meta_mod.extra_env(meta, meta_mod.hook_ctx(domain, meta))
for k, v in extra.items():
abra.env_set(domain, k, v)
# Per-recipe `previous/` overlay (phase prevb): version-specific repair to deploy the *previous
# (last-green) version*, applied to the BASE deploy ONLY (the caller resolved apply=True only when
# the resolved base equals previous/VERSION). Appended on top of the recipe's COMPOSE_FILE (which
# may already include the environmental compose.ccci.yml). Stripped before the head redeploy
# (generic.perform_upgrade) so the PR head runs unmodified.
if apply_previous:
cf = compose_file_add(extra.get("COMPOSE_FILE", "compose.yml"), PREVIOUS_COMPOSE)
abra.env_set(domain, "COMPOSE_FILE", cf)
provide_previous_overlay(recipe)
print(f" previous-overlay: COMPOSE_FILE for base deploy = {cf}", flush=True)
if secrets:
abra.secret_generate(domain)
if install_steps_hook:
@ -332,6 +495,24 @@ def _stack_name(domain: str) -> str:
return domain.replace(".", "_")
def stack_service_names(domain: str) -> list[str]:
"""Short service names in this app's swarm stack (stack prefix stripped). Used by recipe overlay
assertions to prove a service was added/removed by an upgrade (e.g. discourse drops `sidekiq`)."""
stack = _stack_name(domain)
proc = subprocess.run(
["docker", "stack", "services", stack, "--format", "{{.Name}}"],
capture_output=True,
text=True,
)
names = []
for ln in proc.stdout.split("\n"):
n = ln.strip()
if not n:
continue
names.append(n[len(stack) + 1 :] if n.startswith(stack + "_") else n)
return names
def services_converged(domain: str) -> bool:
"""True when every service in the stack reports replicas N/N (N>0) AND no service is
mid-rolling-update (swarm UpdateStatus settled)."""

View File

@ -38,6 +38,7 @@ import subprocess
import sys
import tempfile
import time
from typing import NamedTuple
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(ROOT, "runner"))
@ -88,36 +89,66 @@ def sso_dep_unverified(declared, deps_ready: bool, requires_deps_skipped: int) -
return bool(declared) and not deps_ready and requires_deps_skipped > 0
def upgrade_base(stages, meta, recipe: str) -> str | None:
"""Deploy-once base version decision (pure given meta + the published-version lookup):
previous published version when the upgrade tier will run and one exists (so upgrade goes
previous→target in place), else None (the caller falls back to the target / PR head).
(DECISIONS.)
class BasePlan(NamedTuple):
"""Resolved upgrade-base decision (phase prevb). `kind`:
- "version" → deploy a pinned published version (`version`): an explicit UPGRADE_BASE_VERSION
override, or last-green (warm-canonical) version. `previous/` may apply (version-guarded).
- "ref" → deploy the target-branch (main) tip at commit `ref` (chaos): the true predecessor
the PR merges onto, used when there is no last-green. `previous/` never applies to a ref base.
- "skip" → no upgrade base; the single deploy is the PR head and the upgrade tier records a
declared skip with `reason` (upgrade∉stages / EXPECTED_NA / new recipe / head==main tip)."""
A recipe may override the base via recipe_meta UPGRADE_BASE_VERSION when the harness default
(recipe_versions[-2]) is NOT the PR's true predecessor — e.g. a PR that adds a version ABOVE the
newest published tag, where the correct base is [-1] (the newest published), not [-2]. The
override must be an exact published version tag (deployed as a pinned base). (Adversary §7.1.)
kind: str
version: str | None
ref: str | None
reason: str
A recipe that declares the upgrade rung in EXPECTED_NA gets NO base: published versions may
exist yet be genuinely undeployable — e.g. bluesky-pds, where every published tag pins the
moving image tag `:0.4` that upstream republished with incompatible main builds, so no
published version can come up as an upgrade base (phase bsky, DECISIONS). Deploying one would
fail the INSTALL tier before the PR-head code is ever exercised. With no base, the single
deploy is the PR head itself and the upgrade tier records "skip", which derive_rungs
classifies as the DECLARED intentional skip (reason from EXPECTED_NA — visible in
results.json `skips.intentional`, never reported as a pass)."""
@property
def runs(self) -> bool:
return self.kind in ("version", "ref")
def resolve_upgrade_base(stages, meta, recipe: str, head_ref: str | None = None) -> BasePlan:
"""Dynamic upgrade-base resolution (phase prevb, replaces the static `recipe_versions[-2]`
default). Order: explicit override → last-green (warm canonical) → target-branch (main) tip →
skip. EXPECTED_NA[upgrade] / upgrade∉stages short-circuit to a declared skip first.
last-green is the PRIMARY base — the version cc-ci last recorded green for this recipe (the
warm-canonical registry record). main-tip is the FALLBACK: the recipe repo's `main` HEAD, the
real predecessor the PR merges on top of, used when there is no last-green. Else the tier is
skipped with a recorded reason (structural, declared — not a silent pass).
`UPGRADE_BASE_VERSION` is RETAINED as an optional explicit override (wins when set) for the rare
PR-adds-a-version-above-the-newest-tag case; it is no longer the default (DECISIONS prevb)."""
if "upgrade" not in stages:
return None
if "upgrade" in (meta.EXPECTED_NA or {}):
return BasePlan("skip", None, None, "upgrade tier not in requested stages")
declared = (meta.EXPECTED_NA or {}).get("upgrade")
if declared:
print(
"== upgrade tier: declared EXPECTED_NA['upgrade'] — no upgrade base will be "
f"deployed; the single deploy is the target/PR head. Reason: "
f"{(meta.EXPECTED_NA or {}).get('upgrade')}",
f"== upgrade tier: declared EXPECTED_NA['upgrade'] — single deploy is the PR head. "
f"Reason: {declared}",
flush=True,
)
return None
return meta.UPGRADE_BASE_VERSION or lifecycle.previous_version(recipe)
return BasePlan("skip", None, None, f"declared EXPECTED_NA[upgrade]: {declared}")
override = getattr(meta, "UPGRADE_BASE_VERSION", None)
if override:
return BasePlan("version", override, None, "explicit UPGRADE_BASE_VERSION override")
rec = canonical.read_registry(recipe)
if rec and rec.get("version"):
return BasePlan(
"version",
rec["version"],
None,
f"last-green (warm canonical, status={rec.get('status')})",
)
main_tip = lifecycle.recipe_branch_commit(recipe, "main")
if main_tip and main_tip != head_ref:
return BasePlan("ref", None, main_tip, "target-branch (main) tip")
if main_tip and main_tip == head_ref:
return BasePlan("skip", None, None, "head == main tip (no predecessor delta)")
return BasePlan(
"skip", None, None, "no last-green and no main tip (new recipe / no predecessor)"
)
def _truthy(v: str | None) -> bool:
@ -952,8 +983,25 @@ def main() -> int:
domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref)
prev = upgrade_base(stages, meta, recipe)
base = prev or target
base_plan = resolve_upgrade_base(stages, meta, recipe, head_ref=head_ref)
prev = base_plan.runs # gates the upgrade tier
# base deploy target: a pinned published version (kind=version) or main-tip commit (kind=ref);
# on skip fall back to the run's VERSION/head (target=None → chaos head deploy, as before).
base = base_plan.version or target
base_ref = base_plan.ref
prev_status = lifecycle.previous_status(recipe, base_plan.kind, base_plan.version)
print(
f"== upgrade base: kind={base_plan.kind} "
f"{('version=' + base) if base_plan.kind == 'version' else ''}"
f"{('ref=' + (base_ref or '')[:12]) if base_plan.kind == 'ref' else ''}"
f"{(' SKIP: ' + base_plan.reason) if base_plan.kind == 'skip' else ''} "
f"({base_plan.reason if base_plan.kind != 'skip' else ''})",
flush=True,
)
if prev_status["stale"]:
print(f"!! previous/ STALE — {prev_status['reason']}", flush=True)
elif prev_status["apply"]:
print(f"== previous/ applies to the base deploy (targets {base})", flush=True)
backup_cap = generic.backup_capable(recipe, meta)
hook = discovery.install_steps(recipe, repo_local)
@ -1051,6 +1099,8 @@ def main() -> int:
recipe,
domain,
version=base,
base_ref=base_ref,
apply_previous=prev_status["apply"],
secrets=True,
install_steps_hook=hook,
deploy_timeout=int(meta.DEPLOY_TIMEOUT),
@ -1129,7 +1179,7 @@ def main() -> int:
junit_dir=junit_dir,
)
if prev
else "skip" # no upgrade base: single published version, or declared EXPECTED_NA
else "skip" # base_plan.kind == "skip": no predecessor / EXPECTED_NA / head==main
)
# ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ----
if "backup" in stages:

View File

@ -1,57 +1,29 @@
---
version: "3.8"
# cc-ci overlay (Phase 2 Q4.6) — minimal, single-purpose: make the UPGRADE-tier BASE deploy (the
# previous published discourse version) deployable so upgrade-to-latest can run.
# cc-ci ENVIRONMENTAL overlay (phase prevb) — applies to ALL deploys (base + PR head). Node-reality
# tweaks ONLY; NO version-specific image pins or service add/drop. (Version-specific repairs, when a
# previous base can't deploy as-published, live in tests/<recipe>/previous/ — base-only. discourse
# needs none: the dynamic base = last-green / main tip = bitnamilegacy/discourse:3.5.0 deploys clean.)
#
# WHY THIS OVERLAY EXISTS (plan-ccci-compose-overlay-policy.md §1 "minimal justified fallback" +
# the §1 mandate that upgrade-to-latest must ALWAYS run): the harness base-deploys the from-version
# (UPGRADE_BASE_VERSION = 0.7.0+3.3.1), then `deploy --chaos` to the recipe-PR head. Two blockers on
# that published base, both resolved here, NEITHER weakening any test:
# 1. RE-PIN: every published discourse tag pins `bitnami/discourse:3.3.1` (and 0.6.3 → 3.1.2),
# but Docker Hub REMOVED the bitnami/discourse namespace (404). The recipe-PR (recipe-maintainers/
# discourse#1) re-pins app+sidekiq to `bitnamilegacy/discourse:3.3.1` (the legit upstream
# relocation of the identical image). This overlay applies the SAME namespace-only re-pin to the
# BASE 0.7.0 (identical version 3.3.1, identical image content) so the from-version pulls — exactly
# the policy-blessed "minimal bitnami→bitnamilegacy re-pin overlay on the 0.7.0 from-version".
# 2. GRACE: discourse's Rails cold first boot (DB migrate + asset precompile) is 15-25min on cc-ci,
# exceeding the published 5m start_period → swarm kills the still-booting app. start_period CANNOT
# be an env var (abra validates the literal 'duration' BEFORE substitution → FATA; Adversary-
# reproduced, REVIEW-2 4b862f6), so we widen it to a literal 20m on the BASE. The PR head already
# ships 20m, so this overlay is idempotent on the head (it persists untracked across the checkout).
# Both changes are namespace/grace-only: identical image content, a healthy check still marks healthy
# immediately → NO assertion is weakened and no defect is masked.
# Fusing version-specific config into this all-deploys overlay was the prevb bug: the old overlay
# re-pinned app+sidekiq to bitnamilegacy/discourse:3.3.1 and re-added the sidekiq service on EVERY
# deploy, so the PR head (official discourse/discourse:3.5.3, sidekiq dropped) was silently reverted
# to the old image and its migration never tested. Removed entirely; only the environmental tweak below
# remains.
#
# NOTE (prepull): the published recipe ships `sidekiq.depends_on: [discourse]` but the main service is
# named `app` (`discourse` is undefined), so `abra app config --images` returns invalid-compose (rc=15)
# and the harness prepull is SKIPPED. This overlay does NOT try to override depends_on — compose
# normalizes short-form depends_on to a map and map-merge is additive, so an override can't REMOVE the
# bad `discourse` key. Instead the 2.4GB `bitnamilegacy/discourse:3.3.1` image is kept warm in the node
# image cache, so the inline pull during deploy is a no-op and convergence isn't pull-bound. (swarm
# ignores depends_on, so the dangling ref has zero runtime effect — a recipe lint nit, not a defect.)
#
# 3. UPGRADE ROLLOUT (dstamp 2026-06-11, direct-evidence attribution in JOURNAL-dstamp): the
# published app service sets `deploy.update_config: { failure_action: rollback, order:
# start-first }`. On the upgrade chaos redeploy (base 0.7.0 → PR head), start-first runs the OLD
# and NEW precompile/Rails-heavy discourse tasks CO-RESIDENT (~2x memory); under host memory
# pressure the NEW task intermittently OOMs/fails swarm's update monitor → `failure_action:
# rollback` reverts the app service to its PREVIOUS spec, INCLUDING the
# `coop-cloud.<stack>.chaos-version` label (head → base). Because start-first keeps the OLD task
# serving, wait_healthy still passes, and HC1 then reads the reverted BASE commit (eb96de9+U) and
# misreports it as 'the re-checkout failed' — the dstamp drift, reproduced solo (runs
# dstamp-repro1/4) with `.Spec.chaos-version=7ae7b0f7+U` (head applied) flipping to
# `.PreviousSpec=eb96de94+U` after the rollback. FIX: `order: stop-first` so the NEW task boots
# with the full host memory (no 2x co-residency) and genuinely becomes healthy → no spurious
# rollback. This is a CI deploy-rollout tweak only: the upgrade still really deploys + asserts the
# PR-head code under test, and `failure_action: rollback` is LEFT intact, so a genuinely broken
# head still rolls back and is caught (lifecycle.assert_upgrade_converged) — NO test is weakened.
# Trade-off: brief real downtime during the CI upgrade (covered by DEPLOY_TIMEOUT 3600).
# WHY (environmental — depends on the cc-ci node, must apply to the head too): the upgrade chaos
# redeploy (base bitnamilegacy:3.5.0 → PR-head official 3.5.3) runs the OLD and NEW Rails-heavy
# (DB-migrate + asset-precompile) tasks. The published recipe sets
# app.deploy.update_config.order: start-first, which keeps both co-resident (~2x memory); under the
# single CI node's memory pressure the NEW task intermittently OOMs swarm's update monitor →
# failure_action: rollback reverts the app spec (incl. the coop-cloud.<stack>.chaos-version label)
# while the old task keeps serving, and the upgrade is misreported (dstamp finding, JOURNAL-dstamp).
# FIX: order: stop-first so the NEW task boots with the full host memory (no 2x co-residency) and
# genuinely becomes healthy. failure_action: rollback is LEFT intact → a genuinely broken head still
# rolls back and is caught (lifecycle.assert_upgrade_converged) — NO test/assertion is weakened.
# Trade-off: brief real downtime during the CI upgrade (covered by DEPLOY_TIMEOUT 3600).
services:
app:
image: bitnamilegacy/discourse:3.3.1
healthcheck:
start_period: 20m
deploy:
update_config:
order: stop-first
sidekiq:
image: bitnamilegacy/discourse:3.3.1

View File

@ -21,20 +21,18 @@ HTTP_TIMEOUT = 1200
# still marks healthy immediately → fast hosts unaffected). Precedent: lasuite-drive collabora PR.
# TIMEOUT (abra's internal convergence wait) is raised to outlast the boot.
#
# UPGRADE-tier BASE (compose.ccci.yml + UPGRADE_BASE_VERSION): upgrade-to-latest must ALWAYS run
# (plan-ccci-compose-overlay-policy.md §1). The from-version is the latest published 0.7.0+3.3.1
# (UPGRADE_BASE_VERSION below; the PR head is 0.7.0-based, so 0.7.0 is the true predecessor — not the
# default [-2]=0.6.3). The published 0.7.0 has TWO blockers, both resolved by the policy-blessed
# minimal base overlay compose.ccci.yml (see its header), neither weakening a test:
# (1) it pins the Docker-Hub-removed `bitnami/discourse:3.3.1` (404) → overlay re-pins app+sidekiq to
# `bitnamilegacy/discourse:3.3.1` (namespace-only, identical image), the same re-pin the PR makes;
# (2) its 5m start_period is too tight for the 15-25min Rails boot → overlay widens it to 20m (grace).
# The harness auto-provides the overlay to the checkout and auto-chaoses the base deploy
# (first-class compose.ccci.yml, rcust P2a); it persists across the head checkout (idempotent — the
# PR head already re-pins + ships 20m).
# Upgrade crossover: 0.7.0 (re-pinned base) → PR head; full assertions run on the HEAD. The 0.7.0
# *custom* tests are not separately run (custom tier runs once, on the head — policy §1 allows skip+record).
UPGRADE_BASE_VERSION = "0.7.0+3.3.1"
# UPGRADE-tier BASE (phase prevb — DYNAMIC, no hardcoded UPGRADE_BASE_VERSION): the base the head
# upgrades from is resolved at run time — last-green (warm canonical) → fallback target-branch (`main`)
# tip → else skip (run_recipe_ci.resolve_upgrade_base). discourse has no warm canonical, so the base is
# the `main` tip = bitnamilegacy/discourse:3.5.0, which deploys clean (bitnamilegacy exists) with NO
# `previous/` repair needed. The PR head (recipe-maintainers/discourse#4) switches app to the official
# `discourse/discourse:3.5.3` and drops the sidekiq service, so the upgrade tier now exercises the REAL
# bitnamilegacy→official image migration the PR claims to support.
#
# compose.ccci.yml is now the ENVIRONMENTAL overlay (all deploys): only app.deploy.update_config.order:
# stop-first (node memory reality on the upgrade crossover — see its header). The version-specific
# bitnamilegacy re-pin + sidekiq block were REMOVED (they leaked onto the head and masked the migration
# — the prevb bug). No assertion weakened: the head runs unmodified and full assertions run on it.
EXTRA_ENV = {
"TIMEOUT": "3600", # abra's internal convergence wait; matches DEPLOY_TIMEOUT (slow Rails boot headroom)
"COMPOSE_FILE": "compose.yml:compose.ccci.yml",

View File

@ -0,0 +1,40 @@
"""discourse — UPGRADE overlay (phase prevb): FAITHFULNESS assertion that the PR head genuinely ran.
The whole point of phase prevb: the old all-deploys overlay re-pinned the head back to
bitnamilegacy/discourse:3.3.1 and re-added the sidekiq service, so the head's official-image
migration was never tested. With the version-specific config removed from the all-deploys overlay
and the dynamic base (last-green/main = bitnamilegacy:3.5.0) deployed only as the *base*, the upgrade
chaos redeploy must land the PR head UNMODIFIED. This overlay asserts exactly that, post-upgrade:
1. the running `app` service image IS the official discourse/discourse:3.5.3 — NOT bitnamilegacy;
2. the `sidekiq` service the PR deletes is GONE from the deployed stack.
If either fails, the head did not really run (the overlay leaked onto it) → RED. Assertion-only,
additive to the generic upgrade tier (which already proves reconverge/serving/moved + HC1 commit stamp).
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle # noqa: E402
def test_head_runs_official_image_not_bitnamilegacy(live_app):
image = lifecycle.deployed_identity(live_app, service="app").get("image") or ""
assert "bitnamilegacy" not in image, (
f"app image is {image!r} — the bitnamilegacy base leaked onto the PR head "
"(the version-specific overlay was applied to the head, the prevb bug)"
)
assert image.startswith("discourse/discourse:3.5.3"), (
f"app image is {image!r}, expected the PR head's official discourse/discourse:3.5.3 "
"— the head's image migration was not exercised"
)
def test_sidekiq_service_dropped_by_head(live_app):
services = lifecycle.stack_service_names(live_app)
assert "sidekiq" not in services, (
f"sidekiq service still present after the upgrade to the PR head: {services} — the head "
"(which deletes sidekiq) did not really deploy"
)

118
tests/unit/test_previous.py Normal file
View File

@ -0,0 +1,118 @@
"""Unit tests for the phase-prevb `previous/` overlay surface + COMPOSE_FILE layering (lifecycle).
Covers: the version-guarded apply/skip/stale decision (`previous_status`), the VERSION-marker reader
(`previous_target_version`), folder discovery (`has_previous`), and the pure COMPOSE_FILE add/remove
helpers used to layer compose.previous.yml onto the base deploy and strip it before the head redeploy.
"""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle # noqa: E402
from harness import meta as meta_mod # noqa: E402
# ---- pure COMPOSE_FILE helpers --------------------------------------------------------------------
def test_compose_file_add_appends_when_absent():
assert (
lifecycle.compose_file_add("compose.yml:compose.ccci.yml", "compose.previous.yml")
== "compose.yml:compose.ccci.yml:compose.previous.yml"
)
def test_compose_file_add_idempotent():
cf = "compose.yml:compose.previous.yml"
assert lifecycle.compose_file_add(cf, "compose.previous.yml") == cf
def test_compose_file_add_defaults_missing_base():
assert (
lifecycle.compose_file_add("", "compose.previous.yml") == "compose.yml:compose.previous.yml"
)
def test_compose_file_remove_strips_only_overlay():
assert (
lifecycle.compose_file_remove(
"compose.yml:compose.ccci.yml:compose.previous.yml", "compose.previous.yml"
)
== "compose.yml:compose.ccci.yml"
)
def test_compose_file_remove_keeps_compose_yml_when_emptied():
assert (
lifecycle.compose_file_remove("compose.previous.yml", "compose.previous.yml")
== "compose.yml"
)
# ---- marker reader + discovery (filesystem, via a temp TESTS_DIR) ---------------------------------
def _stage(monkeypatch, tmp_path, recipe, *, compose=True, marker=None):
monkeypatch.setattr(meta_mod, "TESTS_DIR", str(tmp_path))
prev = tmp_path / recipe / "previous"
prev.mkdir(parents=True)
if compose:
(prev / "compose.previous.yml").write_text("services: {}\n")
if marker is not None:
(prev / "VERSION").write_text(marker)
return prev
def test_has_previous_true_only_with_compose(monkeypatch, tmp_path):
_stage(monkeypatch, tmp_path, "rx", compose=True)
assert lifecycle.has_previous("rx") is True
assert lifecycle.has_previous("other") is False
def test_previous_target_version_reads_first_real_line(monkeypatch, tmp_path):
_stage(monkeypatch, tmp_path, "rx", marker="# the previous published version\n\n0.7.0+3.3.1\n")
assert lifecycle.previous_target_version("rx") == "0.7.0+3.3.1"
def test_previous_target_version_none_without_marker(monkeypatch, tmp_path):
_stage(monkeypatch, tmp_path, "rx", marker=None)
assert lifecycle.previous_target_version("rx") is None
# ---- the apply/skip/stale decision matrix ---------------------------------------------------------
def test_previous_status_no_folder(monkeypatch):
monkeypatch.setattr(lifecycle, "has_previous", lambda r: False)
st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1")
assert st == {"apply": False, "stale": False, "reason": ""}
def test_previous_status_applies_on_version_match(monkeypatch):
monkeypatch.setattr(lifecycle, "has_previous", lambda r: True)
monkeypatch.setattr(lifecycle, "previous_target_version", lambda r: "0.7.0+3.3.1")
st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1")
assert st["apply"] is True and st["stale"] is False
def test_previous_status_stale_on_version_mismatch(monkeypatch):
monkeypatch.setattr(lifecycle, "has_previous", lambda r: True)
monkeypatch.setattr(lifecycle, "previous_target_version", lambda r: "0.6.0+3.1.1")
st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1")
assert st["apply"] is False and st["stale"] is True and "stale" in st["reason"]
def test_previous_status_stale_on_main_tip_base(monkeypatch):
# previous/ can only repair a pinned published base; a main-tip (ref) base flags it for review.
monkeypatch.setattr(lifecycle, "has_previous", lambda r: True)
st = lifecycle.previous_status("rx", "ref", None)
assert st["apply"] is False and st["stale"] is True
def test_previous_status_stale_without_marker(monkeypatch):
monkeypatch.setattr(lifecycle, "has_previous", lambda r: True)
monkeypatch.setattr(lifecycle, "previous_target_version", lambda r: None)
st = lifecycle.previous_status("rx", "version", "0.7.0+3.3.1")
assert st["apply"] is False and st["stale"] is True and "marker" in st["reason"]

View File

@ -1,12 +1,9 @@
"""Unit tests for `run_recipe_ci.upgrade_base` — the deploy-once base-version decision.
"""Unit tests for `run_recipe_ci.resolve_upgrade_base` — the DYNAMIC upgrade-base decision (phase prevb).
Phase bsky: a recipe whose published versions ALL pin a moving image tag that upstream
republished with incompatible builds (bluesky-pds) has no deployable upgrade base — deploying
one fails the INSTALL tier before the PR head is ever exercised. The sanctioned escape hatch is
the EXISTING declared-intentional-skip mechanism: EXPECTED_NA["upgrade"] now also suppresses
the base deploy (single deploy = PR head; the tier records "skip"; derive_rungs classifies it
intentional with the declared reason). These tests lock the decision matrix; derive_rungs'
classification of the resulting skip is already covered in test_results.py.
Resolution order: upgrade∉stages / EXPECTED_NA[upgrade] (declared skip) → explicit UPGRADE_BASE_VERSION
override → last-green (warm canonical) → target-branch (`main`) tip → skip (no predecessor). The result
is a `BasePlan(kind, version, ref, reason)`: kind ∈ {"version", "ref", "skip"}; `.runs` is True for
version/ref (the upgrade tier runs). Replaces the old static `recipe_versions[-2]` default.
"""
from __future__ import annotations
@ -17,63 +14,92 @@ from types import SimpleNamespace
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
import run_recipe_ci # noqa: E402
from harness import lifecycle # noqa: E402
from harness import canonical, lifecycle # noqa: E402
ALL = {"install", "upgrade", "backup", "restore", "custom"}
HEAD = "aaaa1111head"
MAIN = "bbbb2222main"
def _meta(expected_na=None, upgrade_base_version=None):
return SimpleNamespace(EXPECTED_NA=expected_na, UPGRADE_BASE_VERSION=upgrade_base_version)
def test_default_prev_published(monkeypatch):
# upgrade in stages, ≥2 published versions, nothing declared → recipe_versions[-2]
monkeypatch.setattr(lifecycle, "previous_version", lambda r: "0.1.0+v1")
assert run_recipe_ci.upgrade_base(ALL, _meta(), "somerecipe") == "0.1.0+v1"
def _no_canonical(monkeypatch):
monkeypatch.setattr(canonical, "read_registry", lambda r: None)
def test_single_published_version_no_base(monkeypatch):
monkeypatch.setattr(lifecycle, "previous_version", lambda r: None)
assert run_recipe_ci.upgrade_base(ALL, _meta(), "somerecipe") is None
def _no_main(monkeypatch):
monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": None)
def test_upgrade_not_in_stages_no_base(monkeypatch):
def test_upgrade_not_in_stages_skip(monkeypatch):
# never consults canonical/main when upgrade isn't requested
monkeypatch.setattr(
lifecycle,
"previous_version",
lambda r: (_ for _ in ()).throw(AssertionError("not consulted")),
canonical, "read_registry", lambda r: (_ for _ in ()).throw(AssertionError("not consulted"))
)
assert run_recipe_ci.upgrade_base(ALL - {"upgrade"}, _meta(), "somerecipe") is None
plan = run_recipe_ci.resolve_upgrade_base(ALL - {"upgrade"}, _meta(), "somerecipe")
assert plan.kind == "skip" and not plan.runs
def test_upgrade_base_version_override_wins(monkeypatch):
def test_expected_na_upgrade_skip_even_with_canonical_and_override(monkeypatch):
# EXPECTED_NA[upgrade] is the strongest, declared fact — short-circuits before override/canonical.
monkeypatch.setattr(
lifecycle,
"previous_version",
lambda r: (_ for _ in ()).throw(AssertionError("not consulted")),
canonical, "read_registry", lambda r: {"version": "9.9.9", "status": "warm"}
)
meta = _meta(upgrade_base_version="0.7.0+3.3.1")
assert run_recipe_ci.upgrade_base(ALL, meta, "discourse") == "0.7.0+3.3.1"
def test_expected_na_upgrade_suppresses_base(monkeypatch):
# bluesky-pds shape: published versions exist but are undeployable — declared EXPECTED_NA
# upgrade → NO base (single deploy is the PR head), even though previous_version would
# return one and even if UPGRADE_BASE_VERSION is set (the declaration is the stronger,
# documented fact).
monkeypatch.setattr(lifecycle, "previous_version", lambda r: "0.1.1+v0.4")
declared = {"upgrade": "no deployable upgrade base (moving tag republished)"}
assert run_recipe_ci.upgrade_base(ALL, _meta(expected_na=declared), "bluesky-pds") is None
assert (
run_recipe_ci.upgrade_base(
ALL, _meta(expected_na=declared, upgrade_base_version="0.2.0+v0.4"), "bluesky-pds"
)
is None
plan = run_recipe_ci.resolve_upgrade_base(
ALL, _meta(expected_na=declared, upgrade_base_version="0.2.0+v0.4"), "bluesky-pds"
)
assert plan.kind == "skip" and "EXPECTED_NA" in plan.reason
def test_expected_na_other_rung_does_not_suppress(monkeypatch):
# an EXPECTED_NA for backup_restore (custom-html-tiny shape) must NOT touch the upgrade base
monkeypatch.setattr(lifecycle, "previous_version", lambda r: "0.1.0+v1")
meta = _meta(expected_na={"backup_restore": "stateless"})
assert run_recipe_ci.upgrade_base(ALL, meta, "custom-html-tiny") == "0.1.0+v1"
def test_explicit_override_wins_over_canonical(monkeypatch):
monkeypatch.setattr(
canonical, "read_registry", lambda r: {"version": "9.9.9", "status": "warm"}
)
monkeypatch.setattr(
lifecycle,
"recipe_branch_commit",
lambda r, b="main": (_ for _ in ()).throw(AssertionError("not consulted")),
)
plan = run_recipe_ci.resolve_upgrade_base(
ALL, _meta(upgrade_base_version="0.7.0+3.3.1"), "discourse"
)
assert plan.kind == "version" and plan.version == "0.7.0+3.3.1" and plan.runs
def test_last_green_warm_canonical_is_primary(monkeypatch):
# no override → last-green (warm canonical version) is the PRIMARY base; main is not consulted.
monkeypatch.setattr(
canonical, "read_registry", lambda r: {"version": "0.6.0+3.1.1", "status": "idle"}
)
monkeypatch.setattr(
lifecycle,
"recipe_branch_commit",
lambda r, b="main": (_ for _ in ()).throw(AssertionError("not consulted")),
)
plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "keycloak", head_ref=HEAD)
assert plan.kind == "version" and plan.version == "0.6.0+3.1.1" and "last-green" in plan.reason
def test_main_tip_fallback_when_no_last_green(monkeypatch):
_no_canonical(monkeypatch)
monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": MAIN)
plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "discourse", head_ref=HEAD)
assert plan.kind == "ref" and plan.ref == MAIN and plan.version is None and plan.runs
def test_head_equals_main_tip_skip(monkeypatch):
# the PR head IS the main tip (empty PR / LATEST run) → no real predecessor → declared skip.
_no_canonical(monkeypatch)
monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": HEAD)
plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "discourse", head_ref=HEAD)
assert plan.kind == "skip" and "main tip" in plan.reason and not plan.runs
def test_no_canonical_no_main_skip(monkeypatch):
_no_canonical(monkeypatch)
_no_main(monkeypatch)
plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "brandnew", head_ref=HEAD)
assert plan.kind == "skip" and "no predecessor" in plan.reason