def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 900) -> None:
    """Wait for swarm's rolling update of ``service`` to finish and assert it converged.

    Called after an in-place upgrade chaos redeploy. Polls
    ``docker service inspect`` for the service's ``UpdateStatus.State`` until it reaches a
    terminal state, then asserts the update converged to the NEW (head) spec, i.e. that swarm
    did NOT roll back or pause.

    Why this exists (dstamp attribution, direct evidence in JOURNAL-dstamp 2026-06-11): a recipe
    whose app service sets ``deploy.update_config.failure_action: rollback`` with
    ``order: start-first`` (e.g. discourse) will, when the NEW task fails swarm's update monitor
    (e.g. a precompile/Rails-heavy app OOMing under start-first's 2x old+new co-residency),
    roll back and revert the service to its PREVIOUS spec, including the chaos-version label.
    Under start-first the OLD task keeps serving, so ``wait_healthy`` still passes; the reverted
    spec then makes HC1 read the BASE commit and misreport it as "the re-checkout to the code
    under test failed". This check makes a rollback/pause visible and fails the upgrade
    honestly, without weakening HC1: the underlying commit match is unchanged. abra's own
    monitor (``-c``) is skipped for the upgrade redeploy, so the harness must own this
    convergence check itself.

    State handling:
      * ``completed``: converged, return.
      * empty / ``none`` (fresh service, or a no-op redeploy that performed no update):
        nothing to converge, return.
      * ``rollback_completed`` / ``rollback_paused`` / ``paused``: the new task failed the
        update monitor and the running spec is not the code under test, so raise.
      * anything else (``updating``, ``rollback_started``, ...): still in flight, keep polling.

    Args:
        domain: App domain; the swarm service name is derived from it via ``_stack_name``.
        service: Service within the stack whose update is checked.
        timeout: Maximum seconds to wait for a terminal state. The state is always inspected
            at least once, even when ``timeout`` is zero or negative.

    Raises:
        RuntimeError: If the update rolled back or paused, if no terminal state was reached
            within ``timeout``, or if ``docker service inspect`` itself keeps failing (a failed
            inspect prints nothing on stdout and must not be mistaken for "nothing to
            converge").
    """
    name = f"{_stack_name(domain)}_{service}"
    # Emit the literal "none" when the service has no UpdateStatus at all, so a successful
    # inspect of a never-updated service is distinguishable from a failed inspect.
    fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{else}}none{{end}}"
    ok_states = ("", "none", "completed")
    failed_states = ("rollback_completed", "rollback_paused", "paused")
    poll_interval = 5  # seconds between inspections
    max_inspect_failures = 3  # consecutive non-zero exits tolerated as transient

    # Monotonic clock: the deadline must not move if the wall clock is stepped mid-wait.
    deadline = time.monotonic() + timeout
    last = None
    inspect_failures = 0
    while True:
        proc = subprocess.run(
            ["docker", "service", "inspect", name, "--format", fmt],
            capture_output=True,
            text=True,
        )
        if proc.returncode != 0:
            # stdout is empty on failure; without this guard it would hit the "" OK branch
            # below and report a converged upgrade that was never actually observed.
            inspect_failures += 1
            if inspect_failures >= max_inspect_failures:
                detail = proc.stderr.strip() or f"exit code {proc.returncode}"
                raise RuntimeError(
                    f"{domain}: could not verify upgrade convergence — `docker service inspect "
                    f"{name}` failed {inspect_failures} times in a row ({detail})."
                )
        else:
            inspect_failures = 0
            state = proc.stdout.strip()
            last = state
            if state in ok_states:
                if state == "completed":
                    print(f" upgrade-converged: {name} swarm UpdateStatus=completed", flush=True)
                return
            if state in failed_states:
                raise RuntimeError(
                    f"{domain}: upgrade redeploy did NOT converge to the head spec — swarm "
                    f"UpdateStatus={state!r}. The recipe's app service uses update_config "
                    f"failure_action=rollback/pause; the NEW (head) task failed swarm's update monitor, "
                    f"so the service reverted/paused and the RUNNING spec is the previous version, not "
                    f"the code under test. This is a real upgrade failure (the head did not stay "
                    f"healthy under the deploy), surfaced honestly — not a stamp mismatch."
                )
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the final poll lands on it.
        time.sleep(min(poll_interval, remaining))
    raise RuntimeError(
        f"{domain}: upgrade redeploy update did not reach a terminal swarm state within {timeout}s "
        f"(last UpdateStatus={last!r}) — treating as a non-converged upgrade."
    )
(swarm # ignores depends_on, so the dangling ref has zero runtime effect — a recipe lint nit, not a defect.) +# +# 3. UPGRADE ROLLOUT (dstamp 2026-06-11, direct-evidence attribution in JOURNAL-dstamp): the +# published app service sets `deploy.update_config: { failure_action: rollback, order: +# start-first }`. On the upgrade chaos redeploy (base 0.7.0 → PR head), start-first runs the OLD +# and NEW precompile/Rails-heavy discourse tasks CO-RESIDENT (~2x memory); under host memory +# pressure the NEW task intermittently OOMs/fails swarm's update monitor → `failure_action: +# rollback` reverts the app service to its PREVIOUS spec, INCLUDING the +# `coop-cloud.${STACK_NAME}.chaos-version` label (head → base). Because start-first keeps the OLD task +# serving, wait_healthy still passes, and HC1 then reads the reverted BASE commit (eb96de9+U) and +# misreports it as 'the re-checkout failed' — the dstamp drift, reproduced solo (runs +# dstamp-repro1/4) with `.Spec.chaos-version=7ae7b0f7+U` (head applied) flipping to +# `.PreviousSpec=eb96de94+U` after the rollback. FIX: `order: stop-first` so the NEW task boots +# with the full host memory (no 2x co-residency) and genuinely becomes healthy → no spurious +# rollback. This is a CI deploy-rollout tweak only: the upgrade still really deploys + asserts the +# PR-head code under test, and `failure_action: rollback` is LEFT intact, so a genuinely broken +# head still rolls back and is caught (lifecycle.assert_upgrade_converged) — NO test is weakened. +# Trade-off: brief real downtime during the CI upgrade (covered by DEPLOY_TIMEOUT 3600). services: app: image: bitnamilegacy/discourse:3.3.1 healthcheck: start_period: 20m + deploy: + update_config: + order: stop-first sidekiq: image: bitnamilegacy/discourse:3.3.1