From e9c26c72afafe6276d7accf3c7e398180491edc8 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Thu, 11 Jun 2026 17:18:43 +0000 Subject: [PATCH] =?UTF-8?q?harden(dstamp):=20assert=5Fupgrade=5Fconverged?= =?UTF-8?q?=20waits=20for=20the=20NEW=20swarm=20update=20(StartedAt=20adva?= =?UTF-8?q?nced)=20before=20accepting=20a=20terminal=20state=20=E2=80=94?= =?UTF-8?q?=20closes=20the=20Adversary-flagged=20race=20where=20a=20stale?= =?UTF-8?q?=20'completed'=20from=20the=20base=20deploy=20could=20mask=20a?= =?UTF-8?q?=20later=20rollback;=20no-op=20redeploy=20grace=20preserved?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runner/harness/generic.py | 7 +++- runner/harness/lifecycle.py | 80 +++++++++++++++++++++++++++++++------ 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/runner/harness/generic.py b/runner/harness/generic.py index 3e861b5..fcdfcfb 100644 --- a/runner/harness/generic.py +++ b/runner/harness/generic.py @@ -263,6 +263,9 @@ def perform_upgrade( # HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound. lifecycle.prepull_images(recipe, domain) + # Snapshot the app service's pre-redeploy swarm update marker so assert_upgrade_converged can + # tell the NEW rolling update apart from the install/base deploy's stale terminal state. + prev_started = lifecycle.update_status_started(domain) lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True) # Own the convergence verification (abra's monitor was skipped via -c). FIRST confirm swarm's # rolling update of the app service actually converged to the NEW (head) spec and was not @@ -270,7 +273,9 @@ def perform_upgrade( # chaos-version label while the old task keeps serving, so wait_healthy alone would pass on a # reverted-to-base spec and HC1 would misreport it as a stamp mismatch). A rollback/pause here # is a genuine upgrade failure (head did not stay healthy) — surfaced honestly, HC1 unweakened. - lifecycle.assert_upgrade_converged(domain, timeout=int(meta.DEPLOY_TIMEOUT)) + lifecycle.assert_upgrade_converged( + domain, timeout=int(meta.DEPLOY_TIMEOUT), prev_started=prev_started + ) lifecycle.wait_healthy( domain, ok_codes=tuple(meta.HEALTH_OK), diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index b59666e..d14162b 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -508,11 +508,38 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None return {"version": ver, "image": image.strip() or None, "chaos": chaos or chaos_flag} -def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 900) -> None: +def update_status_started(domain: str, service: str = "app") -> str: + """The app service's current `UpdateStatus.StartedAt` ('' if no update recorded). Captured + BEFORE the upgrade chaos redeploy so assert_upgrade_converged can tell the NEW rolling update + apart from a stale terminal state left by the install/base deploy (closes the race where + `docker stack deploy -c` returns before swarm schedules the roll).""" + name = f"{_stack_name(domain)}_{service}" + proc = subprocess.run( + ["docker", "service", "inspect", name, "--format", + "{{if .UpdateStatus}}{{.UpdateStatus.StartedAt}}{{else}}{{end}}"], + capture_output=True, + text=True, + ) + return proc.stdout.strip() + + +def assert_upgrade_converged( + domain: str, service: str = "app", timeout: int = 900, prev_started: str | None = None +) -> None: """After an in-place upgrade chaos redeploy, wait for swarm's rolling update of the app service to reach a TERMINAL state and assert it converged to the NEW (head) spec — i.e. did NOT roll back or pause. Raises on a non-converged update; returns on success / nothing-to-converge. + `prev_started` is the app service's `UpdateStatus.StartedAt` captured BEFORE the redeploy (via + update_status_started). It closes the race the Adversary flagged: `chaos_redeploy` runs + `docker stack deploy -c` which returns BEFORE swarm schedules the rolling update, so the first + poll could read a STALE terminal `completed` (from the install/base deploy) and wrongly return + OK, then miss a rollback that fires moments later. We therefore (phase 1) wait until the NEW + update is observed — `StartedAt` advances past `prev_started`, or the state is an in-flight + `updating`/`rollback_started` — before (phase 2) accepting a terminal verdict. A no-op redeploy + that triggers no update at all (StartedAt never advances within a short grace) ⇒ OK (nothing to + converge); in practice the base→head upgrade always changes the spec, so an update always fires. + WHY (dstamp attribution, direct evidence in JOURNAL-dstamp 2026-06-11): a recipe whose app service sets `deploy.update_config.failure_action: rollback` with `order: start-first` (e.g. discourse) will, when the NEW task fails swarm's update monitor (e.g. a precompile/Rails-heavy @@ -533,22 +560,51 @@ def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 9 (fresh service or a no-op redeploy that performed no update) ⇒ OK (nothing to converge). While `updating`/`rollback_started` (in flight) keep waiting up to `timeout`.""" name = f"{_stack_name(domain)}_{service}" - fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{else}}none{{end}}" - deadline = time.time() + timeout - last = None - while time.time() < deadline: + fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}|{{.UpdateStatus.StartedAt}}{{else}}none|{{end}}" + terminal_ok = ("completed",) + terminal_fail = ("rollback_completed", "rollback_paused", "paused") + + def _poll() -> tuple[str, str]: proc = subprocess.run( ["docker", "service", "inspect", name, "--format", fmt], capture_output=True, text=True, ) - state = proc.stdout.strip() - last = state - if state in ("", "none", "completed"): - if state == "completed": - print(f" upgrade-converged: {name} swarm UpdateStatus=completed", flush=True) + state, _, started = proc.stdout.strip().partition("|") + return state, started + + deadline = time.time() + timeout + prev_started = prev_started or "" + # Phase 1: confirm the NEW rolling update has actually been scheduled (don't trust a stale + # terminal state left by the install/base deploy). Short grace: if no update fires, it's a + # no-op redeploy (spec unchanged) → nothing to converge. + grace = time.time() + 30 + observed_new = False + while time.time() < deadline: + state, started = _poll() + if started and started != prev_started: + observed_new = True + break + if state in ("updating", "rollback_started"): + observed_new = True + break + if time.time() > grace: + print( + f" upgrade-converged: {name} no swarm update scheduled within grace " + f"(no-op redeploy, spec unchanged) — nothing to converge", + flush=True, + ) return - if state in ("rollback_completed", "rollback_paused", "paused"): + time.sleep(2) + # Phase 2: wait for the (now-confirmed-new) update to reach a terminal state. + last = None + while time.time() < deadline: + state, _ = _poll() + last = state + if state in terminal_ok: + print(f" upgrade-converged: {name} swarm UpdateStatus=completed", flush=True) + return + if state in terminal_fail: raise RuntimeError( f"{domain}: upgrade redeploy did NOT converge to the head spec — swarm " f"UpdateStatus={state!r}. The recipe's app service uses update_config " @@ -560,7 +616,7 @@ def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 9 time.sleep(5) raise RuntimeError( f"{domain}: upgrade redeploy update did not reach a terminal swarm state within {timeout}s " - f"(last UpdateStatus={last!r}) — treating as a non-converged upgrade." + f"(observed_new={observed_new}, last UpdateStatus={last!r}) — non-converged upgrade." )