From e9c26c72afafe6276d7accf3c7e398180491edc8 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@noreply.git.autonomic.zone>
Date: Thu, 11 Jun 2026 17:18:43 +0000
Subject: [PATCH] =?UTF-8?q?harden(dstamp):=20assert=5Fupgrade=5Fconverged?=
 =?UTF-8?q?=20waits=20for=20the=20NEW=20swarm=20update=20(StartedAt=20adva?=
 =?UTF-8?q?nced)=20before=20accepting=20a=20terminal=20state=20=E2=80=94?=
 =?UTF-8?q?=20closes=20the=20Adversary-flagged=20race=20where=20a=20stale?=
 =?UTF-8?q?=20'completed'=20from=20the=20base=20deploy=20could=20mask=20a?=
 =?UTF-8?q?=20later=20rollback;=20no-op=20redeploy=20grace=20preserved?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 runner/harness/generic.py   |  7 +++-
 runner/harness/lifecycle.py | 80 +++++++++++++++++++++++++++++++------
 2 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/runner/harness/generic.py b/runner/harness/generic.py
index 3e861b5..fcdfcfb 100644
--- a/runner/harness/generic.py
+++ b/runner/harness/generic.py
@@ -263,6 +263,9 @@ def perform_upgrade(
     # HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned
     # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
     lifecycle.prepull_images(recipe, domain)
+    # Snapshot the app service's pre-redeploy swarm update marker so assert_upgrade_converged can
+    # tell the NEW rolling update apart from the install/base deploy's stale terminal state.
+    prev_started = lifecycle.update_status_started(domain)
     lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
     # Own the convergence verification (abra's monitor was skipped via -c). FIRST confirm swarm's
     # rolling update of the app service actually converged to the NEW (head) spec and was not
@@ -270,7 +273,9 @@ def perform_upgrade(
     # chaos-version label while the old task keeps serving, so wait_healthy alone would pass on a
     # reverted-to-base spec and HC1 would misreport it as a stamp mismatch). A rollback/pause here
     # is a genuine upgrade failure (head did not stay healthy) — surfaced honestly, HC1 unweakened.
-    lifecycle.assert_upgrade_converged(domain, timeout=int(meta.DEPLOY_TIMEOUT))
+    lifecycle.assert_upgrade_converged(
+        domain, timeout=int(meta.DEPLOY_TIMEOUT), prev_started=prev_started
+    )
     lifecycle.wait_healthy(
         domain,
         ok_codes=tuple(meta.HEALTH_OK),
diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index b59666e..d14162b 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -508,11 +508,38 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None
     return {"version": ver, "image": image.strip() or None, "chaos": chaos or chaos_flag}
 
 
-def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 900) -> None:
+def update_status_started(domain: str, service: str = "app") -> str:
+    """The app service's current `UpdateStatus.StartedAt` ('' if no update recorded). Captured
+    BEFORE the upgrade chaos redeploy so assert_upgrade_converged can tell the NEW rolling update
+    apart from a stale terminal state left by the install/base deploy (closes the race where
+    `docker stack deploy -c` returns before swarm schedules the roll)."""
+    name = f"{_stack_name(domain)}_{service}"  # service name handed to `docker service inspect`
+    proc = subprocess.run(
+        ["docker", "service", "inspect", name, "--format",
+         "{{if .UpdateStatus}}{{.UpdateStatus.StartedAt}}{{else}}{{end}}"],  # '' when unset
+        capture_output=True,
+        text=True,
+    )
+    return proc.stdout.strip()  # exit code unchecked; a failed inspect presumably returns ''
+
+
+def assert_upgrade_converged(
+    domain: str, service: str = "app", timeout: int = 900, prev_started: str | None = None
+) -> None:
     """After an in-place upgrade chaos redeploy, wait for swarm's rolling update of the app service
     to reach a TERMINAL state and assert it converged to the NEW (head) spec — i.e. did NOT roll
     back or pause. Raises on a non-converged update; returns on success / nothing-to-converge.
 
+    `prev_started` is the app service's `UpdateStatus.StartedAt` captured BEFORE the redeploy (via
+    update_status_started). It closes the race the Adversary flagged: `chaos_redeploy` runs
+    `docker stack deploy -c` which returns BEFORE swarm schedules the rolling update, so the first
+    poll could read a STALE terminal `completed` (from the install/base deploy) and wrongly return
+    OK, then miss a rollback that fires moments later. We therefore (phase 1) wait until the NEW
+    update is observed — `StartedAt` advances past `prev_started`, or the state is an in-flight
+    `updating`/`rollback_started` — before (phase 2) accepting a terminal verdict. A no-op redeploy
+    that triggers no update at all (StartedAt never advances within a short grace) ⇒ OK (nothing to
+    converge); in practice the base→head upgrade always changes the spec, so an update always fires.
+
     WHY (dstamp attribution, direct evidence in JOURNAL-dstamp 2026-06-11): a recipe whose app
     service sets `deploy.update_config.failure_action: rollback` with `order: start-first` (e.g.
     discourse) will, when the NEW task fails swarm's update monitor (e.g. a precompile/Rails-heavy
@@ -533,22 +560,51 @@ def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 9
     (fresh service or a no-op redeploy that performed no update) ⇒ OK (nothing to converge). While
     `updating`/`rollback_started` (in flight) keep waiting up to `timeout`."""
     name = f"{_stack_name(domain)}_{service}"
-    fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{else}}none{{end}}"
-    deadline = time.time() + timeout
-    last = None
-    while time.time() < deadline:
+    fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}|{{.UpdateStatus.StartedAt}}{{else}}none|{{end}}"
+    terminal_ok = ("completed",)
+    terminal_fail = ("rollback_completed", "rollback_paused", "paused")
+
+    def _poll() -> tuple[str, str]:
         proc = subprocess.run(
             ["docker", "service", "inspect", name, "--format", fmt],
             capture_output=True,
             text=True,
         )
-        state = proc.stdout.strip()
-        last = state
-        if state in ("", "none", "completed"):
-            if state == "completed":
-                print(f"  upgrade-converged: {name} swarm UpdateStatus=completed", flush=True)
+        state, _, started = proc.stdout.strip().partition("|")
+        return state, started
+
+    deadline = time.time() + timeout
+    prev_started = prev_started or ""
+    # Phase 1: confirm the NEW rolling update has actually been scheduled (don't trust a stale
+    # terminal state left by the install/base deploy). Short grace: if no update fires, it's a
+    # no-op redeploy (spec unchanged) → nothing to converge.
+    grace = time.time() + 30
+    observed_new = False
+    while time.time() < deadline:
+        state, started = _poll()
+        if started and started != prev_started:
+            observed_new = True
+            break
+        if state in ("updating", "rollback_started"):
+            observed_new = True
+            break
+        if time.time() > grace:
+            print(
+                f"  upgrade-converged: {name} no swarm update scheduled within grace "
+                f"(no-op redeploy, spec unchanged) — nothing to converge",
+                flush=True,
+            )
             return
-        if state in ("rollback_completed", "rollback_paused", "paused"):
+        time.sleep(2)
+    # Phase 2: wait for the (now-confirmed-new) update to reach a terminal state.
+    last = None
+    while time.time() < deadline:
+        state, _ = _poll()
+        last = state
+        if state in terminal_ok:
+            print(f"  upgrade-converged: {name} swarm UpdateStatus=completed", flush=True)
+            return
+        if state in terminal_fail:
             raise RuntimeError(
                 f"{domain}: upgrade redeploy did NOT converge to the head spec — swarm "
                 f"UpdateStatus={state!r}. The recipe's app service uses update_config "
@@ -560,7 +616,7 @@ def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 9
         time.sleep(5)
     raise RuntimeError(
         f"{domain}: upgrade redeploy update did not reach a terminal swarm state within {timeout}s "
-        f"(last UpdateStatus={last!r}) — treating as a non-converged upgrade."
+        f"(observed_new={observed_new}, last UpdateStatus={last!r}) — non-converged upgrade."
     )