harden(dstamp): assert_upgrade_converged waits for the NEW swarm update (StartedAt advanced) before accepting a terminal state — closes the Adversary-flagged race where a stale 'completed' from the base deploy could mask a later rollback; no-op redeploy grace preserved

2026-06-11 17:18:43 +00:00
parent a4c0dfcf11
commit e9c26c72af
2 changed files with 74 additions and 13 deletions
--- a/runner/harness/generic.py
+++ b/runner/harness/generic.py
@@ -263,6 +263,9 @@ def perform_upgrade(
    # HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned
    # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
    lifecycle.prepull_images(recipe, domain)
+    # Snapshot the app service's pre-redeploy swarm update marker so assert_upgrade_converged can
+    # tell the NEW rolling update apart from the install/base deploy's stale terminal state.
+    prev_started = lifecycle.update_status_started(domain)
    lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
    # Own the convergence verification (abra's monitor was skipped via -c). FIRST confirm swarm's
    # rolling update of the app service actually converged to the NEW (head) spec and was not
@@ -270,7 +273,9 @@ def perform_upgrade(
    # chaos-version label while the old task keeps serving, so wait_healthy alone would pass on a
    # reverted-to-base spec and HC1 would misreport it as a stamp mismatch). A rollback/pause here
    # is a genuine upgrade failure (head did not stay healthy) — surfaced honestly, HC1 unweakened.
-    lifecycle.assert_upgrade_converged(domain, timeout=int(meta.DEPLOY_TIMEOUT))
+    lifecycle.assert_upgrade_converged(
+        domain, timeout=int(meta.DEPLOY_TIMEOUT), prev_started=prev_started
+    )
    lifecycle.wait_healthy(
        domain,
        ok_codes=tuple(meta.HEALTH_OK),