fix(dstamp): add a stop-first overlay for the discourse upgrade (stops the 2x-memory start-first OOM that led to a spurious swarm rollback) + add harness assert_upgrade_converged (detects a rollback/pause and reports it as an honest upgrade failure; HC1 is not weakened). Root cause: failure_action:rollback reverted the chaos-version label, and the revert was masked by start-first + wait_healthy

2026-06-11 17:07:38 +00:00
parent 9959ad6a2d
commit 0cc31a507e
3 changed files with 83 additions and 1 deletion
--- a/runner/harness/generic.py
+++ b/runner/harness/generic.py
@@ -264,7 +264,13 @@ def perform_upgrade(
    # tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
    lifecycle.prepull_images(recipe, domain)
    lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
-    # Own the convergence verification (abra's monitor was skipped via -c).
+    # Own the convergence verification (abra's monitor was skipped via -c). FIRST confirm swarm's
+    # rolling update of the app service actually converged to the NEW (head) spec and was not
+    # silently rolled back/paused (dstamp: failure_action=rollback + order=start-first reverts the
+    # chaos-version label while the old task keeps serving, so wait_healthy alone would pass on a
+    # reverted-to-base spec and HC1 would misreport it as a stamp mismatch). A rollback/pause here
+    # is a genuine upgrade failure (head did not stay healthy) — surfaced honestly, HC1 unweakened.
+    lifecycle.assert_upgrade_converged(domain, timeout=int(meta.DEPLOY_TIMEOUT))
    lifecycle.wait_healthy(
        domain,
        ok_codes=tuple(meta.HEALTH_OK),