harden(dstamp): assert_upgrade_converged waits for the NEW swarm update (StartedAt advanced) before accepting a terminal state — closes the Adversary-flagged race where a stale 'completed' from the base deploy could mask a later rollback; no-op redeploy grace preserved
Some checks failed
continuous-integration/drone/push Build is failing
Some checks failed
continuous-integration/drone/push Build is failing
This commit is contained in:
@ -263,6 +263,9 @@ def perform_upgrade(
|
||||
# HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned
|
||||
# tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
|
||||
lifecycle.prepull_images(recipe, domain)
|
||||
# Snapshot the app service's pre-redeploy swarm update marker so assert_upgrade_converged can
|
||||
# tell the NEW rolling update apart from the install/base deploy's stale terminal state.
|
||||
prev_started = lifecycle.update_status_started(domain)
|
||||
lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
|
||||
# Own the convergence verification (abra's monitor was skipped via -c). FIRST confirm swarm's
|
||||
# rolling update of the app service actually converged to the NEW (head) spec and was not
|
||||
@ -270,7 +273,9 @@ def perform_upgrade(
|
||||
# chaos-version label while the old task keeps serving, so wait_healthy alone would pass on a
|
||||
# reverted-to-base spec and HC1 would misreport it as a stamp mismatch). A rollback/pause here
|
||||
# is a genuine upgrade failure (head did not stay healthy) — surfaced honestly, HC1 unweakened.
|
||||
lifecycle.assert_upgrade_converged(domain, timeout=int(meta.DEPLOY_TIMEOUT))
|
||||
lifecycle.assert_upgrade_converged(
|
||||
domain, timeout=int(meta.DEPLOY_TIMEOUT), prev_started=prev_started
|
||||
)
|
||||
lifecycle.wait_healthy(
|
||||
domain,
|
||||
ok_codes=tuple(meta.HEALTH_OK),
|
||||
|
||||
@ -508,11 +508,38 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None
|
||||
return {"version": ver, "image": image.strip() or None, "chaos": chaos or chaos_flag}
|
||||
|
||||
|
||||
def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 900) -> None:
|
||||
def update_status_started(domain: str, service: str = "app") -> str:
    """Return the service's current swarm `UpdateStatus.StartedAt` marker.

    Captured BEFORE the upgrade chaos redeploy so assert_upgrade_converged can tell the NEW
    rolling update apart from a stale terminal state left by the install/base deploy (closes
    the race where `docker stack deploy -c` returns before swarm schedules the roll).

    Args:
        domain: App domain; mapped to the swarm stack name via `_stack_name`.
        service: Service name inside the stack; the inspected service is `<stack>_<service>`.

    Returns:
        The stripped `StartedAt` text printed by `docker service inspect`, or '' when the
        service has no UpdateStatus recorded or the inspect command itself failed.
    """
    name = f"{_stack_name(domain)}_{service}"
    proc = subprocess.run(
        [
            "docker", "service", "inspect", name, "--format",
            "{{if .UpdateStatus}}{{.UpdateStatus.StartedAt}}{{else}}{{end}}",
        ],
        capture_output=True,
        text=True,
        check=False,  # best-effort snapshot: a failed inspect must not abort the upgrade
    )
    if proc.returncode != 0:
        # An empty marker makes the caller treat ANY existing StartedAt as "new", which
        # weakens the stale-terminal-state guard. Keep the best-effort '' return, but say so
        # loudly instead of masquerading as "no update recorded", and never pass whatever a
        # failed inspect printed on stdout through as a marker.
        print(
            f" update-status-started: docker service inspect {name} failed "
            f"(rc={proc.returncode}): {proc.stderr.strip()!r} — proceeding without a "
            f"pre-redeploy StartedAt marker",
            flush=True,
        )
        return ""
    return proc.stdout.strip()
|
||||
|
||||
|
||||
def assert_upgrade_converged(
|
||||
domain: str, service: str = "app", timeout: int = 900, prev_started: str | None = None
|
||||
) -> None:
|
||||
"""After an in-place upgrade chaos redeploy, wait for swarm's rolling update of the app service
|
||||
to reach a TERMINAL state and assert it converged to the NEW (head) spec — i.e. did NOT roll
|
||||
back or pause. Raises on a non-converged update; returns on success / nothing-to-converge.
|
||||
|
||||
`prev_started` is the app service's `UpdateStatus.StartedAt` captured BEFORE the redeploy (via
|
||||
update_status_started). It closes the race the Adversary flagged: `chaos_redeploy` runs
|
||||
`docker stack deploy -c` which returns BEFORE swarm schedules the rolling update, so the first
|
||||
poll could read a STALE terminal `completed` (from the install/base deploy) and wrongly return
|
||||
OK, then miss a rollback that fires moments later. We therefore (phase 1) wait until the NEW
|
||||
update is observed — `StartedAt` advances past `prev_started`, or the state is an in-flight
|
||||
`updating`/`rollback_started` — before (phase 2) accepting a terminal verdict. A no-op redeploy
|
||||
that triggers no update at all (StartedAt never advances within a short grace) ⇒ OK (nothing to
|
||||
converge); in practice the base→head upgrade always changes the spec, so an update always fires.
|
||||
|
||||
WHY (dstamp attribution, direct evidence in JOURNAL-dstamp 2026-06-11): a recipe whose app
|
||||
service sets `deploy.update_config.failure_action: rollback` with `order: start-first` (e.g.
|
||||
discourse) will, when the NEW task fails swarm's update monitor (e.g. a precompile/Rails-heavy
|
||||
@ -533,22 +560,51 @@ def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 9
|
||||
(fresh service or a no-op redeploy that performed no update) ⇒ OK (nothing to converge). While
|
||||
`updating`/`rollback_started` (in flight) keep waiting up to `timeout`."""
|
||||
name = f"{_stack_name(domain)}_{service}"
|
||||
fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{else}}none{{end}}"
|
||||
deadline = time.time() + timeout
|
||||
last = None
|
||||
while time.time() < deadline:
|
||||
fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}|{{.UpdateStatus.StartedAt}}{{else}}none|{{end}}"
|
||||
terminal_ok = ("completed",)
|
||||
terminal_fail = ("rollback_completed", "rollback_paused", "paused")
|
||||
|
||||
def _poll() -> tuple[str, str]:
|
||||
proc = subprocess.run(
|
||||
["docker", "service", "inspect", name, "--format", fmt],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
state = proc.stdout.strip()
|
||||
last = state
|
||||
if state in ("", "none", "completed"):
|
||||
if state == "completed":
|
||||
print(f" upgrade-converged: {name} swarm UpdateStatus=completed", flush=True)
|
||||
state, _, started = proc.stdout.strip().partition("|")
|
||||
return state, started
|
||||
|
||||
deadline = time.time() + timeout
|
||||
prev_started = prev_started or ""
|
||||
# Phase 1: confirm the NEW rolling update has actually been scheduled (don't trust a stale
|
||||
# terminal state left by the install/base deploy). Short grace: if no update fires, it's a
|
||||
# no-op redeploy (spec unchanged) → nothing to converge.
|
||||
grace = time.time() + 30
|
||||
observed_new = False
|
||||
while time.time() < deadline:
|
||||
state, started = _poll()
|
||||
if started and started != prev_started:
|
||||
observed_new = True
|
||||
break
|
||||
if state in ("updating", "rollback_started"):
|
||||
observed_new = True
|
||||
break
|
||||
if time.time() > grace:
|
||||
print(
|
||||
f" upgrade-converged: {name} no swarm update scheduled within grace "
|
||||
f"(no-op redeploy, spec unchanged) — nothing to converge",
|
||||
flush=True,
|
||||
)
|
||||
return
|
||||
if state in ("rollback_completed", "rollback_paused", "paused"):
|
||||
time.sleep(2)
|
||||
# Phase 2: wait for the (now-confirmed-new) update to reach a terminal state.
|
||||
last = None
|
||||
while time.time() < deadline:
|
||||
state, _ = _poll()
|
||||
last = state
|
||||
if state in terminal_ok:
|
||||
print(f" upgrade-converged: {name} swarm UpdateStatus=completed", flush=True)
|
||||
return
|
||||
if state in terminal_fail:
|
||||
raise RuntimeError(
|
||||
f"{domain}: upgrade redeploy did NOT converge to the head spec — swarm "
|
||||
f"UpdateStatus={state!r}. The recipe's app service uses update_config "
|
||||
@ -560,7 +616,7 @@ def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 9
|
||||
time.sleep(5)
|
||||
raise RuntimeError(
|
||||
f"{domain}: upgrade redeploy update did not reach a terminal swarm state within {timeout}s "
|
||||
f"(last UpdateStatus={last!r}) — treating as a non-converged upgrade."
|
||||
f"(observed_new={observed_new}, last UpdateStatus={last!r}) — non-converged upgrade."
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user