fix(dstamp): discourse upgrade stop-first overlay (stop 2x-memory start-first OOM→spurious swarm rollback) + harness assert_upgrade_converged (detect rollback/pause → honest upgrade failure, HC1 unweakened). Root cause: failure_action:rollback reverted chaos-version label, masked by start-first+wait_healthy
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
@ -264,7 +264,13 @@ def perform_upgrade(
|
||||
# tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
|
||||
lifecycle.prepull_images(recipe, domain)
|
||||
lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
|
||||
# Own the convergence verification (abra's monitor was skipped via -c).
|
||||
# Own the convergence verification (abra's monitor was skipped via -c). FIRST confirm swarm's
|
||||
# rolling update of the app service actually converged to the NEW (head) spec and was not
|
||||
# silently rolled back/paused (dstamp: failure_action=rollback + order=start-first reverts the
|
||||
# chaos-version label while the old task keeps serving, so wait_healthy alone would pass on a
|
||||
# reverted-to-base spec and HC1 would misreport it as a stamp mismatch). A rollback/pause here
|
||||
# is a genuine upgrade failure (head did not stay healthy) — surfaced honestly, HC1 unweakened.
|
||||
lifecycle.assert_upgrade_converged(domain, timeout=int(meta.DEPLOY_TIMEOUT))
|
||||
lifecycle.wait_healthy(
|
||||
domain,
|
||||
ok_codes=tuple(meta.HEALTH_OK),
|
||||
|
||||
@ -508,6 +508,62 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None
|
||||
return {"version": ver, "image": image.strip() or None, "chaos": chaos or chaos_flag}
|
||||
|
||||
|
||||
def assert_upgrade_converged(domain: str, service: str = "app", timeout: int = 900) -> None:
    """Wait for swarm's rolling update of a service to finish and assert it reached the NEW spec.

    After an in-place upgrade chaos redeploy, poll `docker service inspect` until the service's
    UpdateStatus reaches a TERMINAL state, and assert that it converged to the NEW (head) spec —
    i.e. did NOT roll back or pause. Returns on success / nothing-to-converge; raises otherwise.

    WHY (dstamp attribution, direct evidence in JOURNAL-dstamp 2026-06-11): a recipe whose app
    service sets `deploy.update_config.failure_action: rollback` with `order: start-first` (e.g.
    discourse) will, when the NEW task fails swarm's update monitor (e.g. a precompile/Rails-heavy
    app OOMing under start-first's 2x old+new co-residency), execute the rollback and revert the
    service to its PREVIOUS spec — INCLUDING the `coop-cloud.<stack>.chaos-version` label. Under
    start-first the OLD task keeps serving, so `wait_healthy` still passes; the reverted spec then
    makes HC1 read the BASE commit and misreport it as a stamp mismatch. This check makes a
    rollback/pause VISIBLE and fails the upgrade HONESTLY, WITHOUT weakening HC1: the underlying
    commit match is unchanged. abra's own monitor (`-c`) was skipped for the upgrade redeploy, so
    the harness must own this convergence check itself.

    States:
      * `completed` — OK.
      * Empty/`none` UpdateStatus from a SUCCESSFUL inspect (fresh service or a no-op redeploy
        that performed no update) — OK, nothing to converge.
      * `rollback_completed` / `rollback_paused` / `paused` — FAIL: the new task failed the
        monitor; the running spec is not the code under test.
      * Anything else (e.g. `updating`, `rollback_started`) — in flight; keep polling.

    A FAILED inspect (non-zero exit) is never treated as "nothing to converge": its empty stdout
    would otherwise be indistinguishable from a no-op update and silently pass the check. Failed
    inspects are retried until the deadline and the last error is included in the timeout message.

    Args:
        domain: App domain; used to derive the stack name and in error messages.
        service: Service name within the stack (suffix of `<stack>_<service>`).
        timeout: Maximum number of seconds to wait for a terminal state. The service is always
            inspected at least once, even when `timeout` is zero or negative.

    Raises:
        RuntimeError: If the update rolled back or paused, or if no terminal state was observed
            within `timeout` seconds.
    """
    name = f"{_stack_name(domain)}_{service}"
    fmt = "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{else}}none{{end}}"
    converged_states = ("", "none", "completed")
    failed_states = ("rollback_completed", "rollback_paused", "paused")
    poll_interval = 5

    # Monotonic clock: the deadline must not move if the wall clock is adjusted mid-wait.
    deadline = time.monotonic() + timeout
    last = None
    last_error = None

    while True:
        proc = subprocess.run(
            ["docker", "service", "inspect", name, "--format", fmt],
            capture_output=True,
            text=True,
        )
        if proc.returncode != 0:
            # Inspect itself failed: stdout carries no state, so do NOT interpret it as
            # "no update" — remember the error and retry until the deadline.
            last_error = proc.stderr.strip() or f"exit code {proc.returncode}"
        else:
            last_error = None
            state = proc.stdout.strip()
            last = state
            if state in converged_states:
                if state == "completed":
                    print(f"  upgrade-converged: {name} swarm UpdateStatus=completed", flush=True)
                return
            if state in failed_states:
                raise RuntimeError(
                    f"{domain}: upgrade redeploy did NOT converge to the head spec — swarm "
                    f"UpdateStatus={state!r}. The recipe's app service uses update_config "
                    f"failure_action=rollback/pause; the NEW (head) task failed swarm's update monitor, "
                    f"so the service reverted/paused and the RUNNING spec is the previous version, not "
                    f"the code under test. This is a real upgrade failure (the head did not stay "
                    f"healthy under the deploy), surfaced honestly — not a stamp mismatch."
                )

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))

    detail = f"; last `docker service inspect` error: {last_error}" if last_error else ""
    raise RuntimeError(
        f"{domain}: upgrade redeploy update did not reach a terminal swarm state within {timeout}s "
        f"(last UpdateStatus={last!r}{detail}) — treating as a non-converged upgrade."
    )
|
||||
|
||||
|
||||
def upgrade_app(domain: str, version: str | None = None) -> None:
    """Upgrade the app deployed at `domain` by delegating to `abra.upgrade`.

    Args:
        domain: App domain, passed through to `abra.upgrade` unchanged.
        version: Target version, forwarded as the `version` keyword. When None, the choice of
            target is presumably left to abra — confirm against `abra.upgrade`.
    """
    abra.upgrade(domain, version=version)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user