From 68ef0f84fb4117d998a8955a7224ff138254fbe1 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Tue, 9 Jun 2026 22:10:55 +0000 Subject: [PATCH] fix(harness): convergence must span stop-first rolling updates (immich 238 backup 409) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit services_converged() accepted N/N replicas as converged — but a chaos redeploy that changes a non-app service image (immich PR #2 moves the db to the vectorchord pin) registers a stop-first rolling update that swarm may not have STARTED yet: the OLD task still shows 1/1, the wait passes, and the task dies seconds later. Build 238: backupbot resolved the db hook container, the task was killed in the gap, and the pre-hook exec crashed the whole backup with a 409 -> no dump in the snapshot -> restore had nothing -> RED. - services_converged() now also requires every service's swarm UpdateStatus to be settled ('', completed, rollback_completed) — updating/paused/rollback in flight is NOT converged. Strictly stricter: no gate is weakened. - backup_app() gains a bounded (300s) settle-wait before 'abra app backup create' as defence in depth; on timeout the backup still runs and the tier's assertion delivers the verdict. lint: PASS, unit tests: 138 passed. --- runner/harness/lifecycle.py | 42 ++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 9098c89..6ba8b64 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -340,18 +340,22 @@ def _stack_name(domain: str) -> str: def services_converged(domain: str) -> bool: - """True when every service in the stack reports replicas N/N (N>0).""" + """True when every service in the stack reports replicas N/N (N>0) AND no service is + mid-rolling-update (swarm UpdateStatus settled).""" stack = _stack_name(domain) proc = subprocess.run( - ["docker", "stack", "services", stack, "--format", "{{.Replicas}}"], + ["docker", "stack", "services", stack, "--format", "{{.Name}} {{.Replicas}}"], capture_output=True, text=True, ) rows = [r for r in proc.stdout.split("\n") if r.strip()] if not rows: return False + names = [] for r in rows: - cur, _, want = r.partition("/") + name, _, replicas = r.partition(" ") + names.append(name) + cur, _, want = replicas.partition("/") # A service at its DESIRED replica count is converged — including a `replicas: 0` # on-demand one-shot (e.g. lasuite-drive's `minio-createbuckets`, which is scaled up # manually only when buckets need (re)creating), which reports "0/0". The earlier @@ -360,6 +364,28 @@ def services_converged(domain: str) -> bool: # still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged. if not want or cur != want: return False + # N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that changes + # a non-app service image (e.g. immich's db pin) registers the update immediately, but swarm may + # not have cycled that service's task yet — the OLD task still shows 1/1, then dies seconds later + # (immich CI 238: backupbot exec'd the db pre-hook into the just-killed container → 409). Require + # every service's UpdateStatus to be settled too, so the wait spans the whole rolling update. + proc = subprocess.run( + [ + "docker", + "service", + "inspect", + *names, + "--format", + "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}", + ], + capture_output=True, + text=True, + ) + if proc.returncode != 0: + return False # a service vanished mid-check — not settled + for state in proc.stdout.split("\n"): + if state.strip() not in ("", "completed", "rollback_completed"): + return False return True @@ -572,6 +598,16 @@ def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None: def backup_app(domain: str) -> str: """Create a backup; return the abra/restic output (carries the produced snapshot_id).""" + # Never back up a stack that is still converging/rolling-updating: backupbot resolves each + # service's hook container ONCE up front, so a task that cycles between that lookup and the + # pre-hook exec crashes the whole backup with a 409 (immich CI 238). Bounded wait — on timeout + # we still attempt the backup and let the tier's assertion deliver the verdict. + deadline = time.time() + 300 + while time.time() < deadline and not services_converged(domain): + print( + f" backup: {domain} stack not settled yet — waiting before backup create", flush=True + ) + time.sleep(5) return abra.backup_create(domain)