# runner/harness/lifecycle.py — services_converged() and backup_app() as they read with the
# "wait for the rolling update to settle before backing up" change applied. Other definitions
# of the module (including those that sit between these two functions) are not shown here.


def services_converged(domain: str) -> bool:
    """Return True when the stack for *domain* is fully settled.

    Settled means both of the following hold:

    * every service listed by ``docker stack services`` reports running == desired replicas
      (including ``0/0``), and
    * no service is mid-rolling-update: each service's swarm ``UpdateStatus.State`` is
      absent, ``completed`` or ``rollback_completed``.

    Returns False when the stack lists no services, when a row cannot be parsed, or when
    ``docker service inspect`` exits non-zero (e.g. a service vanished mid-check).
    """
    stack = _stack_name(domain)
    listing = subprocess.run(
        ["docker", "stack", "services", stack, "--format", "{{.Name}} {{.Replicas}}"],
        capture_output=True,
        text=True,
    )
    rows = [row for row in listing.stdout.splitlines() if row.strip()]
    if not rows:
        return False

    names = []
    for row in rows:
        # Split on any whitespace and keep only the first two fields: the service name and the
        # leading "cur/want" token. The docker CLI can append a parenthesised suffix to the
        # Replicas column (e.g. "1/1 (max 1 per node)"); comparing the whole remainder would
        # make `cur != want` true forever for such a service.
        fields = row.split()
        if len(fields) < 2:
            return False
        name, replicas = fields[0], fields[1]
        names.append(name)
        cur, _, want = replicas.partition("/")
        # A service at its DESIRED replica count is converged — including a `replicas: 0`
        # on-demand one-shot (e.g. lasuite-drive's `minio-createbuckets`, which is scaled up
        # manually only when buckets need (re)creating), which reports "0/0". A service that is
        # still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
        if not want or cur != want:
            return False

    # N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that
    # changes a non-app service image (e.g. immich's db pin) registers the update immediately,
    # but swarm may not have cycled that service's task yet — the OLD task still shows 1/1, then
    # dies seconds later (immich CI 238: backupbot exec'd the db pre-hook into the just-killed
    # container → 409). Require every service's UpdateStatus to be settled too, so the wait
    # spans the whole rolling update.
    status = subprocess.run(
        [
            "docker",
            "service",
            "inspect",
            *names,
            "--format",
            "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}",
        ],
        capture_output=True,
        text=True,
    )
    if status.returncode != 0:
        return False  # a service vanished mid-check — not settled

    # An empty line means the service has no UpdateStatus at all (never updated).
    settled_states = ("", "completed", "rollback_completed")
    return all(state.strip() in settled_states for state in status.stdout.splitlines())


def backup_app(domain: str, settle_timeout: float = 300, poll_interval: float = 5) -> str:
    """Create a backup; return the abra/restic output (carries the produced snapshot_id).

    Before creating the backup, wait (bounded) for the stack to settle.

    Args:
        domain: app domain whose stack is backed up.
        settle_timeout: maximum number of seconds to wait for ``services_converged``.
        poll_interval: seconds to sleep between convergence checks.

    Returns:
        Whatever ``abra.backup_create(domain)`` returns.
    """
    # Never back up a stack that is still converging/rolling-updating: backupbot resolves each
    # service's hook container ONCE up front, so a task that cycles between that lookup and the
    # pre-hook exec crashes the whole backup with a 409 (immich CI 238). Bounded wait — on
    # timeout we still attempt the backup and let the tier's assertion deliver the verdict.
    #
    # The deadline uses the monotonic clock so a wall-clock adjustment (NTP step, manual change)
    # can neither cut the wait short nor stretch it.
    deadline = time.monotonic() + settle_timeout
    while not services_converged(domain):
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            print(
                f" backup: {domain} stack still not settled after {settle_timeout}s"
                " — attempting backup anyway",
                flush=True,
            )
            break
        print(
            f" backup: {domain} stack not settled yet — waiting before backup create", flush=True
        )
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
    return abra.backup_create(domain)