fix(harness): convergence must span stop-first rolling updates (immich 238 backup 409)
services_converged() accepted N/N replicas as converged — but a chaos redeploy that changes a non-app service image (immich PR #2 moves the db to the vectorchord pin) registers a stop-first rolling update that swarm may not have STARTED yet: the OLD task still shows 1/1, the wait passes, and the task dies seconds later. Build 238: backupbot resolved the db hook container, the task was killed in the gap, and the pre-hook exec crashed the whole backup with a 409 -> no dump in the snapshot -> restore had nothing -> RED. - services_converged() now also requires every service's swarm UpdateStatus to be settled ('', completed, rollback_completed) — updating/paused/rollback in flight is NOT converged. Strictly stricter: no gate is weakened. - backup_app() gains a bounded (300s) settle-wait before 'abra app backup create' as defence in depth; on timeout the backup still runs and the tier's assertion delivers the verdict. lint: PASS, unit tests: 138 passed.
This commit is contained in:
@ -340,18 +340,22 @@ def _stack_name(domain: str) -> str:
|
||||
|
||||
|
||||
def services_converged(domain: str) -> bool:
|
||||
"""True when every service in the stack reports replicas N/N (N>0)."""
|
||||
"""True when every service in the stack reports replicas N/N (N>0) AND no service is
|
||||
mid-rolling-update (swarm UpdateStatus settled)."""
|
||||
stack = _stack_name(domain)
|
||||
proc = subprocess.run(
|
||||
["docker", "stack", "services", stack, "--format", "{{.Replicas}}"],
|
||||
["docker", "stack", "services", stack, "--format", "{{.Name}} {{.Replicas}}"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
rows = [r for r in proc.stdout.split("\n") if r.strip()]
|
||||
if not rows:
|
||||
return False
|
||||
names = []
|
||||
for r in rows:
|
||||
cur, _, want = r.partition("/")
|
||||
name, _, replicas = r.partition(" ")
|
||||
names.append(name)
|
||||
cur, _, want = replicas.partition("/")
|
||||
# A service at its DESIRED replica count is converged — including a `replicas: 0`
|
||||
# on-demand one-shot (e.g. lasuite-drive's `minio-createbuckets`, which is scaled up
|
||||
# manually only when buckets need (re)creating), which reports "0/0". The earlier
|
||||
@ -360,6 +364,28 @@ def services_converged(domain: str) -> bool:
|
||||
# still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
|
||||
if not want or cur != want:
|
||||
return False
|
||||
# N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that changes
|
||||
# a non-app service image (e.g. immich's db pin) registers the update immediately, but swarm may
|
||||
# not have cycled that service's task yet — the OLD task still shows 1/1, then dies seconds later
|
||||
# (immich CI 238: backupbot exec'd the db pre-hook into the just-killed container → 409). Require
|
||||
# every service's UpdateStatus to be settled too, so the wait spans the whole rolling update.
|
||||
proc = subprocess.run(
|
||||
[
|
||||
"docker",
|
||||
"service",
|
||||
"inspect",
|
||||
*names,
|
||||
"--format",
|
||||
"{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
return False # a service vanished mid-check — not settled
|
||||
for state in proc.stdout.split("\n"):
|
||||
if state.strip() not in ("", "completed", "rollback_completed"):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@ -572,6 +598,16 @@ def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None:
|
||||
|
||||
def backup_app(domain: str) -> str:
|
||||
"""Create a backup; return the abra/restic output (carries the produced snapshot_id)."""
|
||||
# Never back up a stack that is still converging/rolling-updating: backupbot resolves each
|
||||
# service's hook container ONCE up front, so a task that cycles between that lookup and the
|
||||
# pre-hook exec crashes the whole backup with a 409 (immich CI 238). Bounded wait — on timeout
|
||||
# we still attempt the backup and let the tier's assertion deliver the verdict.
|
||||
deadline = time.time() + 300
|
||||
while time.time() < deadline and not services_converged(domain):
|
||||
print(
|
||||
f" backup: {domain} stack not settled yet — waiting before backup create", flush=True
|
||||
)
|
||||
time.sleep(5)
|
||||
return abra.backup_create(domain)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user