|
|
|
|
@ -340,18 +340,22 @@ def _stack_name(domain: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def services_converged(domain: str) -> bool:
    """Return True when the stack's services are at their desired scale and settled.

    Both conditions must hold:

    1. ``docker stack services`` lists at least one service and every one of them
       reports replicas ``N/N``. ``0/0`` counts: a service deliberately scaled to
       zero is at its desired replica count.
    2. No service is mid-rolling-update: ``docker service inspect`` reports each
       service's ``UpdateStatus.State`` as absent, ``completed`` or
       ``rollback_completed``.

    Args:
        domain: App domain; mapped to the swarm stack name via ``_stack_name``.

    Returns:
        True only when both checks pass. An empty service listing or a failing
        ``docker service inspect`` yields False.
    """
    stack = _stack_name(domain)
    listing = subprocess.run(
        ["docker", "stack", "services", stack, "--format", "{{.Name}} {{.Replicas}}"],
        capture_output=True,
        text=True,
    )
    rows = [row.strip() for row in listing.stdout.splitlines() if row.strip()]
    if not rows:
        return False

    names = []
    for row in rows:
        name, _, replicas = row.partition(" ")
        names.append(name)
        # The replicas column may carry a trailing annotation after the counts
        # (e.g. "1/1 (max 1 per node)"); only the leading "cur/want" token is compared.
        tokens = replicas.split(maxsplit=1)
        cur, _, want = (tokens[0] if tokens else "").partition("/")
        # A service at its DESIRED replica count is converged — including a `replicas: 0`
        # on-demand one-shot (e.g. lasuite-drive's `minio-createbuckets`, which is scaled up
        # manually only when buckets need (re)creating), which reports "0/0". A service
        # still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
        if not want or cur != want:
            return False

    # N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that
    # changes a non-app service image (e.g. immich's db pin) registers the update immediately,
    # but swarm may not have cycled that service's task yet — the OLD task still shows 1/1,
    # then dies seconds later (immich CI 238: backupbot exec'd the db pre-hook into the
    # just-killed container → 409). Require every service's UpdateStatus to be settled too,
    # so the wait spans the whole rolling update.
    inspected = subprocess.run(
        [
            "docker",
            "service",
            "inspect",
            *names,
            "--format",
            "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}",
        ],
        capture_output=True,
        text=True,
    )
    if inspected.returncode != 0:
        return False  # a service vanished mid-check — not settled

    # An empty line means the service has no UpdateStatus to report.
    settled_states = ("", "completed", "rollback_completed")
    return all(state.strip() in settled_states for state in inspected.stdout.splitlines())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -572,6 +598,16 @@ def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None:
|
|
|
|
|
|
|
|
|
|
def backup_app(
    domain: str, *, settle_timeout: float = 300, poll_interval: float = 5
) -> str:
    """Create a backup; return the abra/restic output (carries the produced snapshot_id).

    Before creating the backup, wait (bounded) for the stack to settle as reported
    by ``services_converged``.

    Args:
        domain: App domain to back up.
        settle_timeout: Maximum seconds to wait for the stack to settle.
        poll_interval: Seconds to sleep between convergence checks.

    Returns:
        Whatever ``abra.backup_create(domain)`` returns.
    """
    # Never back up a stack that is still converging/rolling-updating: backupbot resolves each
    # service's hook container ONCE up front, so a task that cycles between that lookup and the
    # pre-hook exec crashes the whole backup with a 409 (immich CI 238). Bounded wait — on timeout
    # we still attempt the backup and let the tier's assertion deliver the verdict.
    # Monotonic clock: a wall-clock adjustment must not shorten or stretch the wait.
    deadline = time.monotonic() + settle_timeout
    while not services_converged(domain):
        if time.monotonic() >= deadline:
            print(
                f" backup: {domain} stack still not settled after {settle_timeout}s"
                " — attempting backup anyway",
                flush=True,
            )
            break
        print(
            f" backup: {domain} stack not settled yet — waiting before backup create", flush=True
        )
        time.sleep(poll_interval)
    return abra.backup_create(domain)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|