diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 6ba8b64..5c44c81 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -384,7 +384,13 @@ def services_converged(domain: str) -> bool: if proc.returncode != 0: return False # a service vanished mid-check — not settled for state in proc.stdout.split("\n"): - if state.strip() not in ("", "completed", "rollback_completed"): + # Only ACTIVE states block convergence. 'paused'/'rollback_paused' are terminal-without- + # intervention: swarm's default update-failure-action pauses the update on one task flicker + # and the flag then persists FOREVER (immich CI 241: app service 'paused' from a restart + # during restore, service back at 1/1 and healthy — the wait hung to its deadline). With + # N/N already required above, a paused update is settled for our purposes; the HTTP-health + # and tier assertions still gate whether the app actually works. + if state.strip() in ("updating", "rollback_started"): return False return True