From e6d55b53c7d7dd502482dbb27662cd2616844e46 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Tue, 9 Jun 2026 23:07:36 +0000 Subject: [PATCH] =?UTF-8?q?fix(harness):=20a=20paused=20swarm=20update=20i?= =?UTF-8?q?s=20settled=20=E2=80=94=20only=20active=20states=20block=20conv?= =?UTF-8?q?ergence?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 68ef0f8 made services_converged() require UpdateStatus settled, treating 'paused' as in flight. But swarm's default update-failure-action pauses the update on a single task flicker and the flag persists FOREVER (until the next update): immich CI 241 had the app service 'paused' from a restart during restore while the service was back at 1/1 and healthy — every subsequent wait hung to its deadline and the run had to be killed. Only 'updating' and 'rollback_started' now block convergence: those are the states swarm is actively driving (the 238 stop-first race lives in 'updating'). 'paused'/'rollback_paused' make no progress without intervention, so waiting on them is pointless — N/N replicas is already required, and the HTTP-health and tier assertions still gate whether the app actually works. lint: PASS, unit tests: 138 passed. --- runner/harness/lifecycle.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 6ba8b64..5c44c81 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -384,7 +384,13 @@ def services_converged(domain: str) -> bool: if proc.returncode != 0: return False # a service vanished mid-check — not settled for state in proc.stdout.split("\n"): - if state.strip() not in ("", "completed", "rollback_completed"): + # Only ACTIVE states block convergence. 'paused'/'rollback_paused' are terminal-without- + # intervention: swarm's default update-failure-action pauses the update on one task flicker + # and the flag then persists FOREVER (immich CI 241: app service 'paused' from a restart + # during restore, service back at 1/1 and healthy — the wait hung to its deadline). With + # N/N already required above, a paused update is settled for our purposes; the HTTP-health + # and tier assertions still gate whether the app actually works. + if state.strip() in ("updating", "rollback_started"): return False return True