From be2026aafb906d5d366e89a4422f7f9404c2e2a1 Mon Sep 17 00:00:00 2001
From: autonomic-bot
Date: Thu, 11 Jun 2026 00:26:53 +0000
Subject: [PATCH] =?UTF-8?q?fix(harness):=20services=5Fconverged=20?=
 =?UTF-8?q?=E2=80=94=20a=20replica=20deficit=20explained=20entirely=20by?=
 =?UTF-8?q?=20Complete=20tasks=20is=20converged=20(triggered=20one-shot,?=
 =?UTF-8?q?=20rcust=20M2=20lasuite-drive=20root=20cause)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Notes (review):
    - runner/harness/lifecycle.py: the former `if not want or cur != want` test is split
      in two. A missing `want` still returns False. When `cur != want`, the function now
      runs `docker service ps <name> --format {{.CurrentState}}`, keeps the first word of
      every non-blank output line, and returns False unless at least one task is listed
      and every listed task is `Complete`.
    - Every row printed by `docker service ps` is considered, so a single non-Complete
      row keeps the service not-converged
      (pinned by test_mixed_complete_and_failed_tasks_not_converged).
    - NOTE(review): the return code of `docker service ps` is not inspected. Empty stdout
      gives an empty state list and therefore not-converged; presumably a failed call
      prints nothing on stdout -- confirm.
    - NOTE(review): the new test module's docstring lists "Running/Preparing"; only a
      `Preparing` task is exercised by the tests in this patch.
    - NOTE(review): leading whitespace in the hunks below was restored from a
      whitespace-collapsed copy of this patch; verify the context lines against
      lifecycle.py before applying.

 runner/harness/lifecycle.py          | 21 +++++-
 tests/unit/test_converged_oneshot.py | 96 ++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/test_converged_oneshot.py

diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index fddc286..fe27b9e 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -348,8 +348,27 @@ def services_converged(domain: str) -> bool:
         # `want == "0"` rejection wrongly treated those as never-converged, hanging the deploy
         # forever. `cur == want` (with `want` present) is the correct convergence test; a service
         # still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
-        if not want or cur != want:
+        if not want:
             return False
+        if cur != want:
+            # A TRIGGERED one-shot (restart_policy none, scaled 0→1, runs once, exits 0) reports
+            # "0/1" FOREVER after its task completes — swarm never restarts it, so a bare
+            # `cur != want` rejection would block convergence for the rest of the run (lasuite-drive
+            # minio-createbuckets, rcust M2: install assert burned the full DEPLOY_TIMEOUT after the
+            # P2b port moved the bucket trigger BEFORE the install assert; pre-restructure the
+            # trigger ran after it, so converge never saw the 0/1). A replica deficit explained
+            # entirely by COMPLETE tasks IS converged: the one-shot did its job and will never run
+            # again. Anything else in the deficit (Running/Starting/Pending = still spinning up;
+            # Failed/Rejected = genuinely broken) stays not-converged, and a desired>0 service with
+            # no tasks yet is still scheduling.
+            tasks = subprocess.run(
+                ["docker", "service", "ps", name, "--format", "{{.CurrentState}}"],
+                capture_output=True,
+                text=True,
+            )
+            states = [ln.split()[0] for ln in tasks.stdout.split("\n") if ln.strip()]
+            if not (states and all(s == "Complete" for s in states)):
+                return False
     # N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that changes
     # a non-app service image (e.g. immich's db pin) registers the update immediately, but swarm may
     # not have cycled that service's task yet — the OLD task still shows 1/1, then dies seconds later
diff --git a/tests/unit/test_converged_oneshot.py b/tests/unit/test_converged_oneshot.py
new file mode 100644
index 0000000..158a8bb
--- /dev/null
+++ b/tests/unit/test_converged_oneshot.py
@@ -0,0 +1,96 @@
+"""Unit tests for lifecycle.services_converged's completed-one-shot rule (rcust M2 fix-forward).
+
+A TRIGGERED one-shot service (restart_policy none, scaled 0→1, runs once, exits 0) reports "0/1"
+forever after its task completes — swarm never restarts it. A bare `cur != want` rejection then
+blocks convergence for the REST OF THE RUN (lasuite-drive minio-createbuckets: the P2b port moved
+the bucket trigger BEFORE the install assert, so the assert burned the full DEPLOY_TIMEOUT —
+pre-restructure the trigger ran after the assert and converge never saw the 0/1).
+
+Pins (the Adversary's non-vacuity criteria):
+- deficit explained ENTIRELY by Complete tasks → converged (the one-shot did its job).
+- deficit with a Failed task → NOT converged (a broken one-shot must not pass).
+- deficit with a Running/Preparing task → NOT converged (still spinning up; no early green).
+- deficit with NO tasks yet → NOT converged (still scheduling).
+- plain N/N services still converge; plain 0/1-spinning-up still doesn't (regression guards).
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle as lc  # noqa: E402
+
+
+class _R:
+    def __init__(self, stdout="", stderr="", returncode=0):
+        self.stdout, self.stderr, self.returncode = stdout, stderr, returncode
+
+
+def _patch_docker(monkeypatch, replicas_rows, task_states_by_service=None, update_state=""):
+    """Fake subprocess.run for the three docker calls services_converged makes."""
+    task_states_by_service = task_states_by_service or {}
+
+    def fake_run(args, **kw):
+        if args[:3] == ["docker", "stack", "services"]:
+            return _R(stdout="\n".join(replicas_rows) + "\n")
+        if args[:3] == ["docker", "service", "ps"]:
+            name = args[3]
+            return _R(stdout="\n".join(task_states_by_service.get(name, [])) + "\n")
+        if args[:3] == ["docker", "service", "inspect"]:
+            return _R(stdout=update_state + "\n")
+        raise AssertionError(f"unexpected docker call: {args}")
+
+    monkeypatch.setattr(lc.subprocess, "run", fake_run)
+
+
+def test_completed_oneshot_deficit_is_converged(monkeypatch):
+    _patch_docker(
+        monkeypatch,
+        ["stack_app 1/1", "stack_minio-createbuckets 0/1"],
+        {"stack_minio-createbuckets": ["Complete 28 minutes ago"]},
+    )
+    assert lc.services_converged("app.example.com") is True
+
+
+def test_failed_oneshot_deficit_is_not_converged(monkeypatch):
+    _patch_docker(
+        monkeypatch,
+        ["stack_app 1/1", "stack_minio-createbuckets 0/1"],
+        {"stack_minio-createbuckets": ["Failed 2 minutes ago"]},
+    )
+    assert lc.services_converged("app.example.com") is False
+
+
+def test_mixed_complete_and_failed_tasks_not_converged(monkeypatch):
+    _patch_docker(
+        monkeypatch,
+        ["stack_oneshot 0/1"],
+        {"stack_oneshot": ["Complete 5 minutes ago", "Failed 6 minutes ago"]},
+    )
+    assert lc.services_converged("app.example.com") is False
+
+
+def test_still_spinning_up_not_converged(monkeypatch):
+    _patch_docker(
+        monkeypatch,
+        ["stack_app 0/1"],
+        {"stack_app": ["Preparing 10 seconds ago"]},
+    )
+    assert lc.services_converged("app.example.com") is False
+
+
+def test_deficit_with_no_tasks_yet_not_converged(monkeypatch):
+    _patch_docker(monkeypatch, ["stack_app 0/1"], {"stack_app": []})
+    assert lc.services_converged("app.example.com") is False
+
+
+def test_all_full_replicas_still_converged(monkeypatch):
+    _patch_docker(monkeypatch, ["stack_app 1/1", "stack_db 1/1"])
+    assert lc.services_converged("app.example.com") is True
+
+
+def test_on_demand_zero_zero_oneshot_still_converged(monkeypatch):
+    _patch_docker(monkeypatch, ["stack_app 1/1", "stack_minio-createbuckets 0/0"])
+    assert lc.services_converged("app.example.com") is True