fix(harness): merge fix/converged-oneshot @ be2026a — services_converged completed-one-shot rule (rcust M2 fix-forward #2, Adversary-approved a531746)

This commit is contained in:
autonomic-bot
2026-06-11 00:33:07 +00:00
2 changed files with 116 additions and 1 deletions

View File

@@ -348,8 +348,27 @@ def services_converged(domain: str) -> bool:
# `want == "0"` rejection wrongly treated those as never-converged, hanging the deploy
# forever. `cur == want` (with `want` present) is the correct convergence test; a service
# still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
if not want or cur != want:
if not want:
return False
if cur != want:
# A TRIGGERED one-shot (restart_policy none, scaled 0→1, runs once, exits 0) reports
# "0/1" FOREVER after its task completes — swarm never restarts it, so a bare
# `cur != want` rejection would block convergence for the rest of the run (lasuite-drive
# minio-createbuckets, rcust M2: install assert burned the full DEPLOY_TIMEOUT after the
# P2b port moved the bucket trigger BEFORE the install assert; pre-restructure the
# trigger ran after it, so converge never saw the 0/1). A replica deficit explained
# entirely by COMPLETE tasks IS converged: the one-shot did its job and will never run
# again. Anything else in the deficit (Running/Starting/Pending = still spinning up;
# Failed/Rejected = genuinely broken) stays not-converged, and a desired>0 service with
# no tasks yet is still scheduling.
tasks = subprocess.run(
["docker", "service", "ps", name, "--format", "{{.CurrentState}}"],
capture_output=True,
text=True,
)
states = [ln.split()[0] for ln in tasks.stdout.split("\n") if ln.strip()]
if not (states and all(s == "Complete" for s in states)):
return False
# N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that changes
# a non-app service image (e.g. immich's db pin) registers the update immediately, but swarm may
# not have cycled that service's task yet — the OLD task still shows 1/1, then dies seconds later

View File

@@ -0,0 +1,96 @@
"""Unit tests for lifecycle.services_converged's completed-one-shot rule (rcust M2 fix-forward).
A TRIGGERED one-shot service (restart_policy none, scaled 0→1, runs once, exits 0) reports "0/1"
forever after its task completes — swarm never restarts it. A bare `cur != want` rejection then
blocks convergence for the REST OF THE RUN (lasuite-drive minio-createbuckets: the P2b port moved
the bucket trigger BEFORE the install assert, so the assert burned the full DEPLOY_TIMEOUT —
pre-restructure the trigger ran after the assert and converge never saw the 0/1).
Pins (the Adversary's non-vacuity criteria):
- deficit explained ENTIRELY by Complete tasks → converged (the one-shot did its job).
- deficit with a Failed task → NOT converged (a broken one-shot must not pass).
- deficit with a Running/Preparing task → NOT converged (still spinning up; no early green).
- deficit with NO tasks yet → NOT converged (still scheduling).
- plain N/N services still converge; plain 0/1-spinning-up still doesn't (regression guards).
"""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle as lc # noqa: E402
class _R:
def __init__(self, stdout="", stderr="", returncode=0):
self.stdout, self.stderr, self.returncode = stdout, stderr, returncode
def _patch_docker(monkeypatch, replicas_rows, task_states_by_service=None, update_state=""):
    """Swap ``lc.subprocess.run`` for a fake covering the three docker calls under test.

    The fake inspects the first three argv items and answers:

    - ``docker stack services``: ``replicas_rows``, one row per line.
    - ``docker service ps <name>``: the entries of ``task_states_by_service[name]``,
      one per line (no entries when the service is not in the mapping).
    - ``docker service inspect``: ``update_state``.

    Every reply has a trailing newline appended. Any other command raises
    ``AssertionError`` so an unexpected docker call fails the test loudly.
    """
    states_for = task_states_by_service if task_states_by_service else {}

    def fake_run(args, **_ignored):
        command = args[:3]
        if command == ["docker", "stack", "services"]:
            out = "\n".join(replicas_rows)
        elif command == ["docker", "service", "ps"]:
            # The service name is the fourth argv item of `docker service ps`.
            out = "\n".join(states_for.get(args[3], []))
        elif command == ["docker", "service", "inspect"]:
            out = update_state
        else:
            raise AssertionError(f"unexpected docker call: {args}")
        return _R(stdout=out + "\n")

    monkeypatch.setattr(lc.subprocess, "run", fake_run)
def test_completed_oneshot_deficit_is_converged(monkeypatch):
    """A 0/1 service whose only task is Complete does not block convergence."""
    replicas = ["stack_app 1/1", "stack_minio-createbuckets 0/1"]
    task_states = {"stack_minio-createbuckets": ["Complete 28 minutes ago"]}
    _patch_docker(monkeypatch, replicas, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is True
def test_failed_oneshot_deficit_is_not_converged(monkeypatch):
    """A 0/1 service whose task Failed must keep the stack not-converged."""
    replicas = ["stack_app 1/1", "stack_minio-createbuckets 0/1"]
    task_states = {"stack_minio-createbuckets": ["Failed 2 minutes ago"]}
    _patch_docker(monkeypatch, replicas, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
def test_mixed_complete_and_failed_tasks_not_converged(monkeypatch):
    """One Failed task among Complete ones is enough to reject convergence."""
    replicas = ["stack_oneshot 0/1"]
    task_states = {"stack_oneshot": ["Complete 5 minutes ago", "Failed 6 minutes ago"]}
    _patch_docker(monkeypatch, replicas, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
def test_still_spinning_up_not_converged(monkeypatch):
    """A 0/1 service with a Preparing task is still starting, so not converged."""
    replicas = ["stack_app 0/1"]
    task_states = {"stack_app": ["Preparing 10 seconds ago"]}
    _patch_docker(monkeypatch, replicas, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
def test_deficit_with_no_tasks_yet_not_converged(monkeypatch):
    """A 0/1 service that lists no tasks at all is not converged."""
    replicas = ["stack_app 0/1"]
    task_states = {"stack_app": []}
    _patch_docker(monkeypatch, replicas, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
def test_all_full_replicas_still_converged(monkeypatch):
    """Regression guard: a stack where every service reports 1/1 converges."""
    replicas = ["stack_app 1/1", "stack_db 1/1"]
    _patch_docker(monkeypatch, replicas)

    converged = lc.services_converged("app.example.com")

    assert converged is True
def test_on_demand_zero_zero_oneshot_still_converged(monkeypatch):
    """Regression guard: a 0/0 service alongside a 1/1 service still converges."""
    replicas = ["stack_app 1/1", "stack_minio-createbuckets 0/0"]
    _patch_docker(monkeypatch, replicas)

    converged = lc.services_converged("app.example.com")

    assert converged is True