fix(harness): merge fix/converged-oneshot @ be2026a — services_converged completed-one-shot rule (rcust M2 fix-forward #2, Adversary-approved a531746)
This commit is contained in:
@ -348,8 +348,27 @@ def services_converged(domain: str) -> bool:
|
||||
# `want == "0"` rejection wrongly treated those as never-converged, hanging the deploy
|
||||
# forever. `cur == want` (with `want` present) is the correct convergence test; a service
|
||||
# still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
|
||||
if not want or cur != want:
|
||||
if not want:
|
||||
return False
|
||||
if cur != want:
|
||||
# A TRIGGERED one-shot (restart_policy none, scaled 0→1, runs once, exits 0) reports
|
||||
# "0/1" FOREVER after its task completes — swarm never restarts it, so a bare
|
||||
# `cur != want` rejection would block convergence for the rest of the run (lasuite-drive
|
||||
# minio-createbuckets, rcust M2: install assert burned the full DEPLOY_TIMEOUT after the
|
||||
# P2b port moved the bucket trigger BEFORE the install assert; pre-restructure the
|
||||
# trigger ran after it, so converge never saw the 0/1). A replica deficit explained
|
||||
# entirely by COMPLETE tasks IS converged: the one-shot did its job and will never run
|
||||
# again. Anything else in the deficit (Running/Starting/Pending = still spinning up;
|
||||
# Failed/Rejected = genuinely broken) stays not-converged, and a desired>0 service with
|
||||
# no tasks yet is still scheduling.
|
||||
tasks = subprocess.run(
|
||||
["docker", "service", "ps", name, "--format", "{{.CurrentState}}"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
states = [ln.split()[0] for ln in tasks.stdout.split("\n") if ln.strip()]
|
||||
if not (states and all(s == "Complete" for s in states)):
|
||||
return False
|
||||
# N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that changes
|
||||
# a non-app service image (e.g. immich's db pin) registers the update immediately, but swarm may
|
||||
# not have cycled that service's task yet — the OLD task still shows 1/1, then dies seconds later
|
||||
|
||||
96
tests/unit/test_converged_oneshot.py
Normal file
96
tests/unit/test_converged_oneshot.py
Normal file
@ -0,0 +1,96 @@
|
||||
"""Unit tests for lifecycle.services_converged's completed-one-shot rule (rcust M2 fix-forward).
|
||||
|
||||
A TRIGGERED one-shot service (restart_policy none, scaled 0→1, runs once, exits 0) reports "0/1"
|
||||
forever after its task completes — swarm never restarts it. A bare `cur != want` rejection then
|
||||
blocks convergence for the REST OF THE RUN (lasuite-drive minio-createbuckets: the P2b port moved
|
||||
the bucket trigger BEFORE the install assert, so the assert burned the full DEPLOY_TIMEOUT —
|
||||
pre-restructure the trigger ran after the assert and converge never saw the 0/1).
|
||||
|
||||
Pins (the Adversary's non-vacuity criteria):
|
||||
- deficit explained ENTIRELY by Complete tasks → converged (the one-shot did its job).
|
||||
- deficit with a Failed task → NOT converged (a broken one-shot must not pass).
|
||||
- deficit with a Running/Preparing task → NOT converged (still spinning up; no early green).
|
||||
- deficit with NO tasks yet → NOT converged (still scheduling).
|
||||
- plain N/N services still converge; plain 0/1-spinning-up still doesn't (regression guards).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import lifecycle as lc # noqa: E402
|
||||
|
||||
|
||||
class _R:
    """Bare result object handed back by the fake ``subprocess.run`` in these tests.

    Carries only the three attributes set here: ``stdout``, ``stderr`` and ``returncode``.
    """

    def __init__(self, stdout="", stderr="", returncode=0):
        self.stdout = stdout
        self.stderr = stderr
        self.returncode = returncode
|
||||
|
||||
|
||||
def _patch_docker(monkeypatch, replicas_rows, task_states_by_service=None, update_state=""):
    """Swap ``lc.subprocess.run`` for a fake answering the docker calls under test.

    Args:
        monkeypatch: pytest's monkeypatch fixture, used to install the fake.
        replicas_rows: lines returned for ``docker stack services``
            (e.g. ``"stack_app 1/1"``).
        task_states_by_service: maps a service name to the lines returned for
            ``docker service ps <name>``; a service not listed yields no task lines.
        update_state: text returned for ``docker service inspect``.

    Any other command raises ``AssertionError`` so an unexpected docker call
    fails the test loudly instead of silently returning nothing.
    """
    states_by_name = task_states_by_service if task_states_by_service else {}

    def _as_output(rows):
        # One row per line, always newline-terminated (an empty list gives "\n").
        return "\n".join(rows) + "\n"

    def fake_run(args, **kw):
        head = args[:3]
        if head == ["docker", "stack", "services"]:
            return _R(stdout=_as_output(replicas_rows))
        if head == ["docker", "service", "ps"]:
            service = args[3]
            return _R(stdout=_as_output(states_by_name.get(service, [])))
        if head == ["docker", "service", "inspect"]:
            return _R(stdout=update_state + "\n")
        raise AssertionError(f"unexpected docker call: {args}")

    monkeypatch.setattr(lc.subprocess, "run", fake_run)
|
||||
|
||||
|
||||
def test_completed_oneshot_deficit_is_converged(monkeypatch):
    """A 0/1 deficit whose only task is Complete must count as converged."""
    replica_rows = ["stack_app 1/1", "stack_minio-createbuckets 0/1"]
    task_states = {"stack_minio-createbuckets": ["Complete 28 minutes ago"]}
    _patch_docker(monkeypatch, replica_rows, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is True
|
||||
|
||||
|
||||
def test_failed_oneshot_deficit_is_not_converged(monkeypatch):
    """A 0/1 deficit whose task Failed must NOT pass as converged."""
    replica_rows = ["stack_app 1/1", "stack_minio-createbuckets 0/1"]
    task_states = {"stack_minio-createbuckets": ["Failed 2 minutes ago"]}
    _patch_docker(monkeypatch, replica_rows, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
|
||||
|
||||
|
||||
def test_mixed_complete_and_failed_tasks_not_converged(monkeypatch):
    """One Failed task alongside a Complete one still blocks convergence."""
    replica_rows = ["stack_oneshot 0/1"]
    task_states = {"stack_oneshot": ["Complete 5 minutes ago", "Failed 6 minutes ago"]}
    _patch_docker(monkeypatch, replica_rows, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
|
||||
|
||||
|
||||
def test_still_spinning_up_not_converged(monkeypatch):
    """Regression guard: a 0/1 service whose task is still Preparing is not converged."""
    replica_rows = ["stack_app 0/1"]
    task_states = {"stack_app": ["Preparing 10 seconds ago"]}
    _patch_docker(monkeypatch, replica_rows, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
|
||||
|
||||
|
||||
def test_deficit_with_no_tasks_yet_not_converged(monkeypatch):
    """A 0/1 service with no tasks listed at all is not converged."""
    replica_rows = ["stack_app 0/1"]
    task_states = {"stack_app": []}
    _patch_docker(monkeypatch, replica_rows, task_states)

    converged = lc.services_converged("app.example.com")

    assert converged is False
|
||||
|
||||
|
||||
def test_all_full_replicas_still_converged(monkeypatch):
    """Regression guard: every service at N/N (empty update state) still converges."""
    replica_rows = ["stack_app 1/1", "stack_db 1/1"]
    _patch_docker(monkeypatch, replica_rows)

    converged = lc.services_converged("app.example.com")

    assert converged is True
|
||||
|
||||
|
||||
def test_on_demand_zero_zero_oneshot_still_converged(monkeypatch):
    """An untriggered on-demand one-shot reporting 0/0 does not block convergence."""
    replica_rows = ["stack_app 1/1", "stack_minio-createbuckets 0/0"]
    _patch_docker(monkeypatch, replica_rows)

    converged = lc.services_converged("app.example.com")

    assert converged is True
|
||||
Reference in New Issue
Block a user