diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index fd6855b..cc0d999 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -11,6 +11,7 @@ import datetime
 import json
 import os
 import re
+import socket
 import ssl
 import subprocess
 import time
@@ -440,12 +441,48 @@ def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None:
     e.g. lasuite-drive's collabora WOPI discovery (`/hosting/discovery` on the collabora sibling host):
     swarm reports collabora 1/1 'running' while coolwsd is still doing jail/config init and its
     discovery endpoint 404s, so replica-convergence alone is not real readiness. Used after the
-    install deploy and after the upgrade chaos redeploy so 'reconverged' means genuinely ready."""
+    install deploy and after the upgrade chaos redeploy so 'reconverged' means genuinely ready.
+
+    A probe may instead be a TCP-listen check: `{"tcp_host":..., "tcp_port": int, "stable": N}` — poll
+    until a socket connect succeeds N consecutive times (default 2). This is for NON-HTTP services
+    whose HEALTH_PATH doesn't reflect them, e.g. mumble's voice server on 64738: the app's HTTP
+    readiness comes from the mumble-web sidecar, so after a chaos upgrade redeploy (host-mode 64738
+    must be released by the old task + rebound by the new) the voice server can be down while
+    HTTP-200 still passes — and backup-bot then execs into a not-running app container (409). Requiring
+    the voice port to be stably listening before proceeding closes that window."""
     probe_fn = meta.get("READY_PROBE")
     if not callable(probe_fn):
         return
     probes = probe_fn(domain) or []
     for probe in probes:
+        if "tcp_port" in probe:
+            host = probe.get("tcp_host", "127.0.0.1")
+            port = int(probe["tcp_port"])
+            needed = int(probe.get("stable", 2))
+            # Monotonic clock: this wait can run for minutes, and a wall-clock step (e.g. NTP)
+            # must neither cut the deadline short nor stretch it.
+            deadline = time.monotonic() + timeout
+            consec = 0
+            last_err = None
+            while time.monotonic() < deadline:
+                try:
+                    with socket.create_connection((host, port), timeout=10):
+                        consec += 1
+                        if consec >= needed:
+                            print(f" ready-probe OK (tcp {needed}x): {host}:{port}", flush=True)
+                            break
+                except OSError as e:
+                    # A failed connect breaks the streak: "stable" means consecutive successes.
+                    consec = 0
+                    last_err = e
+                time.sleep(3)
+            else:
+                # while/else: reached only when the deadline passes without hitting `break`.
+                raise TimeoutError(
+                    f"READY_PROBE tcp {host}:{port} not stably listening ({needed}x) within "
+                    f"{timeout}s — last error: {last_err}"
+                )
+            continue
         host = probe["host"]
         path = probe.get("path", "/")
         ok = tuple(probe.get("ok", (200,)))
diff --git a/tests/mumble/recipe_meta.py b/tests/mumble/recipe_meta.py
index d2ed789..eaed723 100644
--- a/tests/mumble/recipe_meta.py
+++ b/tests/mumble/recipe_meta.py
@@ -36,3 +36,14 @@ EXTRA_ENV = {
     "WELCOME_TEXT": WELCOME_TEXT_MARKER,
     "USERS": str(MAX_USERS),
 }
+
+
+def READY_PROBE(domain):
+    # HEALTH_PATH "/" only proves the mumble-web HTTP sidecar; it does NOT reflect the voice server.
+    # After a chaos upgrade redeploy the host-mode 64738 port must be released by the old task and
+    # rebound by the new one — a window where the app (voice) container isn't yet serving while
+    # mumble-web still returns 200. backup-bot then execs its sqlite pre-hook into a not-running app
+    # container → 409. Gate readiness on the voice port being STABLY listening (3 consecutive
+    # connects) before the harness proceeds to the backup tier. The port is host-published
+    # (compose.host-ports.yml), so we probe it on the cc-ci host where the run executes.
+    return [{"tcp_host": "127.0.0.1", "tcp_port": 64738, "stable": 3}]