From ec760724892a76c007195345ba667bbf059ab726 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Fri, 29 May 2026 20:19:07 +0100 Subject: [PATCH] =?UTF-8?q?fix(2):=20Q4.2=20mumble=20=E2=80=94=20TCP=20voi?= =?UTF-8?q?ce-server=20READY=5FPROBE=20gates=20backup=20past=20upgrade=20h?= =?UTF-8?q?ost-port=20churn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnostic (RECIPE=mumble STAGES=install,backup,restore,custom, no upgrade) PROVED backup+restore green on a stable 1.0.0 deploy incl. ci_marker survival (P4). The full-run backup 409 ('container not running') was the chaos UPGRADE redeploy: host-mode 64738 must be released by the old task + rebound by the new, and HEALTH_PATH '/' only proves the mumble-web sidecar (not the voice server), so wait_healthy passed while the app churned → backup-bot execed a not-running container. Fix: extend lifecycle.wait_ready_probes to support a TCP probe ({tcp_host,tcp_port,stable=N consecutive connects}); mumble recipe_meta READY_PROBE returns 64738 (stable=3) so the harness waits for the voice server up after install AND upgrade before backup. Co-Authored-By: Claude Opus 4.8 (1M context) --- runner/harness/lifecycle.py | 35 ++++++++++++++++++++++++++++++++++- tests/mumble/recipe_meta.py | 11 +++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index fd6855b..cc0d999 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -11,6 +11,7 @@ import datetime import json import os import re +import socket import ssl import subprocess import time @@ -440,12 +441,44 @@ def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None: e.g. lasuite-drive's collabora WOPI discovery (`/hosting/discovery` on the collabora sibling host): swarm reports collabora 1/1 'running' while coolwsd is still doing jail/config init and its discovery endpoint 404s, so replica-convergence alone is not real readiness. Used after the - install deploy and after the upgrade chaos redeploy so 'reconverged' means genuinely ready.""" + install deploy and after the upgrade chaos redeploy so 'reconverged' means genuinely ready. + + A probe may instead be a TCP-listen check: `{"tcp_host":..., "tcp_port": int, "stable": N}` — poll + until a socket connect succeeds N consecutive times (default 2). This is for NON-HTTP services + whose HEALTH_PATH doesn't reflect them, e.g. mumble's voice server on 64738: the app's HTTP + readiness comes from the mumble-web sidecar, so after a chaos upgrade redeploy (host-mode 64738 + must be released by the old task + rebound by the new) the voice server can be down while + HTTP-200 still passes — and backup-bot then execs into a not-running app container (409). Requiring + the voice port to be stably listening before proceeding closes that window.""" probe_fn = meta.get("READY_PROBE") if not callable(probe_fn): return probes = probe_fn(domain) or [] for probe in probes: + if "tcp_port" in probe: + host = probe.get("tcp_host", "127.0.0.1") + port = int(probe["tcp_port"]) + needed = int(probe.get("stable", 2)) + deadline = time.time() + timeout + consec = 0 + last_err = None + while time.time() < deadline: + try: + with socket.create_connection((host, port), timeout=10): + consec += 1 + if consec >= needed: + print(f" ready-probe OK (tcp {needed}x): {host}:{port}", flush=True) + break + except OSError as e: + consec = 0 + last_err = e + time.sleep(3) + else: + raise TimeoutError( + f"READY_PROBE tcp {host}:{port} not stably listening ({needed}x) within " + f"{timeout}s — last error: {last_err}" + ) + continue host = probe["host"] path = probe.get("path", "/") ok = tuple(probe.get("ok", (200,))) diff --git a/tests/mumble/recipe_meta.py b/tests/mumble/recipe_meta.py index d2ed789..eaed723 100644 --- a/tests/mumble/recipe_meta.py +++ b/tests/mumble/recipe_meta.py @@ -36,3 +36,14 @@ EXTRA_ENV = { "WELCOME_TEXT": WELCOME_TEXT_MARKER, "USERS": str(MAX_USERS), } + + +def READY_PROBE(domain): + # HEALTH_PATH "/" only proves the mumble-web HTTP sidecar; it does NOT reflect the voice server. + # After a chaos upgrade redeploy the host-mode 64738 port must be released by the old task and + # rebound by the new one — a window where the app (voice) container isn't yet serving while + # mumble-web still returns 200. backup-bot then execs its sqlite pre-hook into a not-running app + # container → 409. Gate readiness on the voice port being STABLY listening (3 consecutive + # connects) before the harness proceeds to the backup tier. The port is host-published + # (compose.host-ports.yml), so we probe it on the cc-ci host where the run executes. + return [{"tcp_host": "127.0.0.1", "tcp_port": 64738, "stable": 3}]