def _is_transient_status(status: object) -> bool:
    """Return True when *status* is worth retrying: 0 (no HTTP response) or any HTTP 5xx."""
    return isinstance(status, int) and (status == 0 or 500 <= status <= 599)


def _admin_register(domain: str, secret: str, username: str, password: str, admin: bool) -> dict:
    """Register a user via the shared-secret admin endpoint, retrying while synapse recovers.

    The request is issued from inside the container through `_container_curl`.

    Readiness-robust: in the FULL lifecycle the custom tier runs right after the restore tier,
    which `DROP DATABASE … WITH (FORCE)` + recreates synapse's postgres DB (pg_backup.sh restore).
    Synapse is still re-establishing its DB connection pool in that window, so a registration
    (a DB *write*) can transiently return HTTP 500 M_UNKNOWN even though HTTP health (a read) is
    already green. Each attempt therefore fetches a fresh nonce and re-POSTs; a 5xx or a status
    of 0 (treated here as a transport error) is retried until the bounded window elapses.
    Anything else that is not a 200 is a real rejection (bad MAC, user exists, policy) and fails
    fast. Registration must still succeed; only the recovery window is tolerated.

    Args:
        domain: Passed through to `_container_curl` to select the target deployment.
        secret: Shared registration secret, used as the HMAC key.
        username: Localpart of the user to create.
        password: Password for the new user.
        admin: Whether to register the user as a server admin.

    Returns:
        The parsed body of the successful register POST, or `{}` if that body is empty.

    Raises:
        AssertionError: If the nonce GET returns a non-retryable non-200 status, the nonce is
            missing, the POST is rejected with a non-retryable status, or no attempt succeeds
            within the recovery window.
    """
    # Function-scope import: keeps the helper self-contained without touching the file's
    # top-level import block.
    import time

    recovery_window_s = 90  # bounded recovery window
    retry_interval_s = 5
    register_path = "/_synapse/admin/v1/register"

    def _pause_before_retry(phase: str, status: int, attempt_no: int) -> None:
        # Progress line so a slow-but-recovering run is visible in the test output.
        print(f" [register] {username}: {phase} transient {status} "
              f"(attempt {attempt_no}, synapse recovering) — retrying", flush=True)
        time.sleep(retry_interval_s)

    admin_flag = "admin" if admin else "notadmin"
    deadline = time.monotonic() + recovery_window_s
    attempt = 0
    last = {"status": 0, "body": None, "raw": ""}

    while time.monotonic() < deadline:
        attempt += 1

        # Step 1: GET a fresh nonce (single-use; re-fetch each attempt)
        r = _container_curl(domain, "GET", register_path)
        if _is_transient_status(r["status"]):
            last = r
            _pause_before_retry("nonce GET", r["status"], attempt)
            continue
        # Explicit raises (not `assert`) so these checks survive `python -O`.
        if r["status"] != 200:
            raise AssertionError(
                f"nonce GET failed: status={r['status']} raw={r['raw'][:200]!r}"
            )
        nonce = (r["body"] or {}).get("nonce")
        if not nonce:
            raise AssertionError(f"no nonce in response: {r['body']!r}")

        # Step 2: HMAC-SHA1 over the NUL-separated fields, keyed with the shared secret, then POST
        msg = f"{nonce}\0{username}\0{password}\0{admin_flag}".encode()
        mac = hmac.new(secret.encode(), msg, hashlib.sha1).hexdigest()
        payload = {
            "nonce": nonce,
            "username": username,
            "password": password,
            "mac": mac,
            "admin": admin,
        }
        r = _container_curl(domain, "POST", register_path, body=payload)
        if r["status"] == 200:
            if attempt > 1:
                print(f" [register] {username}: succeeded on attempt {attempt} "
                      f"(synapse recovered)", flush=True)
            return r["body"] or {}
        if _is_transient_status(r["status"]):
            last = r
            _pause_before_retry("POST", r["status"], attempt)
            continue
        # a 4xx is a real rejection — fail fast, do not retry
        raise AssertionError(
            f"register {username!r} rejected: status={r['status']} body={r['body']!r}"
        )

    raise AssertionError(
        f"register {username!r} never succeeded within the post-restore recovery window "
        f"({attempt} attempts, {recovery_window_s}s): "
        f"last status={last['status']} body={last['body']!r}"
    )