fix(2): matrix register test — bounded readiness-retry on transient post-restore 5xx (synapse re-establishing DB pool after restore-tier DROP DATABASE); assertion unchanged, RAISEs on persistent failure

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-30 00:52:18 +01:00
parent cf54fe36a8
commit db124d5107

View File

@ -80,29 +80,65 @@ def _container_curl(domain: str, method: str, path: str, body: dict | None = Non
def _admin_register(
    domain: str,
    secret: str,
    username: str,
    password: str,
    admin: bool,
    *,
    timeout: float = 90,
    retry_delay: float = 5,
) -> dict:
    """Register a user via the shared-secret admin endpoint, called from inside the container.

    Readiness-robust: in the FULL lifecycle the custom tier runs right after the
    restore tier, which `DROP DATABASE … WITH (FORCE)` + recreates synapse's
    postgres DB (pg_backup.sh restore). Synapse is still re-establishing its DB
    connection pool in that window, so a registration (a DB *write*) can
    transiently return HTTP 500 M_UNKNOWN even though HTTP health (a read) is
    already green.

    The function therefore polls: on a 5xx or transport-error (status 0) it
    fetches a fresh nonce and re-POSTs until it gets a 200 or the recovery
    window expires. Any other non-200 status (a real rejection — bad MAC, user
    exists, policy) is NOT retried. The assertion itself is unchanged:
    registration must succeed; only the post-restore recovery window is
    tolerated.

    Args:
        domain: Passed through to `_container_curl` to select the target.
        secret: Registration shared secret, used as the HMAC-SHA1 key.
        username: Localpart of the user to create.
        password: Password for the new user.
        admin: Whether the user is registered as a server admin.
        timeout: Length of the bounded recovery window, in seconds.
        retry_delay: Pause between attempts after a transient failure, in seconds.

    Returns:
        The decoded JSON body of the successful registration response
        (an empty dict if the body was empty).

    Raises:
        AssertionError: On a non-transient rejection, a missing nonce, or when
            no attempt succeeded within the recovery window.
    """
    import time  # kept function-local, as introduced by this change

    # Statuses treated as "synapse still recovering"; 0 is what the
    # transport-error case is matched on.
    transient = (500, 502, 503, 504, 0)
    endpoint = "/_synapse/admin/v1/register"

    admin_flag = "admin" if admin else "notadmin"
    deadline = time.monotonic() + timeout  # bounded recovery window
    attempt = 0
    last = {"status": 0, "body": None, "raw": ""}

    while time.monotonic() < deadline:
        attempt += 1

        # Step 1: GET a fresh nonce (single-use; re-fetch each attempt)
        r = _container_curl(domain, "GET", endpoint)
        if r["status"] in transient:
            last = r
            print(f" [register] {username}: nonce GET transient {r['status']} "
                  f"(attempt {attempt}, synapse recovering) — retrying", flush=True)
            time.sleep(retry_delay)
            continue
        if r["status"] != 200:
            raise AssertionError(
                f"nonce GET failed: status={r['status']} raw={r['raw'][:200]!r}"
            )
        nonce = (r["body"] or {}).get("nonce")
        if not nonce:
            raise AssertionError(f"no nonce in response: {r['body']!r}")

        # Step 2: HMAC and POST
        msg = f"{nonce}\0{username}\0{password}\0{admin_flag}".encode()
        mac = hmac.new(secret.encode(), msg, hashlib.sha1).hexdigest()
        payload = {
            "nonce": nonce,
            "username": username,
            "password": password,
            "mac": mac,
            "admin": admin,
        }
        r = _container_curl(domain, "POST", endpoint, body=payload)
        if r["status"] == 200:
            if attempt > 1:
                print(f" [register] {username}: succeeded on attempt {attempt} "
                      f"(synapse recovered)", flush=True)
            return r["body"] or {}
        if r["status"] in transient:
            last = r
            print(f" [register] {username}: POST transient {r['status']} "
                  f"(attempt {attempt}, synapse recovering) — retrying", flush=True)
            time.sleep(retry_delay)
            continue
        # a 4xx is a real rejection — fail fast, do not retry
        raise AssertionError(
            f"register {username!r} rejected: status={r['status']} body={r['body']!r}"
        )

    # Window exhausted: report the last transient response seen.
    raise AssertionError(
        f"register {username!r} never succeeded within the post-restore recovery window "
        f"({attempt} attempts, {timeout:g}s): last status={last['status']} body={last['body']!r}"
    )
def _login(domain: str, username: str, password: str) -> str: