Compare commits
3 Commits
main
...
redfix-m2-
| Author | SHA1 | Date | |
|---|---|---|---|
| | b96b8a4c72 | | |
| | 07fc6d4af5 | | |
| | 61211dba70 | | |
@@ -40,7 +40,17 @@ def is_enrolled(recipe: str) -> bool:
|
||||
|
||||
|
||||
def canonical_domain(recipe: str) -> str:
|
||||
"""Stable data-warm domain for the recipe's canonical."""
|
||||
"""Stable data-warm domain for the recipe's canonical.
|
||||
|
||||
For a recipe that is ALSO a live-warm provider (in `warm.WARM_DOMAINS` — e.g. keycloak, whose
|
||||
always-on shared OIDC instance lives at `warm-keycloak…`), the data-warm canonical MUST use a
|
||||
DISTINCT domain: otherwise the sweep's promote deploy/teardown at `warm-<recipe>` collides with —
|
||||
and could disrupt — the live shared service that other recipes (lasuite-*/drone) depend on. Give
|
||||
those recipes a collision-free `warm-canon-<recipe>` namespace (a separate stack/domain that can
|
||||
never touch the live provider); every other recipe keeps the plain `warm-<recipe>` scheme
|
||||
(zero blast radius on the 15 existing canonicals)."""
|
||||
if recipe in warm.WARM_DOMAINS:
|
||||
return f"warm-canon-{recipe}.ci.commoninternet.net"
|
||||
return warm.stable_domain(recipe)
|
||||
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ def _goat_admin(domain: str, args: str) -> str:
|
||||
f'--admin-password "$(cat /run/secrets/pds_admin_password)" '
|
||||
f"--pds-host {PDS_HOST_LOCAL} 2>&1"
|
||||
)
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], timeout=120)
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="pds", timeout=120)
|
||||
|
||||
|
||||
def account_did(domain: str) -> str | None:
|
||||
|
||||
@@ -46,7 +46,7 @@ def _in_container(domain: str, shell_cmd: str) -> str:
|
||||
"""Run `shell_cmd` inside the PDS app container via exec_in_app (sh -c wrapper)."""
|
||||
# The admin_pw_flag uses $(cat ...) which only the sh inside the container can expand —
|
||||
# callers pass the raw shell command including those substitutions.
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", shell_cmd], timeout=120)
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", shell_cmd], service="pds", timeout=120)
|
||||
|
||||
|
||||
def _goat_admin(domain: str, args: str) -> str:
|
||||
|
||||
@@ -7,10 +7,12 @@ DEPLOY_TIMEOUT = (
|
||||
)
|
||||
HTTP_TIMEOUT = 900
|
||||
|
||||
# canon §2.B EXCEPTION (recorded in DECISIONS): keycloak is NOT a data-warm canonical. It is the
|
||||
# project's LIVE-WARM OIDC dep provider — an always-on shared service at the SAME stable domain a
|
||||
# data-warm canonical would use (warm-keycloak.ci.commoninternet.net). Enrolling it would make the
|
||||
# sweep's promote deploy/teardown collide with the live provider that lasuite-*/drone depend on for
|
||||
# SSO. keycloak is instead kept current by the sweep's roll_warm_infra step (the health-gated
|
||||
# warm/infra reconciler, WC1.1) — so it never lacks coverage. WARM_CANONICAL stays False.
|
||||
WARM_CANONICAL = False
|
||||
# phase redfix: keycloak IS now a data-warm canonical. The original canon §2.B exception de-enrolled
|
||||
# it because its canonical would have used the SAME domain as the live-warm OIDC provider
|
||||
# (warm-keycloak.ci.commoninternet.net), so the sweep's promote deploy/teardown would collide with the
|
||||
# live service lasuite-*/drone depend on. That collision is now structurally impossible:
|
||||
# `canonical.canonical_domain()` routes any recipe in `warm.WARM_DOMAINS` (keycloak) to a distinct
|
||||
# `warm-canon-<recipe>` domain/stack, so the data-warm canonical and the live-warm provider are
|
||||
# separate deployments that can never touch each other. keycloak therefore gets full data-warm
|
||||
# canonical coverage (a real promote on its latest release) without risking the live OIDC service.
|
||||
WARM_CANONICAL = True
|
||||
|
||||
@@ -19,7 +19,14 @@ import _mumble_proto # noqa: E402
|
||||
|
||||
|
||||
def test_handshake_completes_with_channel_presence(live_app):
|
||||
r = _mumble_proto.retry_handshake(attempts=12, interval=5.0)
|
||||
# Readiness budget: 36×5s = 180s. The TCP READY_PROBE (recipe_meta) only proves port 64738 is
|
||||
# LISTENING; the murmur control channel needs additional warmup before it completes a full
|
||||
# TLS+Version+ServerSync handshake. Under concurrent node load (the canon sweep) that warmup
|
||||
# exceeded the old 60s budget and flaked this test RED, while it is reliably GREEN in isolation
|
||||
# (phase redfix M1: 3× isolation green, 0 isolation reds). The longer budget absorbs the
|
||||
# load-induced readiness delay WITHOUT weakening the assertion — a genuinely non-responsive
|
||||
# server still exhausts all retries and FAILs (the asserts below are unchanged).
|
||||
r = _mumble_proto.retry_handshake(attempts=36, interval=5.0)
|
||||
|
||||
assert r["tls_connect"], f"TLS connection to 127.0.0.1:64738 failed — {r.get('error')}"
|
||||
assert r["server_version"] is not None, "server did not send a Version message"
|
||||
|
||||
Reference in New Issue
Block a user