Compare commits

...

3 Commits

Author SHA1 Message Date
b96b8a4c72 fix(bluesky-pds): exec into renamed 'pds' service (pairs with recipe rename app->pds)
Some checks failed
continuous-integration/drone/push Build is failing
The recipe renames its main service app->pds so that caddy resolves THIS stack's PDS on the shared proxy
(abra drops compose network aliases, so a rename is the robust fix). Update the two exec_in_app calls
to pass service="pds" to match. The assertions are unchanged.
2026-06-18 01:58:16 +00:00
07fc6d4af5 fix(mumble): widen handshake readiness budget 60s->180s (load flake stabilization)
The TCP READY_PROBE proves 64738 is listening, but the murmur control channel needs more warmup to
complete a full TLS+ServerSync handshake; under concurrent sweep load that exceeded the 60s budget
(green in isolation, red under load). Longer budget absorbs the delay; assertions unchanged (a dead
server still fails after all retries).
2026-06-18 01:58:16 +00:00
61211dba70 fix(keycloak): collision-free canonical domain for live-warm providers; enroll keycloak
canonical_domain() routes any recipe in warm.WARM_DOMAINS (e.g. keycloak) to a distinct warm-canon-<recipe>
domain so the data-warm canonical promote can never collide with the live-warm OIDC provider at
warm-keycloak. keycloak now sets WARM_CANONICAL=True, giving it full canonical coverage without risking live SSO.
2026-06-18 01:58:16 +00:00
5 changed files with 30 additions and 11 deletions

View File

@@ -40,7 +40,17 @@ def is_enrolled(recipe: str) -> bool:
def canonical_domain(recipe: str) -> str:
"""Stable data-warm domain for the recipe's canonical."""
"""Stable data-warm domain for the recipe's canonical.
For a recipe that is ALSO a live-warm provider (in `warm.WARM_DOMAINS` — e.g. keycloak, whose
always-on shared OIDC instance lives at `warm-keycloak…`), the data-warm canonical MUST use a
DISTINCT domain: otherwise the sweep's promote deploy/teardown at `warm-<recipe>` collides with —
and could disrupt — the live shared service that other recipes (lasuite-*/drone) depend on. Give
those recipes a collision-free `warm-canon-<recipe>` namespace (a separate stack/domain that can
never touch the live provider); every other recipe keeps the plain `warm-<recipe>` scheme
(zero blast radius on the 15 existing canonicals)."""
if recipe in warm.WARM_DOMAINS:
return f"warm-canon-{recipe}.ci.commoninternet.net"
return warm.stable_domain(recipe)

View File

@@ -37,7 +37,7 @@ def _goat_admin(domain: str, args: str) -> str:
f'--admin-password "$(cat /run/secrets/pds_admin_password)" '
f"--pds-host {PDS_HOST_LOCAL} 2>&1"
)
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], timeout=120)
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="pds", timeout=120)
def account_did(domain: str) -> str | None:

View File

@@ -46,7 +46,7 @@ def _in_container(domain: str, shell_cmd: str) -> str:
"""Run `shell_cmd` inside the PDS app container via exec_in_app (sh -c wrapper)."""
# The admin_pw_flag uses $(cat ...) which only the sh inside the container can expand —
# callers pass the raw shell command including those substitutions.
return lifecycle.exec_in_app(domain, ["sh", "-c", shell_cmd], timeout=120)
return lifecycle.exec_in_app(domain, ["sh", "-c", shell_cmd], service="pds", timeout=120)
def _goat_admin(domain: str, args: str) -> str:

View File

@@ -7,10 +7,12 @@ DEPLOY_TIMEOUT = (
)
HTTP_TIMEOUT = 900
# canon §2.B EXCEPTION (recorded in DECISIONS): keycloak is NOT a data-warm canonical. It is the
# project's LIVE-WARM OIDC dep provider — an always-on shared service at the SAME stable domain a
# data-warm canonical would use (warm-keycloak.ci.commoninternet.net). Enrolling it would make the
# sweep's promote deploy/teardown collide with the live provider that lasuite-*/drone depend on for
# SSO. keycloak is instead kept current by the sweep's roll_warm_infra step (the health-gated
# warm/infra reconciler, WC1.1) — so it never lacks coverage. WARM_CANONICAL stays False.
WARM_CANONICAL = False
# phase redfix: keycloak IS now a data-warm canonical. The original canon §2.B exception de-enrolled
# it because its canonical would have used the SAME domain as the live-warm OIDC provider
# (warm-keycloak.ci.commoninternet.net), so the sweep's promote deploy/teardown would collide with the
# live service lasuite-*/drone depend on. That collision is now structurally impossible:
# `canonical.canonical_domain()` routes any recipe in `warm.WARM_DOMAINS` (keycloak) to a distinct
# `warm-canon-<recipe>` domain/stack, so the data-warm canonical and the live-warm provider are
# separate deployments that can never touch each other. keycloak therefore gets full data-warm
# canonical coverage (a real promote on its latest release) without risking the live OIDC service.
WARM_CANONICAL = True

View File

@@ -19,7 +19,14 @@ import _mumble_proto # noqa: E402
def test_handshake_completes_with_channel_presence(live_app):
r = _mumble_proto.retry_handshake(attempts=12, interval=5.0)
# Readiness budget: 36×5s = 180s. The TCP READY_PROBE (recipe_meta) only proves port 64738 is
# LISTENING; the murmur control channel needs additional warmup before it completes a full
# TLS+Version+ServerSync handshake. Under concurrent node load (the canon sweep) that warmup
# exceeded the old 60s budget and flaked this test RED, while it is reliably GREEN in isolation
# (phase redfix M1: 3× isolation green, 0 isolation reds). The longer budget absorbs the
# load-induced readiness delay WITHOUT weakening the assertion — a genuinely non-responsive
# server still exhausts all retries and FAILs (the asserts below are unchanged).
r = _mumble_proto.retry_handshake(attempts=36, interval=5.0)
assert r["tls_connect"], f"TLS connection to 127.0.0.1:64738 failed — {r.get('error')}"
assert r["server_version"] is not None, "server did not send a Version message"