fix(2): F2-12 lasuite-drive upgrade tier — own convergence wait (abra -c) + collabora READY_PROBE
Adversary cold-verify FAILed Q3.2 (F2-12): the prev→PR-head chaos upgrade's abra converge monitor FATAs while the NEW collabora 25.04.9.4.1's healthcheck is still in start_period (jail/config init), even though it converges given swarm's healthcheck retries. My WOPI pre-gate fixed the OLD collabora being killed mid-boot but not the NEW collabora's convergence. Flaky (3x green for me, 1x fail cold).

Fix (cc-ci-side, stronger verification — not weaker):
- abra.deploy gains no_converge_checks (`-c`); chaos_redeploy passes it for the upgrade op so abra's impatient monitor no longer FATAs (the stack spec is applied regardless).
- perform_upgrade now OWNS the convergence verification after the redeploy: wait_healthy (services N/N + app HEALTH_PATH) + new lifecycle.wait_ready_probes (recipe READY_PROBE), bounded by the recipe DEPLOY_TIMEOUT (generous) not abra's impatient window. meta threaded _perform_op→perform_upgrade.
- recipe_meta READY_PROBE hook (added to _load_meta whitelist): lasuite-drive probes collabora WOPI discovery (/hosting/discovery on collabora-<domain>) → 200. Called after install deploy AND after the upgrade redeploy. No-op for recipes without a READY_PROBE.

NOT re-claiming yet — validating the upgrade tier is now reliably green (incl. the slow-collabora crossover) across multiple runs before re-claiming Q3.2. F2-12 stays open (Adversary-owned).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -117,10 +117,17 @@ def secret_generate(domain: str, timeout: int = 300) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def deploy(domain: str, chaos: bool = True, timeout: int = 900) -> None:
|
def deploy(domain: str, chaos: bool = True, timeout: int = 900, no_converge_checks: bool = False) -> None:
|
||||||
args = ["app", "deploy", domain, "-o", "-n"]
|
args = ["app", "deploy", domain, "-o", "-n"]
|
||||||
if chaos:
|
if chaos:
|
||||||
args.append("-C")
|
args.append("-C")
|
||||||
|
if no_converge_checks:
|
||||||
|
# `-c`: skip abra's own post-deploy convergence monitor. Used by the upgrade chaos redeploy
|
||||||
|
# of heavy stacks (lasuite-drive): abra's monitor FATAs while a slow service (collabora's
|
||||||
|
# new-version jail/config init) is still becoming healthy, even though it converges given
|
||||||
|
# time. The caller then performs its OWN, stricter convergence+health wait (services N/N +
|
||||||
|
# app health + recipe READY_PROBE) with a generous deadline — see lifecycle.chaos_redeploy.
|
||||||
|
args.append("-c")
|
||||||
_run(args, timeout=timeout)
|
_run(args, timeout=timeout)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -215,7 +215,7 @@ def assert_restore_healthy(domain: str, meta: dict) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def perform_upgrade(
|
def perform_upgrade(
|
||||||
domain: str, recipe: str, head_ref: str | None, deploy_timeout: int = 900
|
domain: str, recipe: str, head_ref: str | None, deploy_timeout: int = 900, meta: dict | None = None
|
||||||
) -> dict[str, str | None]:
|
) -> dict[str, str | None]:
|
||||||
"""Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1): re-checkout the
|
"""Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1): re-checkout the
|
||||||
PR head (the prev-tag base deploy reset the recipe working tree), then `abra app deploy --chaos`
|
PR head (the prev-tag base deploy reset the recipe working tree), then `abra app deploy --chaos`
|
||||||
@ -225,11 +225,28 @@ def perform_upgrade(
|
|||||||
— after the chaos deploy the `chaos`(-version) label carries the PR-head commit, proving it.
|
— after the chaos deploy the `chaos`(-version) label carries the PR-head commit, proving it.
|
||||||
|
|
||||||
`deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the chaos redeploy so a heavy stack's
|
`deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the chaos redeploy so a heavy stack's
|
||||||
reconverge isn't SIGKILLed by abra.deploy's 900s default mid-wait."""
|
reconverge isn't SIGKILLed by abra.deploy's 900s default mid-wait.
|
||||||
|
|
||||||
|
F2-12: the chaos redeploy runs with `--no-converge-checks` (abra's own convergence monitor FATAs
|
||||||
|
on the heavy lasuite-drive prev→PR-head crossover while the NEW collabora's healthcheck is still
|
||||||
|
in its start_period, even though it converges given swarm's healthcheck retries). We then own a
|
||||||
|
STRICTER convergence+health wait here: services N/N (wait_healthy) + app HEALTH_PATH healthy +
|
||||||
|
any recipe READY_PROBE (collabora WOPI discovery 200). This bounds readiness by OUR generous
|
||||||
|
deadline, not abra's impatient one — and is stronger evidence than abra's monitor."""
|
||||||
|
meta = meta or {}
|
||||||
before = lifecycle.deployed_identity(domain)
|
before = lifecycle.deployed_identity(domain)
|
||||||
if head_ref:
|
if head_ref:
|
||||||
lifecycle.recipe_checkout_ref(recipe, head_ref)
|
lifecycle.recipe_checkout_ref(recipe, head_ref)
|
||||||
lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout)
|
lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
|
||||||
|
# Own the convergence verification (abra's monitor was skipped via -c).
|
||||||
|
lifecycle.wait_healthy(
|
||||||
|
domain,
|
||||||
|
ok_codes=tuple(meta.get("HEALTH_OK", (200, 301, 302))),
|
||||||
|
path=meta.get("HEALTH_PATH", "/"),
|
||||||
|
deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout)),
|
||||||
|
http_timeout=int(meta.get("HTTP_TIMEOUT", 300)),
|
||||||
|
)
|
||||||
|
lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout)))
|
||||||
after = lifecycle.deployed_identity(domain)
|
after = lifecycle.deployed_identity(domain)
|
||||||
# Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
|
# Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
|
||||||
# PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.
|
# PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.
|
||||||
|
|||||||
@ -316,7 +316,7 @@ def recipe_checkout_ref(recipe: str, ref: str) -> None:
|
|||||||
abra.recipe_checkout(recipe, ref)
|
abra.recipe_checkout(recipe, ref)
|
||||||
|
|
||||||
|
|
||||||
def chaos_redeploy(domain: str, deploy_timeout: int = 900) -> None:
|
def chaos_redeploy(domain: str, deploy_timeout: int = 900, no_converge_checks: bool = False) -> None:
|
||||||
"""In-place `abra app deploy --chaos`: redeploy the running app at the CURRENT recipe checkout
|
"""In-place `abra app deploy --chaos`: redeploy the running app at the CURRENT recipe checkout
|
||||||
(HC1: the PR-head code under test). This is the upgrade op, not a fresh install — it does NOT go
|
(HC1: the PR-head code under test). This is the upgrade op, not a fresh install — it does NOT go
|
||||||
through deploy_app, so the deploy-count guard (DG4.1) is not incremented.
|
through deploy_app, so the deploy-count guard (DG4.1) is not incremented.
|
||||||
@ -324,8 +324,45 @@ def chaos_redeploy(domain: str, deploy_timeout: int = 900) -> None:
|
|||||||
`deploy_timeout` is the abra subprocess wrapper timeout; pass the recipe's DEPLOY_TIMEOUT so a
|
`deploy_timeout` is the abra subprocess wrapper timeout; pass the recipe's DEPLOY_TIMEOUT so a
|
||||||
heavy stack's reconverge (e.g. lasuite-drive's slow collabora/onlyoffice boot) isn't SIGKILLed
|
heavy stack's reconverge (e.g. lasuite-drive's slow collabora/onlyoffice boot) isn't SIGKILLed
|
||||||
by the 900s default while abra is still legitimately waiting (its internal TIMEOUT can be larger
|
by the 900s default while abra is still legitimately waiting (its internal TIMEOUT can be larger
|
||||||
via the .env). Mirrors the install deploy_app timeout plumbing."""
|
via the .env). Mirrors the install deploy_app timeout plumbing.
|
||||||
abra.deploy(domain, chaos=True, timeout=deploy_timeout)
|
|
||||||
|
`no_converge_checks` (`abra … -c`): skip abra's own convergence monitor — the caller then owns a
|
||||||
|
stricter convergence+health wait (F2-12: abra FATAs on the heavy lasuite-drive prev→PR-head
|
||||||
|
crossover while the new collabora's healthcheck is still in its start_period, even though it
|
||||||
|
converges given swarm's healthcheck retries). The stack spec IS applied either way (docker stack
|
||||||
|
deploy runs before the monitor)."""
|
||||||
|
abra.deploy(domain, chaos=True, timeout=deploy_timeout, no_converge_checks=no_converge_checks)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None:
    """Poll a recipe's optional READY_PROBE endpoints until each returns an accepted status, or raise.

    A recipe_meta may define `READY_PROBE(domain) -> [{"host":..., "path":..., "ok":(200,)}, ...]`
    for readiness signals NOT captured by container-replica convergence or the app's HEALTH_PATH —
    e.g. lasuite-drive's collabora WOPI discovery (`/hosting/discovery` on the collabora sibling
    host): swarm reports collabora 1/1 'running' while coolwsd is still doing jail/config init and
    its discovery endpoint 404s, so replica-convergence alone is not real readiness. Used after the
    install deploy and after the upgrade chaos redeploy so 'reconverged' means genuinely ready.

    Args:
        meta: recipe metadata; only the optional `READY_PROBE` entry is read. If it is missing or
            not callable this function is a no-op.
        domain: app domain, passed unchanged to `READY_PROBE(domain)`.
        timeout: seconds allowed PER PROBE (each probe gets its own budget; probes run in order).

    Raises:
        TimeoutError: a probe did not return one of its accepted statuses within `timeout` seconds.
        KeyError: a probe dict has no `"host"` entry.
    """
    probe_fn = meta.get("READY_PROBE")
    if not callable(probe_fn):
        return
    # A READY_PROBE returning None/empty means "nothing to wait for".
    for probe in probe_fn(domain) or []:
        host = probe["host"]
        path = probe.get("path", "/")
        ok = tuple(probe.get("ok", (200,)))
        # Monotonic clock: the deadline must not move if the wall clock is stepped (NTP, manual
        # adjustment) during a wait that can last many minutes.
        deadline = time.monotonic() + timeout
        while True:
            # Always probe at least once, even with timeout <= 0, so the reported "last status"
            # is a real observation. NOTE(review): presumably http_get returns the HTTP status as
            # an int — confirm against its definition.
            last = http_get(host, path, timeout=15)
            if last in ok:
                print(f"  ready-probe OK ({last}): https://{host}{path}", flush=True)
                break
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"READY_PROBE not ready: https://{host}{path} (last status {last}) within {timeout}s"
                )
            # Never sleep past the deadline; the final probe happens right at the budget's edge.
            time.sleep(min(5, remaining))
|
||||||
|
|
||||||
|
|
||||||
def backup_app(domain: str) -> str:
|
def backup_app(domain: str) -> str:
|
||||||
|
|||||||
@ -194,7 +194,7 @@ def _load_meta(recipe: str) -> dict:
|
|||||||
ns: dict = {}
|
ns: dict = {}
|
||||||
with open(path) as fh:
|
with open(path) as fh:
|
||||||
exec(compile(fh.read(), path, "exec"), ns) # noqa: S102 (trusted, in-repo)
|
exec(compile(fh.read(), path, "exec"), ns) # noqa: S102 (trusted, in-repo)
|
||||||
for k in list(meta) + ["BACKUP_CAPABLE", "SKIP_GENERIC", "OIDC_AT_INSTALL"]:
|
for k in list(meta) + ["BACKUP_CAPABLE", "SKIP_GENERIC", "OIDC_AT_INSTALL", "READY_PROBE"]:
|
||||||
if k in ns:
|
if k in ns:
|
||||||
meta[k] = ns[k]
|
meta[k] = ns[k]
|
||||||
return meta
|
return meta
|
||||||
@ -240,15 +240,17 @@ def _run_pre_hook(recipe: str, op: str, repo_local: str | None, domain: str, met
|
|||||||
|
|
||||||
|
|
||||||
def _perform_op(
|
def _perform_op(
|
||||||
op: str, domain: str, recipe: str, head_ref: str | None, op_state: dict, deploy_timeout: int = 900
|
op: str, domain: str, recipe: str, head_ref: str | None, op_state: dict, deploy_timeout: int = 900,
|
||||||
|
meta: dict | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Perform the single mutating op ONCE (the harness owns the op, HC3). install has no op. Records
|
"""Perform the single mutating op ONCE (the harness owns the op, HC3). install has no op. Records
|
||||||
what the assertions need (pre-upgrade identity, backup snapshot_id) into op_state. None of these
|
what the assertions need (pre-upgrade identity, backup snapshot_id) into op_state. None of these
|
||||||
call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos upgrade is not a
|
call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos upgrade is not a
|
||||||
new install (HC1 reconciliation). `deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the
|
new install (HC1 reconciliation). `deploy_timeout` (recipe DEPLOY_TIMEOUT) is plumbed to the
|
||||||
upgrade chaos redeploy so a heavy reconverge isn't SIGKILLed by the 900s default mid-wait."""
|
upgrade chaos redeploy so a heavy reconverge isn't SIGKILLed by the 900s default mid-wait; `meta`
|
||||||
|
lets the upgrade op own a recipe-aware convergence+health wait (F2-12, READY_PROBE)."""
|
||||||
if op == "upgrade":
|
if op == "upgrade":
|
||||||
before = generic.perform_upgrade(domain, recipe, head_ref, deploy_timeout=deploy_timeout)
|
before = generic.perform_upgrade(domain, recipe, head_ref, deploy_timeout=deploy_timeout, meta=meta)
|
||||||
op_state["upgrade"] = {"before": before, "head_ref": head_ref}
|
op_state["upgrade"] = {"before": before, "head_ref": head_ref}
|
||||||
elif op == "backup":
|
elif op == "backup":
|
||||||
op_state["backup"] = {"snapshot_id": generic.perform_backup(domain)}
|
op_state["backup"] = {"snapshot_id": generic.perform_backup(domain)}
|
||||||
@ -290,7 +292,10 @@ def run_lifecycle_tier(
|
|||||||
# 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure → tier fail.
|
# 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure → tier fail.
|
||||||
try:
|
try:
|
||||||
_run_pre_hook(recipe, op, repo_local, domain, meta)
|
_run_pre_hook(recipe, op, repo_local, domain, meta)
|
||||||
_perform_op(op, domain, recipe, head_ref, op_state, deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", 900)))
|
_perform_op(
|
||||||
|
op, domain, recipe, head_ref, op_state,
|
||||||
|
deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", 900)), meta=meta,
|
||||||
|
)
|
||||||
with open(os.environ["CCCI_OP_STATE_FILE"], "w") as f:
|
with open(os.environ["CCCI_OP_STATE_FILE"], "w") as f:
|
||||||
json.dump(op_state, f)
|
json.dump(op_state, f)
|
||||||
except Exception as e: # noqa: BLE001 — a failed op is a reported tier failure, not a crash
|
except Exception as e: # noqa: BLE001 — a failed op is a reported tier failure, not a crash
|
||||||
@ -801,6 +806,9 @@ def main() -> int:
|
|||||||
deploy_timeout=meta["DEPLOY_TIMEOUT"],
|
deploy_timeout=meta["DEPLOY_TIMEOUT"],
|
||||||
http_timeout=meta["HTTP_TIMEOUT"],
|
http_timeout=meta["HTTP_TIMEOUT"],
|
||||||
)
|
)
|
||||||
|
# Recipe READY_PROBE (e.g. lasuite-drive collabora WOPI discovery) — readiness beyond
|
||||||
|
# replica convergence + app HEALTH_PATH; no-op for recipes without one.
|
||||||
|
lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.get("DEPLOY_TIMEOUT", 900)))
|
||||||
deploy_ok = True
|
deploy_ok = True
|
||||||
except Exception as e: # noqa: BLE001 — a failed deploy is a reported INSTALL failure
|
except Exception as e: # noqa: BLE001 — a failed deploy is a reported INSTALL failure
|
||||||
print(f"!! deploy/readiness failed: {e}", flush=True)
|
print(f"!! deploy/readiness failed: {e}", flush=True)
|
||||||
|
|||||||
@ -34,6 +34,17 @@ DEPS = ["keycloak"]
|
|||||||
OIDC_AT_INSTALL = True
|
OIDC_AT_INSTALL = True
|
||||||
|
|
||||||
|
|
||||||
|
def READY_PROBE(domain):
    """Readiness signals beyond replica-convergence + the app HEALTH_PATH (Q3.2/F2-12). collabora's
    coolwsd reports its container 1/1 'running' while still doing jail/config init, and its WOPI
    discovery endpoint 404s until ready — so the harness waits for `/hosting/discovery` → 200 on the
    collabora sibling host after the install deploy AND after the upgrade chaos redeploy. This is what
    makes the heavy prev→PR-head crossover reliably green (the new collabora 25.04.9.x finishes init
    within swarm's healthcheck retries; abra's own converge monitor was too impatient — F2-12).

    Returns a list with one probe dict: `host` is `collabora-<domain>` (the full app domain with a
    `collabora-` prefix), `path` is `/hosting/discovery`, and `ok` is the tuple of accepted HTTP
    statuses, `(200,)`."""
    return [{"host": f"collabora-{domain}", "path": "/hosting/discovery", "ok": (200,)}]
|
||||||
|
|
||||||
|
|
||||||
def EXTRA_ENV(domain):
|
def EXTRA_ENV(domain):
|
||||||
# Two of lasuite-drive's services route on DOMAIN-DERIVED **nested** subdomains —
|
# Two of lasuite-drive's services route on DOMAIN-DERIVED **nested** subdomains —
|
||||||
# `MINIO_DOMAIN="minio.${DOMAIN}"` and `COLLABORA_DOMAIN="collabora.${DOMAIN}"`. The cc-ci
|
# `MINIO_DOMAIN="minio.${DOMAIN}"` and `COLLABORA_DOMAIN="collabora.${DOMAIN}"`. The cc-ci
|
||||||
|
|||||||
Reference in New Issue
Block a user