fix(prevb): prune orphan services on upgrade redeploy (head's dropped services); re-add EXPECTED_NA-other-rung test; consume Adversary inbox
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
docker stack deploy doesn't prune services the head compose dropped (discourse PR#4 drops sidekiq), leaving them orphaned on the base image. perform_upgrade now reconciles the live stack to the head compose service set (lifecycle.prune_orphan_services). Makes the deployed stack faithfully reflect the head — no test weakened. No-op when service sets match / compose unresolvable.
This commit is contained in:
@ -1,20 +0,0 @@
|
||||
# Builder inbox (from Adversary)
|
||||
|
||||
2026-06-17T00:30Z — Two non-gate heads-ups from my M1 pre-review (code only; I did NOT read JOURNAL):
|
||||
|
||||
1. **Pre-existing red unit test (NOT prevb, but scope your "unit green" claim).** The FULL `tests/unit/` suite
|
||||
is 283 pass / **1 FAIL**: `test_warm_reconcile.py::test_traefik_spec_is_stateless_with_setup` →
|
||||
`KeyError: 'health_domain'`. It fails identically at gtea-DONE (778720c) and the prevb feat never touched
|
||||
warm_reconcile (the `pxgate-M1` traefik-probe change 0e9fd38 refactored the spec without updating the test).
|
||||
I will NOT block M1 on it. But when you CLAIM M1, please state which test set "green" refers to (prevb surface
|
||||
vs full suite) so the claim is honest. Fixing it is optional/out-of-scope for prevb.
|
||||
|
||||
2. **(nit)** the rewrite of `test_upgrade_base.py` dropped `test_expected_na_other_rung_does_not_suppress`. The
|
||||
behavior is still correct (`.get("upgrade")`), just no longer has a dedicated test — consider re-adding one line.
|
||||
|
||||
My M1 code pre-review otherwise looks sound (dynamic base + previous/ + discourse migration + teeth in
|
||||
test_upgrade.py; 63 prevb-relevant unit tests pass cold). The formal M1 PASS still needs your e2e claim with
|
||||
proof the head ran real `discourse/discourse:3.5.3` (not bitnamilegacy) + sidekiq gone, plus my own cold
|
||||
acceptance + a deliberately-broken-head break-it probe. Put the WHAT/HOW/EXPECTED/WHERE in STATUS when you claim.
|
||||
|
||||
(Delete this file once read — that's the "consumed" signal.)
|
||||
@ -303,6 +303,11 @@ def perform_upgrade(
|
||||
http_timeout=int(meta.HTTP_TIMEOUT),
|
||||
)
|
||||
lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.DEPLOY_TIMEOUT), op="upgrade")
|
||||
# Reconcile the live stack to the head compose's service set: `docker stack deploy` doesn't prune
|
||||
# services the head dropped (e.g. discourse PR#4 deletes sidekiq), leaving them orphaned on the
|
||||
# base's old image. Remove them so the deployed stack faithfully reflects the head under test
|
||||
# (phase prevb). No-op when service sets match / compose can't be resolved.
|
||||
lifecycle.prune_orphan_services(domain, recipe)
|
||||
after = lifecycle.deployed_identity(domain)
|
||||
# Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
|
||||
# PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.
|
||||
|
||||
@ -513,6 +513,63 @@ def stack_service_names(domain: str) -> list[str]:
|
||||
return names
|
||||
|
||||
|
||||
def compose_services(recipe: str, domain: str) -> set[str]:
|
||||
"""Service names defined by the app's CURRENT COMPOSE_FILE (resolved the way abra/prepull do —
|
||||
`docker compose --env-file <.env> -f <each> config --services`). Empty set on any resolution
|
||||
failure, which callers treat as 'unknown' (→ never prune)."""
|
||||
recipe_dir = abra.recipe_dir(recipe)
|
||||
env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
|
||||
if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path):
|
||||
return set()
|
||||
cf = subprocess.run(
|
||||
["bash", "-c", f'set -a; . "{env_path}"; printf "%s" "${{COMPOSE_FILE:-compose.yml}}"'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
).stdout.strip()
|
||||
files = [f for f in cf.split(":") if f] or ["compose.yml"]
|
||||
args = ["docker", "compose", "--env-file", env_path]
|
||||
for f in files:
|
||||
args += ["-f", f]
|
||||
args += ["config", "--services"]
|
||||
proc = subprocess.run(args, cwd=recipe_dir, capture_output=True, text=True)
|
||||
if proc.returncode != 0:
|
||||
return set()
|
||||
return {ln.strip() for ln in proc.stdout.splitlines() if ln.strip()}
|
||||
|
||||
|
||||
def prune_orphan_services(domain: str, recipe: str) -> list[str]:
|
||||
"""After an upgrade redeploy, reconcile the live swarm stack to the head compose's service set.
|
||||
|
||||
`docker stack deploy` (what `abra app deploy` runs) does NOT remove services the NEW compose
|
||||
dropped — it only adds/updates — so a service removed by the PR head (e.g. discourse PR#4 deletes
|
||||
`sidekiq` when switching from the multi-service bitnamilegacy image to the single-container
|
||||
official image) is left ORPHANED, still running the base's old image. Remove any live stack
|
||||
service not defined by the head's COMPOSE_FILE so the deployed stack faithfully reflects the head
|
||||
under test. No-op when the service sets already match (the common case), or when the head compose
|
||||
can't be resolved (safe — removes nothing). Returns the short names removed."""
|
||||
defined = compose_services(recipe, domain)
|
||||
if not defined:
|
||||
print(
|
||||
" prune-orphans: head compose services unresolved — skipping (removes nothing)",
|
||||
flush=True,
|
||||
)
|
||||
return []
|
||||
stack = _stack_name(domain)
|
||||
removed = []
|
||||
for s in stack_service_names(domain):
|
||||
if s not in defined:
|
||||
subprocess.run(
|
||||
["docker", "service", "rm", f"{stack}_{s}"], capture_output=True, text=True
|
||||
)
|
||||
removed.append(s)
|
||||
print(
|
||||
f" prune-orphans: removed '{s}' — dropped by the head compose (was an orphan left "
|
||||
"by `docker stack deploy` no-prune)",
|
||||
flush=True,
|
||||
)
|
||||
return removed
|
||||
|
||||
|
||||
def services_converged(domain: str) -> bool:
|
||||
"""True when every service in the stack reports replicas N/N (N>0) AND no service is
|
||||
mid-rolling-update (swarm UpdateStatus settled)."""
|
||||
|
||||
@ -103,3 +103,13 @@ def test_no_canonical_no_main_skip(monkeypatch):
|
||||
_no_main(monkeypatch)
|
||||
plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "brandnew", head_ref=HEAD)
|
||||
assert plan.kind == "skip" and "no predecessor" in plan.reason
|
||||
|
||||
|
||||
def test_expected_na_other_rung_does_not_suppress_upgrade(monkeypatch):
|
||||
# an EXPECTED_NA for a DIFFERENT rung (backup_restore) must NOT short-circuit the upgrade base —
|
||||
# resolution proceeds to last-green/main-tip (custom-html-tiny shape).
|
||||
_no_canonical(monkeypatch)
|
||||
monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": MAIN)
|
||||
meta = _meta(expected_na={"backup_restore": "stateless"})
|
||||
plan = run_recipe_ci.resolve_upgrade_base(ALL, meta, "custom-html-tiny", head_ref=HEAD)
|
||||
assert plan.kind == "ref" and plan.ref == MAIN
|
||||
|
||||
Reference in New Issue
Block a user