fix(prevb): prune orphan services on upgrade redeploy (head's dropped services); re-add EXPECTED_NA-other-rung test; consume Adversary inbox
All checks were successful
continuous-integration/drone/push Build is passing

docker stack deploy doesn't prune services the head compose dropped (discourse PR#4 drops sidekiq),
leaving them orphaned on the base image. perform_upgrade now reconciles the live stack to the head
compose service set (lifecycle.prune_orphan_services). Makes the deployed stack faithfully reflect
the head — no test weakened. No-op when service sets match / compose unresolvable.
This commit is contained in:
autonomic-bot
2026-06-17 00:29:00 +00:00
parent 7f3e7c26f6
commit e1b32ea650
4 changed files with 72 additions and 20 deletions

View File

@ -303,6 +303,11 @@ def perform_upgrade(
http_timeout=int(meta.HTTP_TIMEOUT),
)
lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.DEPLOY_TIMEOUT), op="upgrade")
# Reconcile the live stack to the head compose's service set: `docker stack deploy` doesn't prune
# services the head dropped (e.g. discourse PR#4 deletes sidekiq), leaving them orphaned on the
# base's old image. Remove them so the deployed stack faithfully reflects the head under test
# (phase prevb). No-op when service sets match / compose can't be resolved.
lifecycle.prune_orphan_services(domain, recipe)
after = lifecycle.deployed_identity(domain)
# Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
# PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.

View File

@ -513,6 +513,63 @@ def stack_service_names(domain: str) -> list[str]:
return names
def compose_services(recipe: str, domain: str) -> set[str]:
"""Service names defined by the app's CURRENT COMPOSE_FILE (resolved the way abra/prepull do —
`docker compose --env-file <.env> -f <each> config --services`). Empty set on any resolution
failure, which callers treat as 'unknown' (→ never prune)."""
recipe_dir = abra.recipe_dir(recipe)
env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path):
return set()
cf = subprocess.run(
["bash", "-c", f'set -a; . "{env_path}"; printf "%s" "${{COMPOSE_FILE:-compose.yml}}"'],
capture_output=True,
text=True,
).stdout.strip()
files = [f for f in cf.split(":") if f] or ["compose.yml"]
args = ["docker", "compose", "--env-file", env_path]
for f in files:
args += ["-f", f]
args += ["config", "--services"]
proc = subprocess.run(args, cwd=recipe_dir, capture_output=True, text=True)
if proc.returncode != 0:
return set()
return {ln.strip() for ln in proc.stdout.splitlines() if ln.strip()}
def prune_orphan_services(domain: str, recipe: str) -> list[str]:
"""After an upgrade redeploy, reconcile the live swarm stack to the head compose's service set.
`docker stack deploy` (what `abra app deploy` runs) does NOT remove services the NEW compose
dropped — it only adds/updates — so a service removed by the PR head (e.g. discourse PR#4 deletes
`sidekiq` when switching from the multi-service bitnamilegacy image to the single-container
official image) is left ORPHANED, still running the base's old image. Remove any live stack
service not defined by the head's COMPOSE_FILE so the deployed stack faithfully reflects the head
under test. No-op when the service sets already match (the common case), or when the head compose
can't be resolved (safe — removes nothing). Returns the short names removed."""
defined = compose_services(recipe, domain)
if not defined:
print(
" prune-orphans: head compose services unresolved — skipping (removes nothing)",
flush=True,
)
return []
stack = _stack_name(domain)
removed = []
for s in stack_service_names(domain):
if s not in defined:
subprocess.run(
["docker", "service", "rm", f"{stack}_{s}"], capture_output=True, text=True
)
removed.append(s)
print(
f" prune-orphans: removed '{s}' — dropped by the head compose (was an orphan left "
"by `docker stack deploy` no-prune)",
flush=True,
)
return removed
def services_converged(domain: str) -> bool:
"""True when every service in the stack reports replicas N/N (N>0) AND no service is
mid-rolling-update (swarm UpdateStatus settled)."""