diff --git a/machine-docs/BUILDER-INBOX.md b/machine-docs/BUILDER-INBOX.md deleted file mode 100644 index 5bdb033..0000000 --- a/machine-docs/BUILDER-INBOX.md +++ /dev/null @@ -1,20 +0,0 @@ -# Builder inbox (from Adversary) - -2026-06-17T00:30Z — Two non-gate heads-ups from my M1 pre-review (code only; I did NOT read JOURNAL): - -1. **Pre-existing red unit test (NOT prevb, but scope your "unit green" claim).** The FULL `tests/unit/` suite - is 283 pass / **1 FAIL**: `test_warm_reconcile.py::test_traefik_spec_is_stateless_with_setup` → - `KeyError: 'health_domain'`. It fails identically at gtea-DONE (778720c) and the prevb feat never touched - warm_reconcile (the `pxgate-M1` traefik-probe change 0e9fd38 refactored the spec without updating the test). - I will NOT block M1 on it. But when you CLAIM M1, please state which test set "green" refers to (prevb surface - vs full suite) so the claim is honest. Fixing it is optional/out-of-scope for prevb. - -2. **(nit)** the rewrite of `test_upgrade_base.py` dropped `test_expected_na_other_rung_does_not_suppress`. The - behavior is still correct (`.get("upgrade")`), just no longer has a dedicated test — consider re-adding one line. - -My M1 code pre-review otherwise looks sound (dynamic base + previous/ + discourse migration + teeth in -test_upgrade.py; 63 prevb-relevant unit tests pass cold). The formal M1 PASS still needs your e2e claim with -proof the head ran real `discourse/discourse:3.5.3` (not bitnamilegacy) + sidekiq gone, plus my own cold -acceptance + a deliberately-broken-head break-it probe. Put the WHAT/HOW/EXPECTED/WHERE in STATUS when you claim. - -(Delete this file once read — that's the "consumed" signal.) diff --git a/runner/harness/generic.py b/runner/harness/generic.py index 700ae5c..d548bff 100644 --- a/runner/harness/generic.py +++ b/runner/harness/generic.py @@ -303,6 +303,11 @@ def perform_upgrade( http_timeout=int(meta.HTTP_TIMEOUT), ) lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.DEPLOY_TIMEOUT), op="upgrade") + # Reconcile the live stack to the head compose's service set: `docker stack deploy` doesn't prune + # services the head dropped (e.g. discourse PR#4 deletes sidekiq), leaving them orphaned on the + # base's old image. Remove them so the deployed stack faithfully reflects the head under test + # (phase prevb). No-op when service sets match / compose can't be resolved. + lifecycle.prune_orphan_services(domain, recipe) after = lifecycle.deployed_identity(domain) # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the # PR-head we checked out — proving the upgrade deployed the code under test, not a published tag. diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index e063e5e..583e4bf 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -513,6 +513,63 @@ def stack_service_names(domain: str) -> list[str]: return names +def compose_services(recipe: str, domain: str) -> set[str]: + """Service names defined by the app's CURRENT COMPOSE_FILE (resolved the way abra/prepull do — + `docker compose --env-file <.env> -f config --services`). Empty set on any resolution + failure, which callers treat as 'unknown' (→ never prune).""" + recipe_dir = abra.recipe_dir(recipe) + env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env") + if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path): + return set() + cf = subprocess.run( + ["bash", "-c", f'set -a; . "{env_path}"; printf "%s" "${{COMPOSE_FILE:-compose.yml}}"'], + capture_output=True, + text=True, + ).stdout.strip() + files = [f for f in cf.split(":") if f] or ["compose.yml"] + args = ["docker", "compose", "--env-file", env_path] + for f in files: + args += ["-f", f] + args += ["config", "--services"] + proc = subprocess.run(args, cwd=recipe_dir, capture_output=True, text=True) + if proc.returncode != 0: + return set() + return {ln.strip() for ln in proc.stdout.splitlines() if ln.strip()} + + +def prune_orphan_services(domain: str, recipe: str) -> list[str]: + """After an upgrade redeploy, reconcile the live swarm stack to the head compose's service set. + + `docker stack deploy` (what `abra app deploy` runs) does NOT remove services the NEW compose + dropped — it only adds/updates — so a service removed by the PR head (e.g. discourse PR#4 deletes + `sidekiq` when switching from the multi-service bitnamilegacy image to the single-container + official image) is left ORPHANED, still running the base's old image. Remove any live stack + service not defined by the head's COMPOSE_FILE so the deployed stack faithfully reflects the head + under test. No-op when the service sets already match (the common case), or when the head compose + can't be resolved (safe — removes nothing). Returns the short names removed.""" + defined = compose_services(recipe, domain) + if not defined: + print( + " prune-orphans: head compose services unresolved — skipping (removes nothing)", + flush=True, + ) + return [] + stack = _stack_name(domain) + removed = [] + for s in stack_service_names(domain): + if s not in defined: + subprocess.run( + ["docker", "service", "rm", f"{stack}_{s}"], capture_output=True, text=True + ) + removed.append(s) + print( + f" prune-orphans: removed '{s}' — dropped by the head compose (was an orphan left " + "by `docker stack deploy` no-prune)", + flush=True, + ) + return removed + + def services_converged(domain: str) -> bool: """True when every service in the stack reports replicas N/N (N>0) AND no service is mid-rolling-update (swarm UpdateStatus settled).""" diff --git a/tests/unit/test_upgrade_base.py b/tests/unit/test_upgrade_base.py index 43dcb1f..501ed10 100644 --- a/tests/unit/test_upgrade_base.py +++ b/tests/unit/test_upgrade_base.py @@ -103,3 +103,13 @@ def test_no_canonical_no_main_skip(monkeypatch): _no_main(monkeypatch) plan = run_recipe_ci.resolve_upgrade_base(ALL, _meta(), "brandnew", head_ref=HEAD) assert plan.kind == "skip" and "no predecessor" in plan.reason + + +def test_expected_na_other_rung_does_not_suppress_upgrade(monkeypatch): + # an EXPECTED_NA for a DIFFERENT rung (backup_restore) must NOT short-circuit the upgrade base — + # resolution proceeds to last-green/main-tip (custom-html-tiny shape). + _no_canonical(monkeypatch) + monkeypatch.setattr(lifecycle, "recipe_branch_commit", lambda r, b="main": MAIN) + meta = _meta(expected_na={"backup_restore": "stateless"}) + plan = run_recipe_ci.resolve_upgrade_base(ALL, meta, "custom-html-tiny", head_ref=HEAD) + assert plan.kind == "ref" and plan.ref == MAIN