diff --git a/runner/harness/warmsnap.py b/runner/harness/warmsnap.py index 42c8772..36170c6 100644 --- a/runner/harness/warmsnap.py +++ b/runner/harness/warmsnap.py @@ -12,10 +12,13 @@ Used by: Warm snapshots are **cache, excluded from the D8 reproducibility closure** (WC8) — re-seeded by cold runs, not restored on a VM rebuild. -Layout (atomic dir swap on update; one last-good per app): +Layout (atomic dir swap of the `snapshot/` subdir; one last-good per app). Sibling per-app state +(e.g. the reconciler's `last_good`) lives in `<recipe>/` and is NOT clobbered by the swap: $CCCI_WARM_ROOT/<recipe>/ - meta.json # {recipe, domain, commit, version, ts, volumes:[...]} - volumes/<volume>.tar # raw tar of the volume root, one per stack volume + last_good # (reconciler) the version known healthy — survives snapshot swaps + snapshot/ + meta.json # {recipe, domain, commit, version, ts, volumes:[...]} + volumes/<volume>.tar # raw tar of the volume root, one per stack volume Implementation note: volumes are tarred from their host mountpoint (`docker volume inspect -f '{{.Mountpoint}}'`), so no sidecar image pull is needed. The caller runs @@ -47,12 +50,18 @@ def app_dir(recipe: str) -> str: return os.path.join(warm_root(), recipe) +def snap_dir(recipe: str) -> str: + """The snapshot subdir — atomically swapped on update. 
Kept SEPARATE from app_dir so sibling + per-app state (the reconciler's last_good) survives a snapshot swap.""" + return os.path.join(app_dir(recipe), "snapshot") + + def meta_path(recipe: str) -> str: - return os.path.join(app_dir(recipe), "meta.json") + return os.path.join(snap_dir(recipe), "meta.json") def volumes_dir(recipe: str) -> str: - return os.path.join(app_dir(recipe), "volumes") + return os.path.join(snap_dir(recipe), "volumes") def has_snapshot(recipe: str) -> bool: @@ -112,9 +121,8 @@ def snapshot(recipe: str, domain: str, commit: str | None = None, version: str | if not volumes: raise SnapshotError(f"no volumes found for {domain} — nothing to snapshot") - root = warm_root() - os.makedirs(root, exist_ok=True) - staging = os.path.join(root, f".{recipe}.staging") + os.makedirs(app_dir(recipe), exist_ok=True) + staging = os.path.join(app_dir(recipe), ".snapshot.staging") shutil.rmtree(staging, ignore_errors=True) os.makedirs(os.path.join(staging, "volumes"), exist_ok=True) @@ -138,9 +146,9 @@ def snapshot(recipe: str, domain: str, commit: str | None = None, version: str | with open(os.path.join(staging, "meta.json"), "w") as f: json.dump(meta, f) - # Atomic-ish swap: move current aside, move staging in, drop the old. One last-good retained. - target = app_dir(recipe) - old = os.path.join(root, f".{recipe}.old") + # Atomic-ish swap of the snapshot subdir only (sibling state like last_good is untouched). 
+ target = snap_dir(recipe) + old = os.path.join(app_dir(recipe), ".snapshot.old") shutil.rmtree(old, ignore_errors=True) if os.path.exists(target): os.rename(target, old) diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py index d51a6d6..838c3d8 100644 --- a/runner/warm_reconcile.py +++ b/runner/warm_reconcile.py @@ -207,11 +207,26 @@ def release_notes(recipe: str, version: str) -> str: def deploy_version(recipe: str, domain: str, version: str, timeout: int) -> None: """Deploy a specific published version: checkout the tag (so the on-disk tree matches) then a pinned non-chaos redeploy with the version positional (so abra records TYPE=<recipe>:<version>). - `-f` makes it idempotent against an already-deployed app.""" + `-f` makes it idempotent against an already-deployed app. abra writes FATA to stdout, so include + both streams in the error.""" abra.recipe_checkout(recipe, version) r = _run(["abra", "app", "deploy", domain, version, "-o", "-n", "-f"], timeout=timeout) if r.returncode != 0: - raise RuntimeError(f"deploy {domain} {version} failed: {r.stderr.strip()[:300]}") + msg = (r.stderr.strip() + " " + r.stdout.strip()).strip()[:400] + raise RuntimeError(f"deploy {domain} {version} failed: {msg}") + + +def wait_undeployed(domain: str, timeout: int = 120) -> None: + """Block until the app's swarm stack is fully removed after an undeploy. 
abra's undeploy may + return before swarm finishes tearing down tasks; snapshot/restore (which require undeployed) and + an immediate redeploy of the same stack name otherwise race a half-removed stack.""" + stack = lifecycle._stack_name(domain) # noqa: SLF001 + deadline = time.time() + timeout + while time.time() < deadline: + if not lifecycle._docker_names("service", stack): # noqa: SLF001 + return + time.sleep(2) + raise RuntimeError(f"{domain} stack not fully undeployed after {timeout}s") # --------------------------------------------------------------------------- last-good + alerts @@ -332,6 +347,7 @@ def reconcile(app: str) -> str: print(f"[{app}] auto-upgrade {last_good} → {latest} (health-gated)", flush=True) if stateful: abra.undeploy(domain) + wait_undeployed(domain) warmsnap.snapshot(recipe, domain, version=last_good) # snapshot requires undeployed; now bring up latest. # A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a @@ -353,6 +369,7 @@ def reconcile(app: str) -> str: print(f"[{app}] latest {latest} UNHEALTHY → rolling back to {last_good}", flush=True) if stateful: abra.undeploy(domain) + wait_undeployed(domain) warmsnap.restore(recipe, domain) deploy_version(recipe, domain, last_good, dt) recovered = wait_healthy(spec) diff --git a/tests/unit/test_warmsnap.py b/tests/unit/test_warmsnap.py index 63e99d9..6e3788a 100644 --- a/tests/unit/test_warmsnap.py +++ b/tests/unit/test_warmsnap.py @@ -18,8 +18,10 @@ def test_warm_root_env_override(monkeypatch): monkeypatch.setenv("CCCI_WARM_ROOT", "/tmp/ci-warm-test") assert warmsnap.warm_root() == "/tmp/ci-warm-test" assert warmsnap.app_dir("keycloak") == "/tmp/ci-warm-test/keycloak" - assert warmsnap.meta_path("keycloak") == "/tmp/ci-warm-test/keycloak/meta.json" - assert warmsnap.volumes_dir("keycloak") == "/tmp/ci-warm-test/keycloak/volumes" + # snapshot lives in a subdir so sibling state (last_good) survives the atomic swap. 
+ assert warmsnap.snap_dir("keycloak") == "/tmp/ci-warm-test/keycloak/snapshot" + assert warmsnap.meta_path("keycloak") == "/tmp/ci-warm-test/keycloak/snapshot/meta.json" + assert warmsnap.volumes_dir("keycloak") == "/tmp/ci-warm-test/keycloak/snapshot/volumes" def test_warm_root_default(monkeypatch): @@ -34,11 +36,11 @@ def test_read_meta_absent(monkeypatch, tmp_path): def _write_snapshot(tmp_path, recipe, volumes): - appdir = tmp_path / recipe - (appdir / "volumes").mkdir(parents=True) - (appdir / "meta.json").write_text(json.dumps({"recipe": recipe, "volumes": volumes})) + snapdir = tmp_path / recipe / "snapshot" + (snapdir / "volumes").mkdir(parents=True) + (snapdir / "meta.json").write_text(json.dumps({"recipe": recipe, "volumes": volumes})) for v in volumes: - (appdir / "volumes" / f"{v}.tar").write_bytes(b"fake") + (snapdir / "volumes" / f"{v}.tar").write_bytes(b"fake") def test_has_snapshot_complete(monkeypatch, tmp_path): @@ -49,11 +51,20 @@ def test_has_snapshot_complete(monkeypatch, tmp_path): assert meta["volumes"] == ["a_mariadb", "a_providers"] +def test_last_good_survives_sibling_of_snapshot(monkeypatch, tmp_path): + # The reconciler's last_good lives in <recipe>/ (sibling of snapshot/) — a test that the layout + # keeps them separate so a snapshot swap can't clobber last_good. + monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path)) + appdir = tmp_path / "keycloak" + assert warmsnap.snap_dir("keycloak").startswith(str(appdir)) + assert os.path.dirname(warmsnap.snap_dir("keycloak")) == str(appdir) + + def test_has_snapshot_incomplete_missing_tar(monkeypatch, tmp_path): monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path)) # meta lists two volumes but only one tar present -> incomplete -> not usable. 
- appdir = tmp_path / "keycloak" - (appdir / "volumes").mkdir(parents=True) - (appdir / "meta.json").write_text(json.dumps({"recipe": "keycloak", "volumes": ["a", "b"]})) - (appdir / "volumes" / "a.tar").write_bytes(b"fake") + snapdir = tmp_path / "keycloak" / "snapshot" + (snapdir / "volumes").mkdir(parents=True) + (snapdir / "meta.json").write_text(json.dumps({"recipe": "keycloak", "volumes": ["a", "b"]})) + (snapdir / "volumes" / "a.tar").write_bytes(b"fake") assert warmsnap.has_snapshot("keycloak") is False