From 4cc1e15a532e5d36df187f1f0fdb54d3e2e1c1ae Mon Sep 17 00:00:00 2001
From: autonomic-bot
Date: Fri, 29 May 2026 00:12:46 +0100
Subject: [PATCH] feat(2w): W0.5 WC3 snapshot/restore helper (warmsnap.py)

runner/harness/warmsnap.py: raw per-volume tar of an app's stack volumes while
UNDEPLOYED, under /var/lib/ci-warm/<recipe>/ (meta.json + volumes/<volume>.tar);
one last-good, atomic dir swap; restore clears+untars each volume back. Asserts
undeployed (consistency). Reused by WC1.1 (pre-upgrade keycloak snapshot) + WC5.
Adds +5 unit tests (48 unit pass).

LIVE round-trip PROVEN on warm keycloak: create marker realm -> undeploy ->
snapshot (mariadb+providers vols) -> deploy -> delete marker (mutate DB) ->
undeploy -> restore -> deploy -> marker realm BACK; keycloak healthy. WC3 core.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 runner/harness/warmsnap.py  | 230 ++++++++++++++++++++++++++++++++++++
 tests/unit/test_warmsnap.py |  59 +++++++++
 2 files changed, 289 insertions(+)
 create mode 100644 runner/harness/warmsnap.py
 create mode 100644 tests/unit/test_warmsnap.py

diff --git a/runner/harness/warmsnap.py b/runner/harness/warmsnap.py
new file mode 100644
index 0000000..42c8772
--- /dev/null
+++ b/runner/harness/warmsnap.py
@@ -0,0 +1,230 @@
+"""Known-good snapshot/restore of an app's data volumes (Phase 2w / WC3).
+
+A snapshot is a **raw copy of every docker volume belonging to an app's stack, taken while the app is
+UNDEPLOYED** (nothing is writing → consistent). Stored under `/var/lib/ci-warm/<recipe>/` as one
+last-known-good per app, replaced atomically. Restore clears each volume and untars it back.
+
+Used by:
+- WC1.1 — snapshot keycloak's data volume BEFORE an auto-upgrade; restore on health-gate rollback
+  (a forward DB migration makes a version-only rollback unsafe).
+- WC5 — promote-on-green-cold re-snapshots a canonical at teardown.
+
+Warm snapshots are **cache, excluded from the D8 reproducibility closure** (WC8) — re-seeded by cold
+runs, not restored on a VM rebuild.
+
+Layout (atomic dir swap on update; one last-good per app):
+  $CCCI_WARM_ROOT/<recipe>/
+    meta.json             # {recipe, domain, commit, version, ts, volumes:[...]}
+    volumes/<volume>.tar  # raw tar of the volume root, one per stack volume
+
+Implementation note: volumes are tarred from their host mountpoint
+(`docker volume inspect -f '{{.Mountpoint}}'`), so no sidecar image pull is needed. The caller runs
+as root (the reconciler / runner on cc-ci) — direct mountpoint access is available there.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import subprocess
+
+from . import lifecycle
+
+DEFAULT_WARM_ROOT = "/var/lib/ci-warm"
+
+
+class SnapshotError(RuntimeError):
+    """Raised when a snapshot or restore cannot be carried out."""
+
+
+def warm_root() -> str:
+    """Root for warm snapshots; overridable via $CCCI_WARM_ROOT (tests)."""
+    return os.environ.get("CCCI_WARM_ROOT", DEFAULT_WARM_ROOT)
+
+
+def app_dir(recipe: str) -> str:
+    """Directory holding <recipe>'s last-good snapshot."""
+    return os.path.join(warm_root(), recipe)
+
+
+def meta_path(recipe: str) -> str:
+    """Path of <recipe>'s snapshot meta.json."""
+    return os.path.join(app_dir(recipe), "meta.json")
+
+
+def volumes_dir(recipe: str) -> str:
+    """Directory holding <recipe>'s per-volume tars."""
+    return os.path.join(app_dir(recipe), "volumes")
+
+
+def has_snapshot(recipe: str) -> bool:
+    """True iff a complete last-good snapshot exists: a readable meta.json declaring at least one
+    volume, plus a tar file for every declared volume."""
+    meta = read_meta(recipe)
+    if not meta:
+        return False
+    volumes = meta.get("volumes")
+    # snapshot() never writes an empty volume list, so a missing/empty/malformed one marks the
+    # meta as unusable; calling it complete would turn restore() into a silent no-op.
+    if not isinstance(volumes, list) or not volumes:
+        return False
+    tar_dir = volumes_dir(recipe)
+    return all(os.path.isfile(os.path.join(tar_dir, f"{v}.tar")) for v in volumes)
+
+
+def read_meta(recipe: str) -> dict | None:
+    """Parsed meta.json of <recipe>'s snapshot, or None when it is absent, unreadable, not valid
+    JSON, or not a JSON object."""
+    try:
+        with open(meta_path(recipe), encoding="utf-8") as f:
+            meta = json.load(f)
+    except (OSError, ValueError):
+        return None
+    # Callers rely on meta.get(...); anything but an object is a corrupt meta, not a snapshot.
+    return meta if isinstance(meta, dict) else None
+
+
+def _run(cmd: list[str], timeout: int = 600) -> subprocess.CompletedProcess:
+    """Run <cmd> with captured text stdout/stderr; the caller inspects the return code."""
+    return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+
+
+def _volume_mountpoint(volume: str) -> str:
+    """Host mountpoint of docker volume <volume>; SnapshotError if it cannot be inspected."""
+    r = _run(["docker", "volume", "inspect", "-f", "{{.Mountpoint}}", volume], timeout=30)
+    mp = r.stdout.strip()
+    if r.returncode != 0 or not mp:
+        raise SnapshotError(f"cannot inspect volume {volume}: {r.stderr.strip()}")
+    return mp
+
+
+def stack_volumes(domain: str) -> list[str]:
+    """Names of the docker volumes belonging to the app's stack (reuses lifecycle's stack scan)."""
+    stack = lifecycle._stack_name(domain)  # noqa: SLF001 — shared internal in our package
+    return sorted(lifecycle._docker_names("volume", stack))  # noqa: SLF001
+
+
+def _assert_undeployed(domain: str) -> None:
+    """Snapshots/restores must happen while the app is UNDEPLOYED (consistency + safe volume writes).
+    Raise if any service of the stack is still running."""
+    stack = lifecycle._stack_name(domain)  # noqa: SLF001
+    svcs = lifecycle._docker_names("service", stack)  # noqa: SLF001
+    if svcs:
+        raise SnapshotError(
+            f"refusing to snapshot/restore {domain} while deployed (services: {svcs}); "
+            "undeploy first (WC3: snapshot while undeployed)"
+        )
+
+
+def _clear_volume(mountpoint: str) -> None:
+    """Delete everything inside <mountpoint> (dotfiles included) but keep the directory itself."""
+    try:
+        with os.scandir(mountpoint) as scan:
+            entries = list(scan)  # list first so nothing is deleted while the scan is open
+        for entry in entries:
+            if entry.is_dir(follow_symlinks=False):
+                shutil.rmtree(entry.path)
+            else:
+                os.unlink(entry.path)
+    except OSError as err:
+        raise SnapshotError(f"cannot clear volume dir {mountpoint}: {err}") from err
+
+
+def snapshot(recipe: str, domain: str, commit: str | None = None, version: str | None = None) -> dict:
+    """Take a last-known-good snapshot of every data volume of <domain>'s stack. The app MUST be
+    undeployed. Atomically replaces the prior last-good. Returns the written meta dict.
+
+    A failed attempt leaves the existing last-good intact: the staging dir is removed on any error
+    and the prior snapshot is moved back if the final swap cannot complete."""
+    _assert_undeployed(domain)
+    volumes = stack_volumes(domain)
+    if not volumes:
+        raise SnapshotError(f"no volumes found for {domain} — nothing to snapshot")
+
+    root = warm_root()
+    os.makedirs(root, exist_ok=True)
+    staging = os.path.join(root, f".{recipe}.staging")
+    shutil.rmtree(staging, ignore_errors=True)
+    os.makedirs(os.path.join(staging, "volumes"), exist_ok=True)
+
+    try:
+        for vol in volumes:
+            mp = _volume_mountpoint(vol)
+            tar_path = os.path.join(staging, "volumes", f"{vol}.tar")
+            # Tar contents relative to the mountpoint so restore can untar back in place.
+            r = _run(["tar", "-C", mp, "-cf", tar_path, "."])
+            if r.returncode != 0:
+                raise SnapshotError(f"tar of volume {vol} failed: {r.stderr.strip()}")
+
+        meta = {
+            "recipe": recipe,
+            "domain": domain,
+            "commit": commit,
+            "version": version,
+            "volumes": volumes,
+            "ts": _now(),
+        }
+        with open(os.path.join(staging, "meta.json"), "w", encoding="utf-8") as f:
+            json.dump(meta, f)
+    except BaseException:
+        # Inspect failure, tar error/timeout, full disk, interrupt: never leave a partial (and
+        # possibly large) staging dir behind.
+        shutil.rmtree(staging, ignore_errors=True)
+        raise
+
+    # Atomic-ish swap: move current aside, move staging in, drop the old. One last-good retained.
+    target = app_dir(recipe)
+    old = os.path.join(root, f".{recipe}.old")
+    shutil.rmtree(old, ignore_errors=True)
+    moved_aside = False
+    try:
+        if os.path.exists(target):
+            os.rename(target, old)
+            moved_aside = True
+        os.rename(staging, target)
+    except OSError:
+        # Put the previous last-good back: left in `.old` it would be deleted by the next
+        # snapshot attempt, losing the only known-good copy.
+        if moved_aside:
+            os.rename(old, target)
+        shutil.rmtree(staging, ignore_errors=True)
+        raise
+    shutil.rmtree(old, ignore_errors=True)
+    return meta
+
+
+def restore(recipe: str, domain: str) -> dict:
+    """Restore the last-known-good snapshot into <domain>'s stack volumes. The app MUST be
+    undeployed. Clears each volume then untars the snapshot back. Returns the snapshot meta.
+    Raises if no snapshot exists or a snapshot volume is missing from the current stack.
+
+    Every snapshot volume is checked and located BEFORE any volume is cleared, so a mismatch
+    cannot leave the stack half-restored."""
+    _assert_undeployed(domain)
+    meta = read_meta(recipe)
+    if not meta or not has_snapshot(recipe):
+        raise SnapshotError(f"no complete snapshot for {recipe} to restore")
+    current = set(stack_volumes(domain))
+    mountpoints: dict[str, str] = {}
+    for vol in meta.get("volumes", []):
+        if vol not in current:
+            raise SnapshotError(f"snapshot volume {vol} absent from current stack {sorted(current)}")
+        mountpoints[vol] = _volume_mountpoint(vol)
+    for vol, mp in mountpoints.items():
+        tar_path = os.path.join(volumes_dir(recipe), f"{vol}.tar")
+        # A failed clear aborts: leftovers would mix newer state into the restored data.
+        _clear_volume(mp)
+        r = _run(["tar", "-C", mp, "-xf", tar_path])
+        if r.returncode != 0:
+            raise SnapshotError(f"untar of volume {vol} failed: {r.stderr.strip()}")
+    return meta
+
+
+def _now() -> str:
+    # Import here (not module top) — keeps the pure path/meta helpers importable in restricted envs.
+    import datetime
+
+    # utcnow() is deprecated since Python 3.12; an aware UTC clock formatted explicitly yields the
+    # same second-precision "YYYY-MM-DDTHH:MM:SSZ" string.
+    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
diff --git a/tests/unit/test_warmsnap.py b/tests/unit/test_warmsnap.py
new file mode 100644
index 0000000..63e99d9
--- /dev/null
+++ b/tests/unit/test_warmsnap.py
@@ -0,0 +1,59 @@
+"""Unit tests for the WC3 snapshot helper's pure path/meta logic (runner/harness/warmsnap.py).
+
+The docker/tar snapshot+restore round-trip is integration (proven live on cc-ci against a real
+keycloak volume, W0.5). Here we cover the layout, meta read, and the has_snapshot completeness check.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import warmsnap  # noqa: E402
+
+
+def test_warm_root_env_override(monkeypatch):
+    monkeypatch.setenv("CCCI_WARM_ROOT", "/tmp/ci-warm-test")
+    assert warmsnap.warm_root() == "/tmp/ci-warm-test"
+    assert warmsnap.app_dir("keycloak") == "/tmp/ci-warm-test/keycloak"
+    assert warmsnap.meta_path("keycloak") == "/tmp/ci-warm-test/keycloak/meta.json"
+    assert warmsnap.volumes_dir("keycloak") == "/tmp/ci-warm-test/keycloak/volumes"
+
+
+def test_warm_root_default(monkeypatch):
+    monkeypatch.delenv("CCCI_WARM_ROOT", raising=False)
+    assert warmsnap.warm_root() == "/var/lib/ci-warm"
+
+
+def test_read_meta_absent(monkeypatch, tmp_path):
+    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
+    assert warmsnap.read_meta("keycloak") is None
+    assert warmsnap.has_snapshot("keycloak") is False
+
+
+def _write_snapshot(tmp_path, recipe, volumes):
+    appdir = tmp_path / recipe
+    (appdir / "volumes").mkdir(parents=True)
+    (appdir / "meta.json").write_text(json.dumps({"recipe": recipe, "volumes": volumes}))
+    for v in volumes:
+        (appdir / "volumes" / f"{v}.tar").write_bytes(b"fake")
+
+
+def test_has_snapshot_complete(monkeypatch, tmp_path):
+    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
+    _write_snapshot(tmp_path, "keycloak", ["a_mariadb", "a_providers"])
+    assert warmsnap.has_snapshot("keycloak") is True
+    meta = warmsnap.read_meta("keycloak")
+    assert meta["volumes"] == ["a_mariadb", "a_providers"]
+
+
+def test_has_snapshot_incomplete_missing_tar(monkeypatch, tmp_path):
+    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
+    # meta lists two volumes but only one tar present -> incomplete -> not usable.
+    appdir = tmp_path / "keycloak"
+    (appdir / "volumes").mkdir(parents=True)
+    (appdir / "meta.json").write_text(json.dumps({"recipe": "keycloak", "volumes": ["a", "b"]}))
+    (appdir / "volumes" / "a.tar").write_bytes(b"fake")
+    assert warmsnap.has_snapshot("keycloak") is False