"""Known-good snapshot/restore of an app's data volumes (Phase 2w / WC3). A snapshot is a **raw copy of every docker volume belonging to an app's stack, taken while the app is UNDEPLOYED** (nothing is writing → consistent). Stored under `/var/lib/ci-warm//` as one last-known-good per app, replaced atomically. Restore clears each volume and untars it back. Used by: - WC1.1 — snapshot keycloak's data volume BEFORE an auto-upgrade; restore on health-gate rollback (a forward DB migration makes a version-only rollback unsafe). - WC5 — promote-on-green-cold re-snapshots a canonical at teardown. Warm snapshots are **cache, excluded from the D8 reproducibility closure** (WC8) — re-seeded by cold runs, not restored on a VM rebuild. Layout (atomic dir swap of the `snapshot/` subdir; one last-good per app). Sibling per-app state (e.g. the reconciler's `last_good`) lives in `/` and is NOT clobbered by the swap: $CCCI_WARM_ROOT// last_good # (reconciler) the version known healthy — survives snapshot swaps snapshot/ meta.json # {recipe, domain, commit, version, ts, volumes:[...]} volumes/.tar # raw tar of the volume root, one per stack volume Implementation note: volumes are tarred from their host mountpoint (`docker volume inspect -f '{{.Mountpoint}}'`), so no sidecar image pull is needed. The caller runs as root (the reconciler / runner on cc-ci) — direct mountpoint access is available there. """ from __future__ import annotations import json import os import shutil import subprocess from . import lifecycle DEFAULT_WARM_ROOT = "/var/lib/ci-warm" class SnapshotError(RuntimeError): pass def warm_root() -> str: """Root for warm snapshots; overridable via $CCCI_WARM_ROOT (tests).""" return os.environ.get("CCCI_WARM_ROOT", DEFAULT_WARM_ROOT) def app_dir(recipe: str) -> str: return os.path.join(warm_root(), recipe) def snap_dir(recipe: str) -> str: """The snapshot subdir — atomically swapped on update. Kept SEPARATE from app_dir so sibling per-app state (the reconciler's last_good) survives a snapshot swap.""" return os.path.join(app_dir(recipe), "snapshot") def meta_path(recipe: str) -> str: return os.path.join(snap_dir(recipe), "meta.json") def volumes_dir(recipe: str) -> str: return os.path.join(snap_dir(recipe), "volumes") def has_snapshot(recipe: str) -> bool: """True iff a complete last-good snapshot (meta + at least its declared volume tars) exists.""" meta = read_meta(recipe) if not meta: return False for v in meta.get("volumes", []): if not os.path.isfile(os.path.join(volumes_dir(recipe), f"{v}.tar")): return False return True def read_meta(recipe: str) -> dict | None: try: with open(meta_path(recipe)) as f: return json.load(f) except (OSError, ValueError): return None def _run(cmd: list[str], timeout: int = 600) -> subprocess.CompletedProcess: return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) def _volume_mountpoint(volume: str) -> str: r = _run(["docker", "volume", "inspect", "-f", "{{.Mountpoint}}", volume], timeout=30) mp = r.stdout.strip() if r.returncode != 0 or not mp: raise SnapshotError(f"cannot inspect volume {volume}: {r.stderr.strip()}") return mp def stack_volumes(domain: str) -> list[str]: """Names of the docker volumes belonging to the app's stack (reuses lifecycle's stack scan).""" stack = lifecycle._stack_name(domain) # noqa: SLF001 — shared internal in our package return sorted(lifecycle._docker_names("volume", stack)) # noqa: SLF001 def _assert_undeployed(domain: str) -> None: """Snapshots/restores must happen while the app is UNDEPLOYED (consistency + safe volume writes). Raise if any service of the stack is still running.""" stack = lifecycle._stack_name(domain) # noqa: SLF001 svcs = lifecycle._docker_names("service", stack) # noqa: SLF001 if svcs: raise SnapshotError( f"refusing to snapshot/restore {domain} while deployed (services: {svcs}); " "undeploy first (WC3: snapshot while undeployed)" ) def snapshot(recipe: str, domain: str, commit: str | None = None, version: str | None = None) -> dict: """Take a last-known-good snapshot of every data volume of 's stack. The app MUST be undeployed. Atomically replaces the prior last-good. Returns the written meta dict.""" _assert_undeployed(domain) volumes = stack_volumes(domain) if not volumes: raise SnapshotError(f"no volumes found for {domain} — nothing to snapshot") os.makedirs(app_dir(recipe), exist_ok=True) staging = os.path.join(app_dir(recipe), ".snapshot.staging") shutil.rmtree(staging, ignore_errors=True) os.makedirs(os.path.join(staging, "volumes"), exist_ok=True) for vol in volumes: mp = _volume_mountpoint(vol) tar_path = os.path.join(staging, "volumes", f"{vol}.tar") # Tar the volume contents (relative to the mountpoint) so restore can untar back in place. r = _run(["tar", "-C", mp, "-cf", tar_path, "."]) if r.returncode != 0: shutil.rmtree(staging, ignore_errors=True) raise SnapshotError(f"tar of volume {vol} failed: {r.stderr.strip()}") meta = { "recipe": recipe, "domain": domain, "commit": commit, "version": version, "volumes": volumes, "ts": _now(), } with open(os.path.join(staging, "meta.json"), "w") as f: json.dump(meta, f) # Atomic-ish swap of the snapshot subdir only (sibling state like last_good is untouched). target = snap_dir(recipe) old = os.path.join(app_dir(recipe), ".snapshot.old") shutil.rmtree(old, ignore_errors=True) if os.path.exists(target): os.rename(target, old) os.rename(staging, target) shutil.rmtree(old, ignore_errors=True) return meta def restore(recipe: str, domain: str) -> dict: """Restore the last-known-good snapshot into 's stack volumes. The app MUST be undeployed. Clears each volume then untars the snapshot back. Returns the snapshot meta. Raises if no snapshot exists or a snapshot volume is missing from the current stack.""" _assert_undeployed(domain) meta = read_meta(recipe) if not meta or not has_snapshot(recipe): raise SnapshotError(f"no complete snapshot for {recipe} to restore") current = set(stack_volumes(domain)) for vol in meta.get("volumes", []): tar_path = os.path.join(volumes_dir(recipe), f"{vol}.tar") if vol not in current: raise SnapshotError(f"snapshot volume {vol} absent from current stack {sorted(current)}") mp = _volume_mountpoint(vol) # Clear the volume contents (incl. dotfiles) without removing the mountpoint itself. r = _run(["sh", "-c", f'rm -rf -- "{mp}"/* "{mp}"/.[!.]* "{mp}"/..?* 2>/dev/null; true']) r = _run(["tar", "-C", mp, "-xf", tar_path]) if r.returncode != 0: raise SnapshotError(f"untar of volume {vol} failed: {r.stderr.strip()}") return meta def _now() -> str: # Import here (not module top) — keeps the pure path/meta helpers importable in restricted envs. import datetime return datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z"