feat(2w): W0.5 WC3 snapshot/restore helper (warmsnap.py)
runner/harness/warmsnap.py: raw per-volume tar of an app's stack volumes while UNDEPLOYED, under /var/lib/ci-warm/<recipe>/ (meta.json + volumes/<vol>.tar); one last-good, atomic dir swap; restore clears+untars each volume back. Asserts undeployed (consistency). Reused by WC1.1 (pre-upgrade keycloak snapshot) + WC5. +5 unit tests (48 unit pass). LIVE round-trip PROVEN on warm keycloak: create marker realm -> undeploy -> snapshot (mariadb+providers vols) -> deploy -> delete marker (mutate DB) -> undeploy -> restore -> deploy -> marker realm BACK; keycloak healthy. WC3 core. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
178
runner/harness/warmsnap.py
Normal file
178
runner/harness/warmsnap.py
Normal file
@ -0,0 +1,178 @@
|
||||
"""Known-good snapshot/restore of an app's data volumes (Phase 2w / WC3).
|
||||
|
||||
A snapshot is a **raw copy of every docker volume belonging to an app's stack, taken while the app is
|
||||
UNDEPLOYED** (nothing is writing → consistent). Stored under `/var/lib/ci-warm/<recipe>/` as one
|
||||
last-known-good per app, replaced atomically. Restore clears each volume and untars it back.
|
||||
|
||||
Used by:
|
||||
- WC1.1 — snapshot keycloak's data volume BEFORE an auto-upgrade; restore on health-gate rollback
|
||||
(a forward DB migration makes a version-only rollback unsafe).
|
||||
- WC5 — promote-on-green-cold re-snapshots a canonical at teardown.
|
||||
|
||||
Warm snapshots are **cache, excluded from the D8 reproducibility closure** (WC8) — re-seeded by cold
|
||||
runs, not restored on a VM rebuild.
|
||||
|
||||
Layout (atomic dir swap on update; one last-good per app):
|
||||
$CCCI_WARM_ROOT/<recipe>/
|
||||
meta.json # {recipe, domain, commit, version, ts, volumes:[...]}
|
||||
volumes/<volname>.tar # raw tar of the volume root, one per stack volume
|
||||
|
||||
Implementation note: volumes are tarred from their host mountpoint
|
||||
(`docker volume inspect -f '{{.Mountpoint}}'`), so no sidecar image pull is needed. The caller runs
|
||||
as root (the reconciler / runner on cc-ci) — direct mountpoint access is available there.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
from . import lifecycle
|
||||
|
||||
DEFAULT_WARM_ROOT = "/var/lib/ci-warm"
|
||||
|
||||
|
||||
class SnapshotError(RuntimeError):
    """Raised when a warm snapshot or restore cannot be carried out."""
|
||||
|
||||
|
||||
def warm_root() -> str:
    """Return the directory under which all warm snapshots live.

    ``$CCCI_WARM_ROOT`` takes precedence whenever it is present in the environment (the unit
    tests use it to redirect to a temp dir); otherwise ``DEFAULT_WARM_ROOT`` is returned.
    """
    override = os.environ.get("CCCI_WARM_ROOT")
    return DEFAULT_WARM_ROOT if override is None else override
|
||||
|
||||
|
||||
def app_dir(recipe: str) -> str:
    """Return the snapshot directory for *recipe*: ``<warm_root>/<recipe>``."""
    root = warm_root()
    return os.path.join(root, recipe)
|
||||
|
||||
|
||||
def meta_path(recipe: str) -> str:
    """Return the path of *recipe*'s ``meta.json`` inside its snapshot directory."""
    snapshot_dir = app_dir(recipe)
    return os.path.join(snapshot_dir, "meta.json")
|
||||
|
||||
|
||||
def volumes_dir(recipe: str) -> str:
    """Return the directory holding *recipe*'s per-volume tar files."""
    snapshot_dir = app_dir(recipe)
    return os.path.join(snapshot_dir, "volumes")
|
||||
|
||||
|
||||
def has_snapshot(recipe: str) -> bool:
    """Report whether a complete last-good snapshot exists for *recipe*.

    Complete means: ``meta.json`` is readable and non-empty, and every volume name listed under
    its ``"volumes"`` key has a matching ``<name>.tar`` file in the volumes directory. A meta
    that lists no volumes therefore counts as complete.
    """
    meta = read_meta(recipe)
    if not meta:
        return False
    tar_dir = volumes_dir(recipe)
    return all(
        os.path.isfile(os.path.join(tar_dir, f"{name}.tar"))
        for name in meta.get("volumes", [])
    )
|
||||
|
||||
|
||||
def read_meta(recipe: str) -> dict | None:
    """Load *recipe*'s ``meta.json``.

    Returns:
        The parsed JSON object, or ``None`` when the file is missing, unreadable, not valid
        JSON, or valid JSON whose top level is not an object.
    """
    try:
        with open(meta_path(recipe), encoding="utf-8") as f:
            meta = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None
    # Callers use meta.get(...); a list/str/number at top level is a corrupt snapshot, not a
    # usable meta, so treat it the same as an unreadable file.
    return meta if isinstance(meta, dict) else None
|
||||
|
||||
|
||||
def _run(cmd: list[str], timeout: int = 600) -> subprocess.CompletedProcess:
|
||||
return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
|
||||
|
||||
|
||||
def _volume_mountpoint(volume: str) -> str:
    """Return the mountpoint that ``docker volume inspect`` reports for *volume*.

    Raises:
        SnapshotError: if the inspect command exits non-zero or prints an empty mountpoint.
    """
    result = _run(["docker", "volume", "inspect", "-f", "{{.Mountpoint}}", volume], timeout=30)
    mountpoint = result.stdout.strip()
    if result.returncode == 0 and mountpoint:
        return mountpoint
    raise SnapshotError(f"cannot inspect volume {volume}: {result.stderr.strip()}")
|
||||
|
||||
|
||||
def stack_volumes(domain: str) -> list[str]:
    """Return the sorted docker volume names of *domain*'s stack.

    Delegates to lifecycle's private stack-name and docker-name helpers rather than
    re-implementing the scan here.
    """
    stack_name = lifecycle._stack_name(domain)  # noqa: SLF001 — shared internal in our package
    names = lifecycle._docker_names("volume", stack_name)  # noqa: SLF001
    return sorted(names)
|
||||
|
||||
|
||||
def _assert_undeployed(domain: str) -> None:
    """Guard for snapshot/restore: the app must be UNDEPLOYED before its volumes are touched.

    Raises:
        SnapshotError: if lifecycle reports any service for *domain*'s stack.
    """
    running = lifecycle._docker_names(  # noqa: SLF001
        "service", lifecycle._stack_name(domain)  # noqa: SLF001
    )
    if not running:
        return
    raise SnapshotError(
        f"refusing to snapshot/restore {domain} while deployed (services: {running}); "
        "undeploy first (WC3: snapshot while undeployed)"
    )
|
||||
|
||||
|
||||
def snapshot(recipe: str, domain: str, commit: str | None = None, version: str | None = None) -> dict:
    """Take a last-known-good snapshot of every volume of <domain>'s stack.

    The app MUST be undeployed. Each volume is tarred from its mountpoint into a staging
    directory next to the final location; once every tar and ``meta.json`` are written, the
    staging directory is swapped in for the previous snapshot.

    Args:
        recipe: Snapshot key; the snapshot lands in ``<warm_root>/<recipe>/``.
        domain: App domain whose stack volumes are captured.
        commit: Optional value recorded verbatim in the meta.
        version: Optional value recorded verbatim in the meta.

    Returns:
        The meta dict that was written to ``meta.json``.

    Raises:
        SnapshotError: if the app is still deployed, the stack has no volumes, a volume cannot
            be inspected, or ``tar`` fails.
        OSError: if the staging directory cannot be created or the directory swap fails.
    """
    _assert_undeployed(domain)
    volumes = stack_volumes(domain)
    if not volumes:
        raise SnapshotError(f"no volumes found for {domain} — nothing to snapshot")

    root = warm_root()
    os.makedirs(root, exist_ok=True)
    staging = os.path.join(root, f".{recipe}.staging")
    shutil.rmtree(staging, ignore_errors=True)  # drop leftovers from an interrupted run
    os.makedirs(os.path.join(staging, "volumes"), exist_ok=True)

    staged = False
    try:
        for vol in volumes:
            mp = _volume_mountpoint(vol)
            tar_path = os.path.join(staging, "volumes", f"{vol}.tar")
            # Tar the volume contents (relative to the mountpoint) so restore can untar back
            # in place.
            r = _run(["tar", "-C", mp, "-cf", tar_path, "."])
            if r.returncode != 0:
                raise SnapshotError(f"tar of volume {vol} failed: {r.stderr.strip()}")

        meta = {
            "recipe": recipe,
            "domain": domain,
            "commit": commit,
            "version": version,
            "volumes": volumes,
            "ts": _now(),
        }
        with open(os.path.join(staging, "meta.json"), "w", encoding="utf-8") as f:
            json.dump(meta, f)
        staged = True
    finally:
        # Any failure (inspect error, tar error, timeout, write error) must not leave a
        # half-built staging directory behind.
        if not staged:
            shutil.rmtree(staging, ignore_errors=True)

    # Atomic-ish swap: move current aside, move staging in, drop the old. One last-good retained.
    target = app_dir(recipe)
    old = os.path.join(root, f".{recipe}.old")
    shutil.rmtree(old, ignore_errors=True)
    moved_aside = False
    try:
        if os.path.exists(target):
            os.rename(target, old)
            moved_aside = True
        os.rename(staging, target)
    except OSError:
        # Put the previous last-good back so a failed swap never loses it, then discard the
        # snapshot that could not be installed.
        if moved_aside:
            os.rename(old, target)
        shutil.rmtree(staging, ignore_errors=True)
        raise
    shutil.rmtree(old, ignore_errors=True)
    return meta
|
||||
|
||||
|
||||
def _clear_volume_dir(mountpoint: str) -> None:
    """Delete everything inside *mountpoint* (dotfiles included) but keep the directory itself."""
    try:
        with os.scandir(mountpoint) as it:
            entries = list(it)  # materialise first: we delete while walking
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                os.unlink(entry.path)
    except OSError as err:
        raise SnapshotError(f"cannot clear volume directory {mountpoint}: {err}") from err


def restore(recipe: str, domain: str) -> dict:
    """Restore the last-known-good snapshot into <domain>'s stack volumes.

    The app MUST be undeployed. Every snapshot volume is first checked against the current
    stack and its mountpoint resolved; only once all of them pass is any volume modified.
    Each volume is then cleared and the snapshot tar extracted into it.

    Args:
        recipe: Snapshot key to restore from.
        domain: App domain whose stack volumes are overwritten.

    Returns:
        The snapshot's meta dict.

    Raises:
        SnapshotError: if the app is deployed, no complete snapshot exists, a snapshot volume
            is missing from the current stack or cannot be inspected, a volume cannot be
            cleared, or ``tar`` extraction fails.
    """
    _assert_undeployed(domain)
    meta = read_meta(recipe)
    if not meta or not has_snapshot(recipe):
        raise SnapshotError(f"no complete snapshot for {recipe} to restore")

    snap_volumes = meta.get("volumes", [])
    current = set(stack_volumes(domain))
    # Validate everything up front so a problem with a later volume cannot leave earlier
    # volumes already overwritten (a half-restored app).
    for vol in snap_volumes:
        if vol not in current:
            raise SnapshotError(f"snapshot volume {vol} absent from current stack {sorted(current)}")
    targets = [(vol, _volume_mountpoint(vol)) for vol in snap_volumes]

    tar_dir = volumes_dir(recipe)
    for vol, mp in targets:
        tar_path = os.path.join(tar_dir, f"{vol}.tar")
        # Clear the volume contents (incl. dotfiles) without removing the mountpoint itself.
        # Done in-process: no shell quoting of the path, and a failed clear is reported rather
        # than silently untarring over stale data.
        _clear_volume_dir(mp)
        r = _run(["tar", "-C", mp, "-xf", tar_path])
        if r.returncode != 0:
            raise SnapshotError(f"untar of volume {vol} failed: {r.stderr.strip()}")
    return meta
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
# Import here (not module top) — keeps the pure path/meta helpers importable in restricted envs.
|
||||
import datetime
|
||||
|
||||
return datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
|
||||
59
tests/unit/test_warmsnap.py
Normal file
59
tests/unit/test_warmsnap.py
Normal file
@ -0,0 +1,59 @@
|
||||
"""Unit tests for the WC3 snapshot helper's pure path/meta logic (runner/harness/warmsnap.py).
|
||||
|
||||
The docker/tar snapshot+restore round-trip is integration (proven live on cc-ci against a real
|
||||
keycloak volume, W0.5). Here we cover the layout, meta read, and the has_snapshot completeness check.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import warmsnap # noqa: E402
|
||||
|
||||
|
||||
def test_warm_root_env_override(monkeypatch):
    """With $CCCI_WARM_ROOT set, the root and every derived path follow it."""
    monkeypatch.setenv("CCCI_WARM_ROOT", "/tmp/ci-warm-test")
    assert warmsnap.warm_root() == "/tmp/ci-warm-test"
    expected = {
        warmsnap.app_dir: "/tmp/ci-warm-test/keycloak",
        warmsnap.meta_path: "/tmp/ci-warm-test/keycloak/meta.json",
        warmsnap.volumes_dir: "/tmp/ci-warm-test/keycloak/volumes",
    }
    for helper, path in expected.items():
        assert helper("keycloak") == path
|
||||
|
||||
|
||||
def test_warm_root_default(monkeypatch):
    """Without $CCCI_WARM_ROOT in the environment, the built-in default root is used."""
    monkeypatch.delenv("CCCI_WARM_ROOT", raising=False)
    root = warmsnap.warm_root()
    assert root == "/var/lib/ci-warm"
|
||||
|
||||
|
||||
def test_read_meta_absent(monkeypatch, tmp_path):
    """An empty warm root has no meta and therefore no snapshot."""
    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
    meta = warmsnap.read_meta("keycloak")
    assert meta is None
    present = warmsnap.has_snapshot("keycloak")
    assert present is False
|
||||
|
||||
|
||||
def _write_snapshot(tmp_path, recipe, volumes):
|
||||
appdir = tmp_path / recipe
|
||||
(appdir / "volumes").mkdir(parents=True)
|
||||
(appdir / "meta.json").write_text(json.dumps({"recipe": recipe, "volumes": volumes}))
|
||||
for v in volumes:
|
||||
(appdir / "volumes" / f"{v}.tar").write_bytes(b"fake")
|
||||
|
||||
|
||||
def test_has_snapshot_complete(monkeypatch, tmp_path):
    """Meta plus a tar for every declared volume counts as a complete snapshot."""
    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
    declared = ["a_mariadb", "a_providers"]
    _write_snapshot(tmp_path, "keycloak", declared)
    assert warmsnap.has_snapshot("keycloak") is True
    loaded = warmsnap.read_meta("keycloak")
    assert loaded["volumes"] == ["a_mariadb", "a_providers"]
|
||||
|
||||
|
||||
def test_has_snapshot_incomplete_missing_tar(monkeypatch, tmp_path):
    """A meta that declares a volume with no tar on disk is not a usable snapshot."""
    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
    # meta lists two volumes but only one tar present -> incomplete -> not usable.
    tar_dir = tmp_path / "keycloak" / "volumes"
    tar_dir.mkdir(parents=True)
    declared = {"recipe": "keycloak", "volumes": ["a", "b"]}
    (tmp_path / "keycloak" / "meta.json").write_text(json.dumps(declared))
    (tar_dir / "a.tar").write_bytes(b"fake")
    assert warmsnap.has_snapshot("keycloak") is False
|
||||
Reference in New Issue
Block a user