Files
cc-ci/runner/harness/warmsnap.py
autonomic-bot 32f00717ac fix(2w): W0.9 WC1.1 hardening (proven live: healthy upgrade + marquee rollback)
Bugs found by the live proof, fixed:
- warmsnap: snapshot now swaps a <recipe>/snapshot/ SUBDIR, not the whole
  <recipe>/ dir — so the reconciler's sibling last_good file survives a
  snapshot swap (was being clobbered).
- warm_reconcile: deploy_version captures abra's stdout (it writes FATA to
  stdout) in the error; add wait_undeployed() after every undeploy so
  snapshot/restore/redeploy don't race a half-removed swarm stack; the upgrade
  deploy is wrapped so a deploy FAILURE (not just unhealthy) also triggers
  rollback. (57 unit pass.)

LIVE PROOF on warm keycloak (annotated fake tags via CCCI_SKIP_FETCH):
(a) healthy upgrade 10.7.1->10.7.9: snapshot+deploy+health-pass, last_good
    committed=10.7.9, marker realm preserved.
(b) MARQUEE rollback: broken latest 10.7.10 (lint-fail) -> rollback to 10.7.9,
    HEALTHY, marker realm INTACT (data preserved through broken-upgrade+restore),
    last_good NOT advanced, rollback alert written (attempted=10.7.10,
    last_good=10.7.9, recovered=True). keycloak recovered to canonical
    10.7.1+26.6.2 healthy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 01:21:05 +01:00

187 lines
7.3 KiB
Python

"""Known-good snapshot/restore of an app's data volumes (Phase 2w / WC3).
A snapshot is a **raw copy of every docker volume belonging to an app's stack, taken while the app is
UNDEPLOYED** (nothing is writing → consistent). Stored under `/var/lib/ci-warm/<recipe>/snapshot/`
as one last-known-good per app, replaced atomically. Restore clears each volume and untars it back.
Used by:
- WC1.1 — snapshot keycloak's data volume BEFORE an auto-upgrade; restore on health-gate rollback
(a forward DB migration makes a version-only rollback unsafe).
- WC5 — promote-on-green-cold re-snapshots a canonical at teardown.
Warm snapshots are **cache, excluded from the D8 reproducibility closure** (WC8) — re-seeded by cold
runs, not restored on a VM rebuild.
Layout (atomic dir swap of the `snapshot/` subdir; one last-good per app). Sibling per-app state
(e.g. the reconciler's `last_good`) lives in `<recipe>/` and is NOT clobbered by the swap:
    $CCCI_WARM_ROOT/<recipe>/
        last_good                   # (reconciler) the version known healthy — survives snapshot swaps
        snapshot/
            meta.json               # {recipe, domain, commit, version, ts, volumes:[...]}
            volumes/<volname>.tar   # raw tar of the volume root, one per stack volume
Implementation note: volumes are tarred from their host mountpoint
(`docker volume inspect -f '{{.Mountpoint}}'`), so no sidecar image pull is needed. The caller runs
as root (the reconciler / runner on cc-ci) — direct mountpoint access is available there.
"""
from __future__ import annotations
import json
import os
import shutil
import subprocess
from . import lifecycle
# Default on-host root directory for warm snapshots; warm_root() returns this unless
# $CCCI_WARM_ROOT is set in the environment.
DEFAULT_WARM_ROOT = "/var/lib/ci-warm"
class SnapshotError(RuntimeError):
    """Raised when a warm snapshot or restore cannot be performed."""
def warm_root() -> str:
    """Return the directory under which all warm snapshots live.

    The value of ``$CCCI_WARM_ROOT`` wins when the variable is set (used by tests);
    otherwise ``DEFAULT_WARM_ROOT`` is returned.
    """
    override = os.environ.get("CCCI_WARM_ROOT")
    return DEFAULT_WARM_ROOT if override is None else override
def app_dir(recipe: str) -> str:
    """Return the per-app directory for *recipe* directly under the warm root."""
    root = warm_root()
    return os.path.join(root, recipe)
def snap_dir(recipe: str) -> str:
    """Return the ``snapshot`` subdirectory of the app dir for *recipe*.

    This subdirectory is the unit that gets swapped when a snapshot is replaced. It is
    deliberately distinct from app_dir() so that sibling per-app state (the reconciler's
    last_good) is left alone by a swap.
    """
    parent = app_dir(recipe)
    return os.path.join(parent, "snapshot")
def meta_path(recipe: str) -> str:
    """Return the path of ``meta.json`` inside the snapshot dir for *recipe*."""
    base = snap_dir(recipe)
    return os.path.join(base, "meta.json")
def volumes_dir(recipe: str) -> str:
    """Return the directory holding the per-volume tar files of *recipe*'s snapshot."""
    base = snap_dir(recipe)
    return os.path.join(base, "volumes")
def has_snapshot(recipe: str) -> bool:
    """Report whether a complete last-good snapshot exists for *recipe*.

    Complete means: the meta file is readable and non-empty, and every volume it declares
    has a matching ``<volume>.tar`` regular file in the snapshot's volumes directory.
    """
    meta = read_meta(recipe)
    if not meta:
        return False
    tar_root = volumes_dir(recipe)
    return all(
        os.path.isfile(os.path.join(tar_root, f"{name}.tar"))
        for name in meta.get("volumes", [])
    )
def read_meta(recipe: str) -> dict | None:
    """Load the snapshot metadata for *recipe*.

    Returns the parsed ``meta.json`` object, or ``None`` when the file is missing,
    unreadable, not valid JSON, or valid JSON that is not an object. Callers use
    ``.get`` on the result, so anything other than a dict is treated as "no usable meta"
    instead of being handed back and crashing them later.
    """
    try:
        with open(meta_path(recipe), encoding="utf-8") as f:
            meta = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return None
    return meta if isinstance(meta, dict) else None
def _run(cmd: list[str], timeout: int = 600) -> subprocess.CompletedProcess:
    """Run *cmd* (an argv list, no shell) and return the completed process.

    stdout and stderr are captured as text; *timeout* is in seconds and is passed straight
    to ``subprocess.run``. The return code is NOT checked here — callers inspect it.
    """
    completed = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    return completed
def _volume_mountpoint(volume: str) -> str:
    """Return the host mountpoint of docker volume *volume*.

    Asks ``docker volume inspect`` for the ``Mountpoint`` field (30 s timeout).

    Raises:
        SnapshotError: if docker exits non-zero or prints an empty mountpoint.
    """
    inspect_cmd = ["docker", "volume", "inspect", "-f", "{{.Mountpoint}}", volume]
    result = _run(inspect_cmd, timeout=30)
    mountpoint = result.stdout.strip()
    if result.returncode == 0 and mountpoint:
        return mountpoint
    raise SnapshotError(f"cannot inspect volume {volume}: {result.stderr.strip()}")
def stack_volumes(domain: str) -> list[str]:
    """Return the sorted names of the docker volumes of *domain*'s stack.

    Delegates to lifecycle's stack-name and docker-name helpers rather than re-scanning.
    """
    stack = lifecycle._stack_name(domain)  # noqa: SLF001 — shared internal in our package
    names = list(lifecycle._docker_names("volume", stack))  # noqa: SLF001
    names.sort()
    return names
def _assert_undeployed(domain: str) -> None:
    """Guard: refuse to continue while *domain*'s stack still has services.

    Snapshot and restore both require the app to be undeployed, so that nothing is writing
    to the volumes while they are copied or overwritten.

    Raises:
        SnapshotError: if lifecycle reports any service for the stack.
    """
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    running = lifecycle._docker_names("service", stack)  # noqa: SLF001
    if not running:
        return
    raise SnapshotError(
        f"refusing to snapshot/restore {domain} while deployed (services: {running}); "
        "undeploy first (WC3: snapshot while undeployed)"
    )
def snapshot(recipe: str, domain: str, commit: str | None = None, version: str | None = None) -> dict:
    """Take a last-known-good snapshot of every data volume of <domain>'s stack.

    The app MUST be undeployed. Every stack volume is tarred into a staging directory next to
    the live snapshot, ``meta.json`` is written beside the tars, and only then is the staging
    directory swapped in as ``<recipe>/snapshot``. Sibling per-app state in ``<recipe>/``
    (e.g. the reconciler's ``last_good``) is never touched.

    Args:
        recipe: recipe name; selects the per-app directory under the warm root.
        domain: app domain whose stack volumes are snapshotted.
        commit: optional commit id recorded verbatim in the meta.
        version: optional version string recorded verbatim in the meta.

    Returns:
        The meta dict that was written (recipe, domain, commit, version, volumes, ts).

    Raises:
        SnapshotError: the app is still deployed, the stack has no volumes, a volume cannot
            be inspected, or tar fails.
        OSError: the staging directory or the final swap could not be created/renamed.
    """
    _assert_undeployed(domain)
    volumes = stack_volumes(domain)
    if not volumes:
        raise SnapshotError(f"no volumes found for {domain} — nothing to snapshot")

    root = app_dir(recipe)
    os.makedirs(root, exist_ok=True)
    staging = os.path.join(root, ".snapshot.staging")
    shutil.rmtree(staging, ignore_errors=True)  # drop leftovers of an earlier aborted run
    os.makedirs(os.path.join(staging, "volumes"), exist_ok=True)

    try:
        for vol in volumes:
            mp = _volume_mountpoint(vol)
            tar_path = os.path.join(staging, "volumes", f"{vol}.tar")
            # Tar the volume contents (relative to the mountpoint) so restore can untar back in place.
            r = _run(["tar", "-C", mp, "-cf", tar_path, "."])
            if r.returncode != 0:
                raise SnapshotError(f"tar of volume {vol} failed: {r.stderr.strip()}")
        meta = {
            "recipe": recipe,
            "domain": domain,
            "commit": commit,
            "version": version,
            "volumes": volumes,
            "ts": _now(),
        }
        with open(os.path.join(staging, "meta.json"), "w", encoding="utf-8") as f:
            json.dump(meta, f)
    except Exception:
        # Any failure while building the staging copy (inspect error, tar error, timeout,
        # meta write error) must not leave a half-built staging dir behind.
        shutil.rmtree(staging, ignore_errors=True)
        raise

    # Atomic-ish swap of the snapshot subdir only (sibling state like last_good is untouched).
    target = snap_dir(recipe)
    old = os.path.join(root, ".snapshot.old")
    moved_aside = False
    try:
        if os.path.exists(target):
            # Only clear the parking spot when we actually need it: if a previous run died
            # mid-swap, ``old`` may be the only good snapshot left and must survive until the
            # new one is in place.
            shutil.rmtree(old, ignore_errors=True)
            os.rename(target, old)
            moved_aside = True
        os.rename(staging, target)
    except OSError:
        # Swap failed: put the previous last-good back so the app is never left without one.
        if moved_aside and not os.path.exists(target):
            os.rename(old, target)
        shutil.rmtree(staging, ignore_errors=True)
        raise
    shutil.rmtree(old, ignore_errors=True)
    return meta
def _clear_volume_dir(path: str) -> None:
    """Delete everything inside *path* (dotfiles included) but keep *path* itself."""
    try:
        # Materialise the listing first so entries are not removed while the directory
        # iterator is still open.
        with os.scandir(path) as it:
            entries = list(it)
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                # Regular files, symlinks (never followed), sockets, fifos, ...
                os.unlink(entry.path)
    except OSError as err:
        raise SnapshotError(f"cannot clear volume dir {path}: {err}") from err


def restore(recipe: str, domain: str) -> dict:
    """Restore the last-known-good snapshot into <domain>'s stack volumes.

    The app MUST be undeployed. All snapshot volumes are checked against the current stack
    BEFORE anything is modified, so a missing volume cannot leave the stack half-restored.
    Each volume is then emptied and the matching tar is extracted into its mountpoint.

    Args:
        recipe: recipe name; selects which stored snapshot to restore.
        domain: app domain whose stack volumes are overwritten.

    Returns:
        The snapshot's meta dict.

    Raises:
        SnapshotError: the app is still deployed, no complete snapshot exists, a snapshot
            volume is absent from the current stack, a volume cannot be inspected or
            cleared, or tar extraction fails.
    """
    _assert_undeployed(domain)
    meta = read_meta(recipe)
    if not meta or not has_snapshot(recipe):
        raise SnapshotError(f"no complete snapshot for {recipe} to restore")

    snap_volumes = meta.get("volumes", [])
    current = set(stack_volumes(domain))
    # Validate up front: never wipe volume A only to discover volume B is missing.
    for vol in snap_volumes:
        if vol not in current:
            raise SnapshotError(f"snapshot volume {vol} absent from current stack {sorted(current)}")

    for vol in snap_volumes:
        tar_path = os.path.join(volumes_dir(recipe), f"{vol}.tar")
        mp = _volume_mountpoint(vol)
        # Clear the volume contents (incl. dotfiles) without removing the mountpoint itself.
        # Done in-process (no shell string) and failures are surfaced: extracting over a
        # volume that was not really emptied would mix stale files into the restored data.
        _clear_volume_dir(mp)
        r = _run(["tar", "-C", mp, "-xf", tar_path])
        if r.returncode != 0:
            raise SnapshotError(f"untar of volume {vol} failed: {r.stderr.strip()}")
    return meta
def _now() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    # Import here (not module top) — keeps the pure path/meta helpers importable in restricted envs.
    import datetime

    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC "now" and drop the
    # tzinfo so isoformat() yields the same naive form as before, then append the "Z" suffix.
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.replace(microsecond=0, tzinfo=None).isoformat() + "Z"