fix(2w): W0.9 WC1.1 hardening (proven live: healthy upgrade + marquee rollback)
Bugs found by the live proof, fixed:
- warmsnap: snapshot now swaps a <recipe>/snapshot/ SUBDIR, not the whole
<recipe>/ dir — so the reconciler's sibling last_good file survives a
snapshot swap (was being clobbered).
- warm_reconcile: deploy_version captures abra's stdout (it writes FATA to
stdout) in the error; add wait_undeployed() after every undeploy so
snapshot/restore/redeploy don't race a half-removed swarm stack; the upgrade
deploy is wrapped so a deploy FAILURE (not just unhealthy) also triggers
rollback. (57 unit tests pass.)
LIVE PROOF on warm keycloak (annotated fake tags via CCCI_SKIP_FETCH):
(a) healthy upgrade 10.7.1->10.7.9: snapshot+deploy+health-pass, last_good
committed=10.7.9, marker realm preserved.
(b) MARQUEE rollback: broken latest 10.7.10 (lint-fail) -> rollback to 10.7.9,
HEALTHY, marker realm INTACT (data preserved through broken-upgrade+restore),
last_good NOT advanced, rollback alert written (attempted=10.7.10,
last_good=10.7.9, recovered=True). keycloak recovered to canonical
10.7.1+26.6.2 healthy.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -12,10 +12,13 @@ Used by:
|
||||
Warm snapshots are **cache, excluded from the D8 reproducibility closure** (WC8) — re-seeded by cold
|
||||
runs, not restored on a VM rebuild.
|
||||
|
||||
Layout (atomic dir swap on update; one last-good per app):
|
||||
Layout (atomic dir swap of the `snapshot/` subdir; one last-good per app). Sibling per-app state
|
||||
(e.g. the reconciler's `last_good`) lives in `<recipe>/` and is NOT clobbered by the swap:
|
||||
$CCCI_WARM_ROOT/<recipe>/
|
||||
meta.json # {recipe, domain, commit, version, ts, volumes:[...]}
|
||||
volumes/<volname>.tar # raw tar of the volume root, one per stack volume
|
||||
last_good # (reconciler) the version known healthy — survives snapshot swaps
|
||||
snapshot/
|
||||
meta.json # {recipe, domain, commit, version, ts, volumes:[...]}
|
||||
volumes/<volname>.tar # raw tar of the volume root, one per stack volume
|
||||
|
||||
Implementation note: volumes are tarred from their host mountpoint
|
||||
(`docker volume inspect -f '{{.Mountpoint}}'`), so no sidecar image pull is needed. The caller runs
|
||||
@ -47,12 +50,18 @@ def app_dir(recipe: str) -> str:
|
||||
return os.path.join(warm_root(), recipe)
|
||||
|
||||
|
||||
def snap_dir(recipe: str) -> str:
    """Return the path of the recipe's ``snapshot`` subdirectory.

    This subdirectory is the unit that gets atomically swapped on update. It is nested inside
    app_dir(recipe) instead of being app_dir itself, so sibling per-app state (the reconciler's
    last_good) survives a snapshot swap.
    """
    parent = app_dir(recipe)
    return os.path.join(parent, "snapshot")
|
||||
|
||||
|
||||
def meta_path(recipe: str) -> str:
    """Return the path of the snapshot's ``meta.json``.

    The file lives inside the swapped snapshot subdir (snap_dir), not directly in the per-app
    directory, so it is replaced together with the volume tars it describes.
    """
    return os.path.join(snap_dir(recipe), "meta.json")
|
||||
|
||||
|
||||
def volumes_dir(recipe: str) -> str:
    """Return the directory holding the snapshot's per-volume tar files.

    The directory lives inside the swapped snapshot subdir (snap_dir), not directly in the per-app
    directory, so the tars are replaced together with the meta.json that lists them.
    """
    return os.path.join(snap_dir(recipe), "volumes")
|
||||
|
||||
|
||||
def has_snapshot(recipe: str) -> bool:
|
||||
@ -112,9 +121,8 @@ def snapshot(recipe: str, domain: str, commit: str | None = None, version: str |
|
||||
if not volumes:
|
||||
raise SnapshotError(f"no volumes found for {domain} — nothing to snapshot")
|
||||
|
||||
root = warm_root()
|
||||
os.makedirs(root, exist_ok=True)
|
||||
staging = os.path.join(root, f".{recipe}.staging")
|
||||
os.makedirs(app_dir(recipe), exist_ok=True)
|
||||
staging = os.path.join(app_dir(recipe), ".snapshot.staging")
|
||||
shutil.rmtree(staging, ignore_errors=True)
|
||||
os.makedirs(os.path.join(staging, "volumes"), exist_ok=True)
|
||||
|
||||
@ -138,9 +146,9 @@ def snapshot(recipe: str, domain: str, commit: str | None = None, version: str |
|
||||
with open(os.path.join(staging, "meta.json"), "w") as f:
|
||||
json.dump(meta, f)
|
||||
|
||||
# Atomic-ish swap: move current aside, move staging in, drop the old. One last-good retained.
|
||||
target = app_dir(recipe)
|
||||
old = os.path.join(root, f".{recipe}.old")
|
||||
# Atomic-ish swap of the snapshot subdir only (sibling state like last_good is untouched).
|
||||
target = snap_dir(recipe)
|
||||
old = os.path.join(app_dir(recipe), ".snapshot.old")
|
||||
shutil.rmtree(old, ignore_errors=True)
|
||||
if os.path.exists(target):
|
||||
os.rename(target, old)
|
||||
|
||||
@ -207,11 +207,26 @@ def release_notes(recipe: str, version: str) -> str:
|
||||
def deploy_version(recipe: str, domain: str, version: str, timeout: int) -> None:
    """Deploy a specific published version: checkout the tag (so the on-disk tree matches) then a
    pinned non-chaos redeploy with the version positional (so abra records TYPE=<recipe>:<version>).
    `-f` makes it idempotent against an already-deployed app. abra writes FATA to stdout, so include
    both streams in the error.

    Raises:
        RuntimeError: if `abra app deploy` exits non-zero. The message carries at most 400
            characters of stderr followed by stdout.
    """
    abra.recipe_checkout(recipe, version)
    r = _run(["abra", "app", "deploy", domain, version, "-o", "-n", "-f"], timeout=timeout)
    if r.returncode != 0:
        # A stream that was not captured is None; treat it as empty so the real deploy failure is
        # reported instead of an AttributeError.
        err = (r.stderr or "").strip()
        out = (r.stdout or "").strip()
        msg = (err + " " + out).strip()[:400]
        raise RuntimeError(f"deploy {domain} {version} failed: {msg}")
|
||||
|
||||
|
||||
def wait_undeployed(domain: str, timeout: int = 120) -> None:
    """Block until the app's swarm stack is fully removed after an undeploy. abra's undeploy may
    return before swarm finishes tearing down tasks; snapshot/restore (which require undeployed) and
    an immediate redeploy of the same stack name otherwise race a half-removed stack.

    The stack is polled at least once, even with ``timeout <= 0``, and once more when the deadline
    is reached, so a stack that disappears during the final wait is not reported as a failure.

    Args:
        domain: app domain; its stack name comes from ``lifecycle._stack_name``.
        timeout: maximum number of seconds to wait.

    Raises:
        RuntimeError: if ``lifecycle._docker_names("service", stack)`` is still non-empty once the
            timeout has elapsed.
    """
    stack = lifecycle._stack_name(domain)  # noqa: SLF001
    # Monotonic clock: a wall-clock adjustment must not shorten or extend the wait.
    deadline = time.monotonic() + timeout
    while True:
        if not lifecycle._docker_names("service", stack):  # noqa: SLF001
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(2, remaining))
    raise RuntimeError(f"{domain} stack not fully undeployed after {timeout}s")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- last-good + alerts
|
||||
@ -332,6 +347,7 @@ def reconcile(app: str) -> str:
|
||||
print(f"[{app}] auto-upgrade {last_good} → {latest} (health-gated)", flush=True)
|
||||
if stateful:
|
||||
abra.undeploy(domain)
|
||||
wait_undeployed(domain)
|
||||
warmsnap.snapshot(recipe, domain, version=last_good)
|
||||
# snapshot requires undeployed; now bring up latest.
|
||||
# A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a
|
||||
@ -353,6 +369,7 @@ def reconcile(app: str) -> str:
|
||||
print(f"[{app}] latest {latest} UNHEALTHY → rolling back to {last_good}", flush=True)
|
||||
if stateful:
|
||||
abra.undeploy(domain)
|
||||
wait_undeployed(domain)
|
||||
warmsnap.restore(recipe, domain)
|
||||
deploy_version(recipe, domain, last_good, dt)
|
||||
recovered = wait_healthy(spec)
|
||||
|
||||
@ -18,8 +18,10 @@ def test_warm_root_env_override(monkeypatch):
|
||||
monkeypatch.setenv("CCCI_WARM_ROOT", "/tmp/ci-warm-test")
|
||||
assert warmsnap.warm_root() == "/tmp/ci-warm-test"
|
||||
assert warmsnap.app_dir("keycloak") == "/tmp/ci-warm-test/keycloak"
|
||||
assert warmsnap.meta_path("keycloak") == "/tmp/ci-warm-test/keycloak/meta.json"
|
||||
assert warmsnap.volumes_dir("keycloak") == "/tmp/ci-warm-test/keycloak/volumes"
|
||||
# snapshot lives in a subdir so sibling state (last_good) survives the atomic swap.
|
||||
assert warmsnap.snap_dir("keycloak") == "/tmp/ci-warm-test/keycloak/snapshot"
|
||||
assert warmsnap.meta_path("keycloak") == "/tmp/ci-warm-test/keycloak/snapshot/meta.json"
|
||||
assert warmsnap.volumes_dir("keycloak") == "/tmp/ci-warm-test/keycloak/snapshot/volumes"
|
||||
|
||||
|
||||
def test_warm_root_default(monkeypatch):
|
||||
@ -34,11 +36,11 @@ def test_read_meta_absent(monkeypatch, tmp_path):
|
||||
|
||||
|
||||
def _write_snapshot(tmp_path, recipe, volumes):
    """Create a fake, complete snapshot for `recipe` under `tmp_path`.

    Writes `<tmp_path>/<recipe>/snapshot/meta.json` listing `volumes`, plus one placeholder
    `<volume>.tar` per entry in `snapshot/volumes/`. Nothing is written directly in
    `<tmp_path>/<recipe>/`.
    """
    snapdir = tmp_path / recipe / "snapshot"
    (snapdir / "volumes").mkdir(parents=True)
    (snapdir / "meta.json").write_text(json.dumps({"recipe": recipe, "volumes": volumes}))
    for v in volumes:
        (snapdir / "volumes" / f"{v}.tar").write_bytes(b"fake")
|
||||
|
||||
|
||||
def test_has_snapshot_complete(monkeypatch, tmp_path):
|
||||
@ -49,11 +51,20 @@ def test_has_snapshot_complete(monkeypatch, tmp_path):
|
||||
assert meta["volumes"] == ["a_mariadb", "a_providers"]
|
||||
|
||||
|
||||
def test_last_good_survives_sibling_of_snapshot(monkeypatch, tmp_path):
    # Layout check only: snapshot/ must be a direct child of <recipe>/, which is where the
    # reconciler's last_good lives, so a swap of snapshot/ cannot clobber last_good.
    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
    expected_parent = str(tmp_path / "keycloak")
    snap = warmsnap.snap_dir("keycloak")
    assert snap.startswith(expected_parent)
    assert os.path.dirname(snap) == expected_parent
|
||||
|
||||
|
||||
def test_has_snapshot_incomplete_missing_tar(monkeypatch, tmp_path):
    """A snapshot whose meta.json lists a volume with no matching tar is not usable."""
    monkeypatch.setenv("CCCI_WARM_ROOT", str(tmp_path))
    # meta lists two volumes but only one tar present -> incomplete -> not usable.
    snapdir = tmp_path / "keycloak" / "snapshot"
    (snapdir / "volumes").mkdir(parents=True)
    (snapdir / "meta.json").write_text(json.dumps({"recipe": "keycloak", "volumes": ["a", "b"]}))
    (snapdir / "volumes" / "a.tar").write_bytes(b"fake")
    assert warmsnap.has_snapshot("keycloak") is False
|
||||
|
||||
Reference in New Issue
Block a user