feat(2): HQ1 image pre-pull (plan-prepull-images.md) — warm local store before deploy

lifecycle.prepull_images(recipe, domain): resolve images via docker compose config --images (COMPOSE_FILE
from the app .env — handles $VERSION interpolation + multi-compose) → docker pull each, skip-if-present
(zero network for cached pinned tags). Called in deploy_app before the (unchanged, real) abra.deploy AND
in generic.perform_upgrade before the chaos redeploy (warms new-version images). A pull failure RAISES a
clear pre-deploy error (not a converge timeout); deploy path unchanged (no docker service update/scale).
Removes PULL time not app-INIT time. 4 unit tests (tests/unit/test_prepull.py): present→skip, missing→
pull, pull-fail→raise, no-images→skip. NOT claimed yet — validating cold-verify criteria next.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-29 16:02:21 +01:00
parent e6e5436942
commit 2bf40d69d6
3 changed files with 142 additions and 0 deletions

View File

@ -237,6 +237,9 @@ def perform_upgrade(
before = lifecycle.deployed_identity(domain)
if head_ref:
lifecycle.recipe_checkout_ref(recipe, head_ref)
# HQ1: warm the NEW-version image set before the chaos redeploy (the head_ref checkout's pinned
# tags) so a pull failure is a clear pre-deploy error and convergence isn't pull-bound.
lifecycle.prepull_images(recipe, domain)
lifecycle.chaos_redeploy(domain, deploy_timeout=deploy_timeout, no_converge_checks=True)
# Own the convergence verification (abra's monitor was skipped via -c).
lifecycle.wait_healthy(

View File

@ -122,6 +122,62 @@ def _run_install_steps(hook: tuple[str, str], recipe: str, domain: str) -> None:
)
def prepull_images(recipe: str, domain: str) -> None:
    """Pre-pull a recipe's images into the local store BEFORE the deploy (HQ1, plan-prepull-images.md).

    A pull failure (rate-limit / bad tag) then surfaces here as a clear pull error instead of
    later as a murky 'not converged' deploy timeout, and images that are already local are not
    pulled again. The deploy itself is untouched: this only warms the local image store, so it
    removes pull time, not application init time.

    Images are resolved with `docker compose config --images`, run from the recipe directory with
    the COMPOSE_FILE list evaluated from the app .env (so $VERSION-style interpolation and
    multi-compose recipes resolve), then each image missing locally is fetched with `docker pull`.

    Resolution is best-effort: a missing recipe dir / .env, a missing `bash`/`docker` binary, a
    failing `config --images`, or an empty image list prints a note and returns so the deploy can
    pull as usual. No timeout is applied to any subprocess, so a slow pull blocks rather than fails.

    Args:
        recipe: recipe name; its checkout is expected under ~/.abra/recipes/<recipe>.
        domain: app domain; its env file is expected at ~/.abra/servers/default/<domain>.env.

    Raises:
        RuntimeError: if `docker pull` for a resolved image exits non-zero.
    """
    import os

    recipe_dir = os.path.expanduser(f"~/.abra/recipes/{recipe}")
    env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
    if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path):
        print(f" prepull: recipe dir or .env missing for {recipe} — skipping", flush=True)
        return

    try:
        # COMPOSE_FILE is a shell-style ':'-separated list (may self-reference $COMPOSE_FILE for
        # multi-compose), so let bash evaluate the .env. The path is handed over as positional
        # "$1" instead of being spliced into the script text, so quotes/`$`/backticks in the
        # domain or home path can neither break the command nor inject shell.
        cf = subprocess.run(
            [
                "bash",
                "-c",
                'set -a; . "$1"; printf "%s" "${COMPOSE_FILE:-compose.yml}"',
                "bash",
                env_path,
            ],
            capture_output=True,
            text=True,
        ).stdout.strip()
        files = [f for f in cf.split(":") if f] or ["compose.yml"]

        # --env-file supplies $VERSION-style interpolation so pinned tags resolve correctly.
        args = ["docker", "compose", "--env-file", env_path]
        for f in files:
            args += ["-f", f]
        args += ["config", "--images"]
        proc = subprocess.run(args, cwd=recipe_dir, capture_output=True, text=True)
    except OSError as exc:
        # bash/docker not installed or not executable: resolution is best-effort, never fatal.
        print(
            f" prepull: could not resolve images for {recipe} ({exc}) — "
            f"skipping (deploy will pull as usual)",
            flush=True,
        )
        return

    # `config --images` prints one image ref per line to stdout (warnings go to stderr).
    images = sorted({ln.strip() for ln in proc.stdout.splitlines() if ln.strip()})
    # A non-zero rc means stdout cannot be trusted as a list of image refs; treat it like an
    # empty result rather than hard-failing the deploy on a bogus `docker pull`.
    if proc.returncode != 0 or not images:
        print(
            f" prepull: no images resolved for {recipe} (config --images rc={proc.returncode}) — "
            f"skipping (deploy will pull as usual). stderr: {proc.stderr.strip()[-160:]}",
            flush=True,
        )
        return

    for img in images:
        # `docker image inspect` exits 0 only when the ref is already in the local store.
        if subprocess.run(["docker", "image", "inspect", img], capture_output=True).returncode == 0:
            print(f" prepull: present {img}", flush=True)
            continue
        print(f" prepull: pulling {img}", flush=True)
        r = subprocess.run(["docker", "pull", img], capture_output=True, text=True)
        if r.returncode != 0:
            # Real pull errors are never masked: surface the tail of docker's own message.
            raise RuntimeError(
                f"prepull: `docker pull {img}` failed (rc={r.returncode}) — clear pull error BEFORE "
                f"deploy: {r.stderr.strip()[-300:] or r.stdout.strip()[-300:]}"
            )
    print(f" prepull: {len(images)} image(s) present/pulled for {recipe}", flush=True)
def deploy_app(
recipe: str,
domain: str,
@ -173,6 +229,8 @@ def deploy_app(
abra.secret_generate(domain)
if install_steps_hook:
_run_install_steps(install_steps_hook, recipe, domain)
# HQ1: warm the local image store before the (real, unchanged) abra deploy.
prepull_images(recipe, domain)
abra.deploy(domain, chaos=chaos, timeout=deploy_timeout)

View File

@ -0,0 +1,81 @@
"""Unit tests for HQ1 image pre-pull (lifecycle.prepull_images) — deterministic, mocked docker.
Proves the pre-pull is non-vacuous (the Adversary's criteria, REVIEW-2 754f508):
- present image → SKIP (no `docker pull`, zero network — the warm-cache property).
- missing image → `docker pull` it.
- a pull FAILURE → RAISE a clear pull error (so a bad tag fails fast PRE-deploy, not as a converge
timeout). NOT vacuous.
- no images resolved → best-effort skip (deploy pulls as usual), no raise.
And that resolution uses the recipe's COMPOSE_FILE via `docker compose config --images` (not grep).
"""
from __future__ import annotations
import os
import sys
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle as lc # noqa: E402
class _R:
    """Bare result object exposing the three attributes prepull_images reads from subprocess.run."""

    def __init__(self, stdout="", stderr="", returncode=0):
        self.stdout = stdout
        self.stderr = stderr
        self.returncode = returncode
def _patch_paths(monkeypatch):
    """Make os.path.isdir / os.path.isfile report True for every path via *monkeypatch*."""
    for probe in ("isdir", "isfile"):
        monkeypatch.setattr(os.path, probe, lambda p: True)
def _runner(monkeypatch, *, images="img-a:1\nimg-b:2\n", present=(), pull_rc=0, pull_err=""):
    """Swap lc.subprocess.run for a scripted fake and return the list it records argv into.

    The fake answers by command shape: the bash COMPOSE_FILE evaluation yields "compose.yml",
    `config --images` yields *images*, `docker image inspect` succeeds only for refs in
    *present*, `docker pull` returns *pull_rc* / *pull_err*, and anything else is an empty success.
    """
    recorded: list[list[str]] = []

    def fake_run(args, **kw):
        recorded.append(list(args))
        # Branch order matters: the bash call is matched first, before the "config" membership test.
        if args[0] == "bash":
            return _R(stdout="compose.yml")  # COMPOSE_FILE eval
        if "config" in args and "--images" in args:
            return _R(stdout=images)
        if args[:3] == ["docker", "image", "inspect"]:
            is_local = args[3] in present
            return _R(returncode=0 if is_local else 1)
        if args[:2] == ["docker", "pull"]:
            return _R(returncode=pull_rc, stderr=pull_err)
        return _R()

    monkeypatch.setattr(lc.subprocess, "run", fake_run)
    return recorded
def test_prepull_skips_present_images_zero_network(monkeypatch):
    """Every resolved image is already local: no `docker pull` runs, but resolution still does."""
    _patch_paths(monkeypatch)
    recorded = _runner(monkeypatch, present=("img-a:1", "img-b:2"))

    lc.prepull_images("r", "d")  # both present → no pull

    pull_cmds = [cmd for cmd in recorded if cmd[:2] == ["docker", "pull"]]
    assert not pull_cmds, "must NOT pull a present image"
    # it DID resolve via `docker compose ... config --images`
    resolve_cmds = [cmd for cmd in recorded if "config" in cmd and "--images" in cmd]
    assert resolve_cmds
def test_prepull_pulls_missing_image(monkeypatch):
    """Only the image absent from the local store is pulled."""
    _patch_paths(monkeypatch)
    recorded = _runner(monkeypatch, present=("img-a:1",))  # img-b:2 missing

    lc.prepull_images("r", "d")

    pulled = []
    for cmd in recorded:
        if cmd[:2] == ["docker", "pull"]:
            pulled.append(cmd[2])
    assert pulled == ["img-b:2"], f"should pull only the missing image; pulled={pulled}"
def test_prepull_raises_clear_error_on_pull_failure(monkeypatch):
    """A non-zero `docker pull` must surface as a RuntimeError carrying the pre-deploy wording."""
    _patch_paths(monkeypatch)
    _runner(monkeypatch, present=(), pull_rc=1, pull_err="manifest unknown: bad tag")

    expected_message = "clear pull error BEFORE deploy"
    with pytest.raises(RuntimeError, match=expected_message):
        lc.prepull_images("r", "d")
def test_prepull_skips_when_no_images_resolved(monkeypatch):
    """An empty `config --images` result is a quiet skip: no exception and no pull attempted."""
    _patch_paths(monkeypatch)
    recorded = _runner(monkeypatch, images="")  # config --images returns nothing

    lc.prepull_images("r", "d")  # no raise

    pull_cmds = [cmd for cmd in recorded if cmd[:2] == ["docker", "pull"]]
    assert not pull_cmds