fix(1e): F1e-1 exec_in_app race + HC1 head_ref/move hardening

F1e-1 (Adversary): exec_in_app silently returned '' when `docker exec` failed, flipping a healthy
recipe RED under opt-out (the container cycles after a backup and there is no readiness buffer). It
now polls (re-resolving the container and re-running the exec) until rc==0 or 90s have elapsed, then
RAISES — an exec failure is never masked as empty data.
No assertion was weakened. Verified: opt-out install, backup, restore on custom-html now PASS.

HC1: head_ref = ref or recipe_head_commit (prefer explicit PR head sha $REF — robust, no git race;
production !testme always sets REF). assert_upgraded, when head_ref known, REQUIRES the deployed
chaos-version commit to MATCH head_ref (direct + non-vacuous proof the PR-head code was deployed; a
stale prev-checkout chaos redeploy fails). Falls back to version/image/chaos move check otherwise.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-28 03:41:42 +01:00
parent 4334e19a7b
commit 6eabfdc0fb
5 changed files with 149 additions and 35 deletions

View File

@ -252,11 +252,11 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None
- `version` = the `coop-cloud.<stack>.version` label (bumped per published recipe version).
- `image` = the running container image (usually bumps with a published version).
- `chaos` = the chaos label value (a chaos deploy stamps the recipe git commit/dirty state here)
— present after `abra app deploy --chaos`, absent on a clean pinned-tag deploy. For prev→PR-head
this is THE proof PR-head was deployed even when the version label is unbumped (HC1). The exact
chaos label key varies by abra version, so we capture any `coop-cloud.<stack>.*` label whose key
contains "chaos"."""
- `chaos` = the chaos deploy's recipe git commit. abra stamps `coop-cloud.<stack>.chaos-version`
= the deployed recipe commit (e.g. "91b27ceb") + `coop-cloud.<stack>.chaos`="true" on a
`--chaos` deploy; both are absent on a clean pinned-tag deploy. We prefer the `.chaos-version`
commit — for prev→PR-head it IS the proof the PR-head code under test was deployed even when the
version label is unbumped (HC1); fall back to the `.chaos` flag if no commit is present."""
name = f"{_stack_name(domain)}_{service}"
proc = subprocess.run(
[
@ -274,22 +274,43 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None
if "|" not in out:
return {"version": None, "image": None, "chaos": None}
labels_json, _, image = out.partition("|")
ver = chaos = None
ver = chaos = chaos_flag = None
with contextlib.suppress(ValueError, json.JSONDecodeError):
for k, v in json.loads(labels_json).items():
if not k.startswith("coop-cloud."):
continue
if k.endswith(".version"):
ver = v
elif "chaos" in k:
chaos = v
return {"version": ver, "image": image.strip() or None, "chaos": chaos}
elif k.endswith(".chaos-version"):
chaos = v # the deployed recipe commit — the strongest signal
elif k.endswith(".chaos"):
chaos_flag = v
return {"version": ver, "image": image.strip() or None, "chaos": chaos or chaos_flag}
def upgrade_app(domain: str, version: str | None = None) -> None:
    """Upgrade the app at `domain` by delegating to `abra.upgrade`.

    `version` is forwarded unchanged as a keyword argument; what `None` selects is decided by
    `abra.upgrade` (presumably abra's default target — confirm against that helper).
    """
    abra.upgrade(domain, version=version)
def recipe_head_commit(recipe: str) -> str | None:
    """Return the current HEAD commit of the recipe checkout, as reported by abra.

    Intended to be captured right after the fetch and before any version-tag checkout, so the
    upgrade tier can later re-checkout the PR head for the chaos redeploy (HC1).
    """
    head = abra.recipe_head_commit(recipe)
    return head
def recipe_checkout_ref(recipe: str, ref: str) -> None:
    """Check the recipe out at an arbitrary git ref or commit via `abra.recipe_checkout`.

    HC1: used to restore the PR-head checkout before the chaos upgrade, because the prev-tag base
    deploy reset the checkout to the published tag.
    """
    abra.recipe_checkout(recipe, ref)
def chaos_redeploy(domain: str) -> None:
    """Redeploy the running app in place with `abra app deploy --chaos`.

    The deploy uses whatever the recipe checkout currently points at (HC1: the PR-head code under
    test). This is the upgrade operation rather than a fresh install: it bypasses deploy_app, so
    the deploy-count guard (DG4.1) is not incremented.
    """
    abra.deploy(domain, chaos=True)
def backup_app(domain: str) -> str:
    """Create a backup of the app at `domain` and return the abra/restic output.

    The returned text carries the snapshot_id of the snapshot that was produced.
    """
    output = abra.backup_create(domain)
    return output
@ -326,10 +347,26 @@ def _app_container(domain: str, service: str = "app", timeout: int = 60) -> str:
time.sleep(3)
def exec_in_app(domain: str, cmd: list[str], service: str = "app") -> str:
cid = _app_container(domain, service)
proc = subprocess.run(["docker", "exec", cid, *cmd], capture_output=True, text=True)
return proc.stdout
def exec_in_app(domain: str, cmd: list[str], service: str = "app", timeout: int = 90) -> str:
    """Run `docker exec` in the app's container and return its stdout.

    Hardened (Adversary F1e-1): a lifecycle op (backup/restore) cycles the container, so a
    freshly-resolved container can be mid-transition and `docker exec` FAILS. This polls —
    re-resolving the container on every attempt — until the exec succeeds (returncode 0) or
    `timeout` seconds have elapsed, then RAISES. It never silently returns '' on a failed exec:
    that masked a container-cycle race as empty data, flipping a healthy recipe RED under opt-out,
    and could mask a real failure as a pass elsewhere.

    Args:
        domain: app domain whose container is resolved via `_app_container`.
        cmd: argv executed inside the container (passed as a list, no shell).
        service: service name within the stack (default "app").
        timeout: overall polling budget in seconds for a successful exec.

    Returns:
        The stdout of the first attempt that exits with returncode 0.

    Raises:
        RuntimeError: if no attempt succeeded within `timeout`; the message carries the last
            return code and the last stderr (or stdout when stderr is empty).

    NOTE(review): `_app_container` has its own timeout and may block or raise on its own; that
    time counts against this function's deadline — confirm against its definition.
    """
    # Monotonic clock: a wall-clock adjustment (NTP step, manual change) must not shorten or
    # extend the polling window.
    deadline = time.monotonic() + timeout
    while True:
        # Re-resolve on every attempt: the container id changes when the service is cycled.
        cid = _app_container(domain, service)
        proc = subprocess.run(["docker", "exec", cid, *cmd], capture_output=True, text=True)
        if proc.returncode == 0:
            return proc.stdout
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            last = (proc.stderr or proc.stdout).strip()
            raise RuntimeError(
                f"docker exec in {domain}/{service} failed (rc={proc.returncode}) after {timeout}s: {last}"
            )
        # Never sleep past the deadline; the final retry happens right at the budget boundary.
        time.sleep(min(3, remaining))
def http_body(domain: str, path: str = "/", timeout: int = 15) -> str: