fix(1e): F1e-1 exec_in_app race + HC1 head_ref/move hardening

F1e-1 (Adversary): exec_in_app silently returned '' on a failed docker exec, flipping a healthy
recipe RED under opt-out (post-backup container cycle, no readiness buffer). Now polls (re-resolve
container + re-exec) until rc==0 or 90s, then RAISES — never masks an exec failure as empty data.
No assertion weakened. Verified: opt-out install, backup, restore on custom-html now PASS.

HC1: head_ref = ref or recipe_head_commit (prefer explicit PR head sha $REF — robust, no git race;
production !testme always sets REF). assert_upgraded, when head_ref known, REQUIRES the deployed
chaos-version commit to MATCH head_ref (direct + non-vacuous proof the PR-head code was deployed; a
stale prev-checkout chaos redeploy fails). Falls back to version/image/chaos move check otherwise.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-28 03:41:42 +01:00
parent 4334e19a7b
commit 6eabfdc0fb
5 changed files with 149 additions and 35 deletions

View File

@ -142,24 +142,43 @@ def op_state() -> dict:
def assert_upgraded(domain: str, meta: dict) -> None:
    """Generic UPGRADE assertion (post-op): prove the upgrade deployed the PR-head code (HC1).

    The orchestrator already performed the upgrade once via `abra app deploy --chaos` of the
    PR-head checkout. Assert it reconverged + still serves AND that the deployment is genuinely
    the code under test — non-vacuously (guarding F1d-2).

    The chaos deploy stamps `coop-cloud.<stack>.chaos-version` = the deployed recipe commit. When
    the intended PR-head commit is known (head_ref), the deployed chaos commit must MATCH it: a
    stale prev-checkout chaos redeploy would stamp prev's commit, not head_ref, and fail here.
    When head_ref is unknown, fall back to requiring a move vs the pre-upgrade state
    (version/image/chaos changed).

    Args:
        domain: App domain whose deployment is inspected.
        meta: Recipe metadata, passed through unchanged to `assert_serving`.

    Raises:
        AssertionError: If the chaos label is missing or does not match head_ref, or (fallback)
            if nothing moved relative to the recorded pre-upgrade identity.
    """
    # `or {}` also covers an "upgrade" entry that is present but None.
    st = op_state().get("upgrade") or {}
    before = st.get("before") or {}
    head_ref = st.get("head_ref")

    assert_serving(domain, meta)
    after = lifecycle.deployed_identity(domain)
    chaos = after.get("chaos")

    if head_ref:
        assert chaos, (
            f"{domain}: upgrade left no chaos label — `abra app deploy --chaos` did not deploy the "
            "PR-head checkout (the code under test was not exercised by the upgrade)"
        )
        # chaos-version is an abbreviated commit (e.g. '8a026066'); head_ref may be full or short,
        # so accept a prefix match in either direction.
        assert head_ref.startswith(chaos) or chaos.startswith(head_ref), (
            f"{domain}: upgrade deployed chaos commit {chaos!r}, not the intended PR-head "
            f"{head_ref[:12]!r} — the re-checkout to the code under test failed, so the upgrade is "
            "not exercising the PR's changes (HC1)"
        )
        return

    # Fallback (head_ref unknown): a MOVE is ANY of version-label change, image change, or a chaos
    # label that is now present and differs from the pre-upgrade one.
    moved = (
        (before.get("version") and after.get("version") and before["version"] != after["version"])
        or (before.get("image") and after.get("image") and before["image"] != after["image"])
        or (chaos and chaos != before.get("chaos"))
    )
    assert moved, (
        f"{domain}: upgrade did not move the deployment "
        f"(version {before.get('version')}->{after.get('version')}, "
        f"image {before.get('image')}->{after.get('image')}, "
        f"chaos {before.get('chaos')}->{chaos}) — "
        "not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)"
    )
@ -195,12 +214,25 @@ def assert_restore_healthy(domain: str, meta: dict) -> None:
# ---- Op primitives (orchestrator-only; perform the op once, never assert) --------------------
def perform_upgrade(domain: str, recipe: str, head_ref: str | None) -> dict[str, str | None]:
    """Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1).

    Re-checkout the PR head (the prev-tag base deploy reset the recipe working tree), then
    `abra app deploy --chaos` to redeploy the running app at that checkout. This is the real
    upgrade the PR's changes are exercised by (vs the old 'upgrade to newest published tag',
    which never deployed PR-head code). Orchestrator-only: performs the op, never asserts.

    Args:
        domain: App domain to redeploy.
        recipe: Recipe whose working tree is re-checked-out when head_ref is given.
        head_ref: Intended PR-head commit; when falsy the re-checkout is skipped and the
            current working tree is chaos-deployed as-is.

    Returns:
        The pre-upgrade identity, for the orchestrator to record for `assert_upgraded`'s move
        check — after the chaos deploy the `chaos`(-version) label carries the PR-head commit.
    """
    before = lifecycle.deployed_identity(domain)
    if head_ref:
        lifecycle.recipe_checkout_ref(recipe, head_ref)
    lifecycle.chaos_redeploy(domain)
    after = lifecycle.deployed_identity(domain)
    # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
    # PR-head we checked out — proving the upgrade deployed the code under test, not a published
    # tag. The before/after versions are separated by '->' so the two values stay readable.
    print(
        f" upgrade→PR-head: head_ref={(head_ref or '')[:8] or None} "
        f"chaos-version={after.get('chaos')} "
        f"version={before.get('version')}->{after.get('version')}",
        flush=True,
    )
    return before