fix(1e): F1e-1 exec_in_app race + HC1 head_ref/move hardening

F1e-1 (Adversary): exec_in_app silently returned '' when a docker exec failed, flipping a healthy
recipe RED under opt-out (post-backup container cycle, no readiness buffer). It now polls
(re-resolving the container and re-running the exec) until rc==0 or 90s have elapsed; on timeout it
RAISES — an exec failure is never masked as empty data. No assertion was weakened. Verified:
opt-out install, backup, and restore on custom-html now PASS.

HC1: head_ref = ref or recipe_head_commit (prefer the explicit PR head sha $REF — robust, no git
race; production !testme always sets REF). When head_ref is known, assert_upgraded REQUIRES the
deployed chaos-version commit to MATCH head_ref (direct, non-vacuous proof that the PR-head code was
deployed; a stale prev-checkout chaos redeploy fails). Otherwise it falls back to the
version/image/chaos move check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-28 03:41:42 +01:00
parent 4334e19a7b
commit 6eabfdc0fb
5 changed files with 149 additions and 35 deletions

View File

@ -211,13 +211,14 @@ def _run_pre_hook(recipe: str, op: str, repo_local: str | None, domain: str, met
sys.path.remove(d)
def _perform_op(op: str, domain: str, target: str | None, op_state: dict) -> None:
def _perform_op(op: str, domain: str, recipe: str, head_ref: str | None, op_state: dict) -> None:
"""Perform the single mutating op ONCE (the harness owns the op, HC3). install has no op. Records
what the assertions need (pre-upgrade identity, backup snapshot_id) into op_state. None of these
call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place upgrade is not a new
install (HC1 reconciliation)."""
call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos upgrade is not a
new install (HC1 reconciliation)."""
if op == "upgrade":
op_state["upgrade"] = {"before": generic.perform_upgrade(domain, target)}
before = generic.perform_upgrade(domain, recipe, head_ref)
op_state["upgrade"] = {"before": before, "head_ref": head_ref}
elif op == "backup":
op_state["backup"] = {"snapshot_id": generic.perform_backup(domain)}
elif op == "restore":
@ -231,12 +232,13 @@ def run_lifecycle_tier(
repo_local: str | None,
domain: str,
meta: dict,
target: str | None,
head_ref: str | None,
op_state: dict,
) -> str:
"""Additive lifecycle tier (HC3): seed (pre-op hook) → perform the op ONCE → run the generic
assertion file (unless opted out) AND the overlay assertion file, both against the shared post-op
deployment. Returns 'pass' | 'fail' | 'skip'."""
deployment. The upgrade op redeploys the PR head (head_ref) via chaos (HC1). Returns
'pass' | 'fail' | 'skip'."""
overlay = discovery.resolve_overlay_op(recipe, op, repo_local)
skip_gen = _skip_generic(op, meta)
files: list[tuple[str, str]] = []
@ -257,7 +259,7 @@ def run_lifecycle_tier(
# 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure → tier fail.
try:
_run_pre_hook(recipe, op, repo_local, domain, meta)
_perform_op(op, domain, target, op_state)
_perform_op(op, domain, recipe, head_ref, op_state)
with open(os.environ["CCCI_OP_STATE_FILE"], "w") as f:
json.dump(op_state, f)
except Exception as e: # noqa: BLE001 — a failed op is a reported tier failure, not a crash
@ -312,6 +314,10 @@ def main() -> int:
f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}"
)
fetch_recipe(recipe, ref, src)
# The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test
# (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout
# HEAD (the catalogue current) for a non-PR `!testme`. Captured before any version-tag checkout.
head_ref = ref or lifecycle.recipe_head_commit(recipe)
repo_local = snapshot_recipe_tests(recipe)
meta = _load_meta(recipe)
domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref)
@ -361,7 +367,7 @@ def main() -> int:
# ---- INSTALL tier (always; additive generic + overlay, no op) ----
if "install" in stages:
results["install"] = (
run_lifecycle_tier(recipe, "install", repo_local, domain, meta, target, op_state)
run_lifecycle_tier(recipe, "install", repo_local, domain, meta, head_ref, op_state)
if deploy_ok
else "fail"
)
@ -379,7 +385,9 @@ def main() -> int:
# ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ----
if "backup" in stages:
results["backup"] = (
run_lifecycle_tier(recipe, "backup", repo_local, domain, meta, target, op_state)
run_lifecycle_tier(
recipe, "backup", repo_local, domain, meta, head_ref, op_state
)
if backup_cap
else "skip"
)