diff --git a/machine-docs/JOURNAL-1e.md b/machine-docs/JOURNAL-1e.md index 71f23d2..9452093 100644 --- a/machine-docs/JOURNAL-1e.md +++ b/machine-docs/JOURNAL-1e.md @@ -76,3 +76,28 @@ Next: confirm opt-out result, claim E1/HC3 gate, then E2 (HC1 chaos-to-PR-head). leftover custom-html stack). Log: /root/ccci-1e-optout.log. - HC3 proven both ways: default = generic+overlay additive on one deployment (op once); opt-out = generic floor skipped, overlay still runs. Gate E1/HC3 CLAIMED for Adversary. + +## 2026-05-28 — Adversary F1e-1 (HC3 opt-out race) + HC1 hardening +- **F1e-1 (E1/HC3 FAIL withheld):** under `CCCI_SKIP_GENERIC=1`, `test_backup_captures_state` flaked + `'' == 'original'`. Root cause (valid): `lifecycle.exec_in_app` returned `proc.stdout` WITHOUT + checking returncode — when backup-bot cycles the app container, `docker exec` fails and the empty + stdout was silently returned as data; the generic pytest spawn (~1s) had been an accidental timing + buffer that opt-out removes. **Fix (no assertion weakened):** `exec_in_app` now polls — re-resolves + the container + re-execs until returncode==0 or a 90s timeout, then RAISES. A container-cycle race + now waits-and-succeeds; a genuine exec failure is loud, never masquerades as empty data. This makes + the backup/restore overlays robust to the post-op cycle independent of the generic timing buffer, so + opt-out is behavior-neutral. +- **HC1 hardening (my own findings from E2 e2e):** + - `head_ref` capture was racy (returned None under a concurrent run wiping the shared recipe dir), + and a chaos-redeploy of the SAME prev checkout falsely "moved" via the chaos label alone. Fixes: + `head_ref = ref or recipe_head_commit(recipe)` (prefer the explicit PR head sha $REF — robust, no + git race; production `!testme` always sets REF); store head_ref in op_state. + - `assert_upgraded` now, when head_ref is known, REQUIRES the deployed `chaos-version` commit to + MATCH head_ref — direct proof the PR-head code under test was deployed, and non-vacuous (a stale + prev-checkout chaos redeploy stamps prev's commit ≠ head_ref → FAIL). Falls back to the + version/image/chaos move check only when head_ref is unknown. +- **Coordination note:** my E2 manual custom-html e2e ran concurrently with the Adversary's E1 + cold-verify — both share `/root/.abra/recipes/custom-html` + (at PR=0) the same run domain, so they + collided (explains my non-deterministic 1.10→1.11 vs 1.10→1.10 and the None head_ref). Manual ad-hoc + runs bypass Drone's capacity=1 queue. Going forward I serialize: don't run a recipe manually while a + gate is under Adversary verification; verify when `pgrep run_recipe_ci` is clear. diff --git a/runner/harness/abra.py b/runner/harness/abra.py index 094b8bf..765bf48 100644 --- a/runner/harness/abra.py +++ b/runner/harness/abra.py @@ -152,6 +152,18 @@ def restore(domain: str, timeout: int = 900) -> None: _run_pty(["app", "restore", domain, "-n", "-C", "-o"], timeout=timeout) +def recipe_head_commit(recipe: str) -> str | None: + """The current HEAD commit of the recipe checkout — captured right after fetch (the PR head, or + the catalogue current) so the upgrade tier can re-checkout it for the chaos redeploy after the + prev-tag base deploy reset the working tree (HC1).""" + import os + + path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + proc = subprocess.run(["git", "-C", path, "rev-parse", "HEAD"], capture_output=True, text=True) + out = proc.stdout.strip() + return out or None + + def recipe_versions(recipe: str) -> list[str]: """Published versions of a recipe, oldest→newest (from the recipe git tags).""" import os diff --git a/runner/harness/generic.py b/runner/harness/generic.py index 94a161f..4c0329c 100644 --- a/runner/harness/generic.py +++ b/runner/harness/generic.py @@ -142,24 +142,43 @@ def op_state() -> dict: def assert_upgraded(domain: str, meta: dict) -> None: - """Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once. - Assert it reconverged + still serves AND that the deployment actually MOVED — guarding against a - vacuous no-op upgrade silently passing (F1d-2). HC1: prev→PR-head may NOT bump the version label, - so a MOVE is ANY of: version-label change, image change, or a chaos label now present (a chaos - deploy stamps the PR-head commit — THE proof the code under test was deployed).""" - before = op_state().get("upgrade", {}).get("before") or {} + """Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once via + `abra app deploy --chaos` of the PR-head checkout. Assert it reconverged + still serves AND that + the deployment is genuinely the PR-head code under test (HC1) — non-vacuously (guarding F1d-2). + + The chaos deploy stamps `coop-cloud..chaos-version` = the deployed recipe commit. When the + intended PR-head commit is known (head_ref), require the deployed chaos commit to MATCH it — THE + proof the code under test was deployed, and non-vacuous: a stale prev-checkout chaos redeploy would + stamp prev's commit, not head_ref, and fail here. When head_ref is unknown, fall back to requiring + a move vs the pre-upgrade state (version/image/chaos changed).""" + st = op_state().get("upgrade", {}) + before = st.get("before") or {} + head_ref = st.get("head_ref") assert_serving(domain, meta) after = lifecycle.deployed_identity(domain) + chaos = after.get("chaos") + if head_ref: + assert chaos, ( + f"{domain}: upgrade left no chaos label — `abra app deploy --chaos` did not deploy the " + "PR-head checkout (the code under test was not exercised by the upgrade)" + ) + # chaos-version is an abbreviated commit (e.g. '8a026066'); head_ref may be full or short. + assert head_ref.startswith(chaos) or chaos.startswith(head_ref), ( + f"{domain}: upgrade deployed chaos commit {chaos!r}, not the intended PR-head " + f"{head_ref[:12]!r} — the re-checkout to the code under test failed, so the upgrade is " + "not exercising the PR's changes (HC1)" + ) + return moved = ( (before.get("version") and after.get("version") and before["version"] != after["version"]) or (before.get("image") and after.get("image") and before["image"] != after["image"]) - or (after.get("chaos") and after.get("chaos") != before.get("chaos")) + or (chaos and chaos != before.get("chaos")) ) assert moved, ( f"{domain}: upgrade did not move the deployment " f"(version {before.get('version')}->{after.get('version')}, " f"image {before.get('image')}->{after.get('image')}, " - f"chaos {before.get('chaos')}->{after.get('chaos')}) — " + f"chaos {before.get('chaos')}->{chaos}) — " "not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)" ) @@ -195,12 +214,25 @@ def assert_restore_healthy(domain: str, meta: dict) -> None: # ---- Op primitives (orchestrator-only; perform the op once, never assert) -------------------- -def perform_upgrade(domain: str, target: str | None) -> dict[str, str | None]: - """Perform the UPGRADE op once (in place). E1 baseline: `abra app upgrade` -> target. (HC1/E2 - redefines this as a chaos redeploy of the PR-head checkout.) Returns the pre-upgrade identity so - the orchestrator can record it for `assert_upgraded`'s move check.""" +def perform_upgrade(domain: str, recipe: str, head_ref: str | None) -> dict[str, str | None]: + """Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1): re-checkout the + PR head (the prev-tag base deploy reset the recipe working tree), then `abra app deploy --chaos` + to redeploy the running app at that checkout. This is the real upgrade the PR's changes are + exercised by (vs the old 'upgrade to newest published tag', which never deployed PR-head code). + Returns the pre-upgrade identity so the orchestrator records it for `assert_upgraded`'s move check + — after the chaos deploy the `chaos`(-version) label carries the PR-head commit, proving it.""" before = lifecycle.deployed_identity(domain) - lifecycle.upgrade_app(domain, version=target) + if head_ref: + lifecycle.recipe_checkout_ref(recipe, head_ref) + lifecycle.chaos_redeploy(domain) + after = lifecycle.deployed_identity(domain) + # Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the + # PR-head we checked out — proving the upgrade deployed the code under test, not a published tag. + print( + f" upgrade→PR-head: head_ref={(head_ref or '')[:8] or None} " + f"chaos-version={after.get('chaos')} version={before.get('version')}→{after.get('version')}", + flush=True, + ) return before diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index ae54367..9ab0fa0 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -252,11 +252,11 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None - `version` = the `coop-cloud..version` label (bumped per published recipe version). - `image` = the running container image (usually bumps with a published version). - - `chaos` = the chaos label value (a chaos deploy stamps the recipe git commit/dirty state here) - — present after `abra app deploy --chaos`, absent on a clean pinned-tag deploy. For prev→PR-head - this is THE proof PR-head was deployed even when the version label is unbumped (HC1). The exact - chaos label key varies by abra version, so we capture any `coop-cloud..*` label whose key - contains "chaos".""" + - `chaos` = the chaos deploy's recipe git commit. abra stamps `coop-cloud..chaos-version` + = the deployed recipe commit (e.g. "91b27ceb") + `coop-cloud..chaos`="true" on a + `--chaos` deploy; both are absent on a clean pinned-tag deploy. We prefer the `.chaos-version` + commit — for prev→PR-head it IS the proof the PR-head code under test was deployed even when the + version label is unbumped (HC1); fall back to the `.chaos` flag if no commit is present.""" name = f"{_stack_name(domain)}_{service}" proc = subprocess.run( [ @@ -274,22 +274,43 @@ def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None if "|" not in out: return {"version": None, "image": None, "chaos": None} labels_json, _, image = out.partition("|") - ver = chaos = None + ver = chaos = chaos_flag = None with contextlib.suppress(ValueError, json.JSONDecodeError): for k, v in json.loads(labels_json).items(): if not k.startswith("coop-cloud."): continue if k.endswith(".version"): ver = v - elif "chaos" in k: - chaos = v - return {"version": ver, "image": image.strip() or None, "chaos": chaos} + elif k.endswith(".chaos-version"): + chaos = v # the deployed recipe commit — the strongest signal + elif k.endswith(".chaos"): + chaos_flag = v + return {"version": ver, "image": image.strip() or None, "chaos": chaos or chaos_flag} def upgrade_app(domain: str, version: str | None = None) -> None: abra.upgrade(domain, version=version) +def recipe_head_commit(recipe: str) -> str | None: + """The recipe checkout's current HEAD commit (captured right after fetch, before any version-tag + checkout) so the upgrade tier can re-checkout the PR head for the chaos redeploy (HC1).""" + return abra.recipe_head_commit(recipe) + + +def recipe_checkout_ref(recipe: str, ref: str) -> None: + """git-checkout the recipe to an arbitrary ref/commit (HC1: restore the PR-head checkout before + the chaos upgrade — the prev-tag base deploy reset it to the published tag).""" + abra.recipe_checkout(recipe, ref) + + +def chaos_redeploy(domain: str) -> None: + """In-place `abra app deploy --chaos`: redeploy the running app at the CURRENT recipe checkout + (HC1: the PR-head code under test). This is the upgrade op, not a fresh install — it does NOT go + through deploy_app, so the deploy-count guard (DG4.1) is not incremented.""" + abra.deploy(domain, chaos=True) + + def backup_app(domain: str) -> str: """Create a backup; return the abra/restic output (carries the produced snapshot_id).""" return abra.backup_create(domain) @@ -326,10 +347,26 @@ def _app_container(domain: str, service: str = "app", timeout: int = 60) -> str: time.sleep(3) -def exec_in_app(domain: str, cmd: list[str], service: str = "app") -> str: - cid = _app_container(domain, service) - proc = subprocess.run(["docker", "exec", cid, *cmd], capture_output=True, text=True) - return proc.stdout +def exec_in_app(domain: str, cmd: list[str], service: str = "app", timeout: int = 90) -> str: + """Run `docker exec` in the app's container and return stdout. Hardened (Adversary F1e-1): a + lifecycle op (backup/restore) cycles the container, so a freshly-resolved container can be + mid-transition and `docker exec` FAILS — poll (re-resolving the container each try) until the exec + succeeds (returncode 0) or timeout, then RAISE. Never silently return '' on a failed exec: that + masked a container-cycle race as empty data, flipping a healthy recipe RED under opt-out (no + accidental generic-pytest timing buffer) — and could mask a real failure as a pass elsewhere.""" + deadline = time.time() + timeout + last = "" + while True: + cid = _app_container(domain, service) + proc = subprocess.run(["docker", "exec", cid, *cmd], capture_output=True, text=True) + if proc.returncode == 0: + return proc.stdout + last = (proc.stderr or proc.stdout).strip() + if time.time() >= deadline: + raise RuntimeError( + f"docker exec in {domain}/{service} failed (rc={proc.returncode}) after {timeout}s: {last}" + ) + time.sleep(3) def http_body(domain: str, path: str = "/", timeout: int = 15) -> str: diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index 9d87584..89d13af 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -211,13 +211,14 @@ def _run_pre_hook(recipe: str, op: str, repo_local: str | None, domain: str, met sys.path.remove(d) -def _perform_op(op: str, domain: str, target: str | None, op_state: dict) -> None: +def _perform_op(op: str, domain: str, recipe: str, head_ref: str | None, op_state: dict) -> None: """Perform the single mutating op ONCE (the harness owns the op, HC3). install has no op. Records what the assertions need (pre-upgrade identity, backup snapshot_id) into op_state. None of these - call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place upgrade is not a new - install (HC1 reconciliation).""" + call deploy_app, so the deploy-count guard (DG4.1) stays 1 — the in-place chaos upgrade is not a + new install (HC1 reconciliation).""" if op == "upgrade": - op_state["upgrade"] = {"before": generic.perform_upgrade(domain, target)} + before = generic.perform_upgrade(domain, recipe, head_ref) + op_state["upgrade"] = {"before": before, "head_ref": head_ref} elif op == "backup": op_state["backup"] = {"snapshot_id": generic.perform_backup(domain)} elif op == "restore": @@ -231,12 +232,13 @@ def run_lifecycle_tier( repo_local: str | None, domain: str, meta: dict, - target: str | None, + head_ref: str | None, op_state: dict, ) -> str: """Additive lifecycle tier (HC3): seed (pre-op hook) → perform the op ONCE → run the generic assertion file (unless opted out) AND the overlay assertion file, both against the shared post-op - deployment. Returns 'pass' | 'fail' | 'skip'.""" + deployment. The upgrade op redeploys the PR head (head_ref) via chaos (HC1). Returns + 'pass' | 'fail' | 'skip'.""" overlay = discovery.resolve_overlay_op(recipe, op, repo_local) skip_gen = _skip_generic(op, meta) files: list[tuple[str, str]] = [] @@ -257,7 +259,7 @@ def run_lifecycle_tier( # 1) pre-op seed hook + 2) the op ONCE (harness-owned). A failure here is an op failure → tier fail. try: _run_pre_hook(recipe, op, repo_local, domain, meta) - _perform_op(op, domain, target, op_state) + _perform_op(op, domain, recipe, head_ref, op_state) with open(os.environ["CCCI_OP_STATE_FILE"], "w") as f: json.dump(op_state, f) except Exception as e: # noqa: BLE001 — a failed op is a reported tier failure, not a crash @@ -312,6 +314,10 @@ def main() -> int: f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}" ) fetch_recipe(recipe, ref, src) + # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test + # (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout + # HEAD (the catalogue current) for a non-PR `!testme`. Captured before any version-tag checkout. + head_ref = ref or lifecycle.recipe_head_commit(recipe) repo_local = snapshot_recipe_tests(recipe) meta = _load_meta(recipe) domain = naming.app_domain(recipe, os.environ.get("PR", "0"), ref) @@ -361,7 +367,7 @@ def main() -> int: # ---- INSTALL tier (always; additive generic + overlay, no op) ---- if "install" in stages: results["install"] = ( - run_lifecycle_tier(recipe, "install", repo_local, domain, meta, target, op_state) + run_lifecycle_tier(recipe, "install", repo_local, domain, meta, head_ref, op_state) if deploy_ok else "fail" ) @@ -379,7 +385,9 @@ def main() -> int: # ---- BACKUP + RESTORE tiers (backup-capable only; else clean N/A) ---- if "backup" in stages: results["backup"] = ( - run_lifecycle_tier(recipe, "backup", repo_local, domain, meta, target, op_state) + run_lifecycle_tier( + recipe, "backup", repo_local, domain, meta, head_ref, op_state + ) if backup_cap else "skip" )