feat(1e): HC3 additive generic + op/assertion split (orchestrator owns the op)

- orchestrator: per mutating tier, run optional pre-op seed hook (ops.py pre_<op>) → perform the op
  ONCE (harness-owned) → run generic assertion (unless opted out) AND overlay assertion, both against
  the shared post-op deployment. Op results passed op→assertion via run-scoped CCCI_OP_STATE_FILE.
- opt-out: CCCI_SKIP_GENERIC / CCCI_SKIP_GENERIC_<OP> / recipe_meta.SKIP_GENERIC (declarative).
- generic.py: split do_* into op primitives (perform_upgrade/backup/restore) + assertions
  (assert_upgraded/backup_artifact/restore_healthy) reading op_state(); deployed_identity now returns
  {version,image,chaos} (chaos label ready for HC1).
- generic test_<op>.py + all 6 recipe overlays migrated to assertion-only; pre-op seeding moved to
  per-recipe ops.py (pre_upgrade/pre_backup/pre_restore). install overlays unchanged (no op).
- deploy-count stays 1 (op primitives never call deploy_app). lint PASS; 8 unit tests PASS on cc-ci.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-28 03:12:04 +01:00
parent 6a59343996
commit b7e6cbd7be
31 changed files with 623 additions and 412 deletions

View File

@ -118,22 +118,49 @@ def assert_serving(domain: str, meta: dict) -> None:
assert "commoninternet.net" in detail.lower(), f"{domain}: served cert unexpected — {detail}"
def do_upgrade(domain: str, target: str | None, meta: dict) -> None:
"""UPGRADE op (in place on the shared deployment): abra app upgrade -> target, then assert it
reconverges + still serves AND that the deployment actually MOVED (version label and/or image
changed). The move assertion guards against a vacuous no-op upgrade silently passing — the exact
F1d-2 failure where a mis-pinned base deployed LATEST so 'upgrade to latest' changed nothing."""
before = lifecycle.deployed_identity(domain)
lifecycle.upgrade_app(domain, version=target)
# ---- Op/assertion split (Phase 1e HC3) -------------------------------------------------------
# The orchestrator performs each mutating op ONCE (the harness owns the op), records what an
# assertion needs (pre-upgrade identity, backup snapshot_id) into a run-scoped JSON state file at
# $CCCI_OP_STATE_FILE, then runs the generic assertion file (unless opted out) AND the overlay
# assertion file against the shared post-op state. The assertion functions below read that state via
# `op_state()`. They NEVER perform the op — that keeps the op single + lets generic+overlay coexist.
import json as _json # noqa: E402
def op_state() -> dict:
    """Return the run-scoped op state written between the op and its assertions.

    The state is read from the JSON file named by ``$CCCI_OP_STATE_FILE`` and
    carries e.g. ``{"upgrade": {"before": {...}}, "backup": {"snapshot_id": "..."}}``.

    Returns:
        The decoded JSON object, or ``{}`` when the variable is unset/empty, the
        file is missing or unreadable, the content is not valid JSON, or the
        top-level JSON value is not an object.
    """
    path = os.environ.get("CCCI_OP_STATE_FILE")
    if not path:
        return {}
    try:
        # A missing file raises FileNotFoundError (an OSError), so no separate
        # existence check is needed before opening.
        with open(path, encoding="utf-8") as f:
            state = _json.load(f)
    except (OSError, ValueError):
        # Best-effort read: an unreadable or malformed state file means "no state".
        return {}
    # Callers chain `.get(...)` on the result, so a non-object payload (null, list,
    # number, ...) must degrade to "no state" rather than surface as AttributeError.
    return state if isinstance(state, dict) else {}
def assert_upgraded(domain: str, meta: dict) -> None:
"""Generic UPGRADE assertion (post-op): the orchestrator already performed the upgrade once.
Assert it reconverged + still serves AND that the deployment actually MOVED — guarding against a
vacuous no-op upgrade silently passing (F1d-2). HC1: prev→PR-head may NOT bump the version label,
so a MOVE is ANY of: version-label change, image change, or a chaos label now present (a chaos
deploy stamps the PR-head commit — THE proof the code under test was deployed)."""
before = op_state().get("upgrade", {}).get("before") or {}
assert_serving(domain, meta)
after = lifecycle.deployed_identity(domain)
moved = (before[0] and after[0] and before[0] != after[0]) or (
before[1] and after[1] and before[1] != after[1]
moved = (
(before.get("version") and after.get("version") and before["version"] != after["version"])
or (before.get("image") and after.get("image") and before["image"] != after["image"])
or (after.get("chaos") and after.get("chaos") != before.get("chaos"))
)
assert moved, (
f"{domain}: upgrade did not move the deployment "
f"(version {before[0]}->{after[0]}, image {before[1]}->{after[1]}) — "
"not a real previous->target upgrade (DG2 must be non-vacuous)"
f"(version {before.get('version')}->{after.get('version')}, "
f"image {before.get('image')}->{after.get('image')}, "
f"chaos {before.get('chaos')}->{after.get('chaos')}) — "
"not a real upgrade to the code under test (HC1/DG2 must be non-vacuous)"
)
@ -148,10 +175,10 @@ def parse_snapshot_id(backup_output: str) -> str | None:
return m.group(1) if m else None
def do_backup(domain: str) -> str:
"""BACKUP op: create a backup, then assert a snapshot artifact was produced (returns its id)."""
out = lifecycle.backup_app(domain)
snap_id = parse_snapshot_id(out)
def assert_backup_artifact(domain: str) -> str:
"""Generic BACKUP assertion (post-op): the orchestrator already ran the backup once. Assert a
snapshot artifact was produced (its id recorded in op state). Returns the id."""
snap_id = op_state().get("backup", {}).get("snapshot_id")
assert snap_id, (
f"{domain}: backup produced no snapshot artifact "
"(no snapshot_id in `abra app backup create` output)"
@ -159,8 +186,29 @@ def do_backup(domain: str) -> str:
return snap_id
def do_restore(domain: str, meta: dict) -> None:
"""RESTORE op: restore the latest snapshot, then assert the app is healthy + serving again
(assert_serving polls, so the post-restore reconverge settles)."""
lifecycle.restore_app(domain)
def assert_restore_healthy(domain: str, meta: dict) -> None:
    """Generic RESTORE assertion, run after the restore op has already been performed.

    Performs no restore itself; it only delegates to `assert_serving` for `domain`
    with the recipe `meta` to check that the app is serving again.
    """
    assert_serving(domain, meta)
# ---- Op primitives (orchestrator-only; perform the op once, never assert) --------------------
def perform_upgrade(domain: str, target: str | None) -> dict[str, str | None]:
    """Run the UPGRADE op once, in place, and hand back the pre-upgrade identity.

    Captures `lifecycle.deployed_identity(domain)` first, then calls
    `lifecycle.upgrade_app(domain, version=target)`. Makes no assertions.

    Returns:
        The identity captured before the upgrade, for the orchestrator to record
        so that `assert_upgraded` can run its move check.
    """
    # Snapshot first: after the upgrade call the previous identity can no longer be read.
    identity_before_upgrade = lifecycle.deployed_identity(domain)
    lifecycle.upgrade_app(domain, version=target)
    return identity_before_upgrade
def perform_backup(domain: str) -> str | None:
    """Run the BACKUP op once and extract the snapshot id from its output.

    Returns:
        Whatever `parse_snapshot_id` extracts from the output of
        `lifecycle.backup_app(domain)` — the snapshot id, or None when none is found.
    """
    backup_output = lifecycle.backup_app(domain)
    snapshot_id = parse_snapshot_id(backup_output)
    return snapshot_id
def perform_restore(domain: str) -> None:
    """Run the RESTORE op once for `domain` via `lifecycle.restore_app`.

    Makes no assertions; checking post-restore health is left to the assertion functions.
    """
    lifecycle.restore_app(domain)

View File

@ -245,11 +245,18 @@ def wait_healthy(
raise TimeoutError(f"{domain}: not healthy over HTTPS {path} (last status {last})")
def deployed_identity(domain: str, service: str = "app") -> tuple[str | None, str | None]:
"""(coop-cloud version label, image) of the running app service. Used to prove an upgrade
actually MOVED the deployment prev→target (not a vacuous no-op — Adversary F1d-2). The version
label (`coop-cloud.<stack>.version`) is bumped per published recipe version; the image usually
bumps too. Either changing proves the upgrade did something."""
def deployed_identity(domain: str, service: str = "app") -> dict[str, str | None]:
"""Identity of the running app service: {"version", "image", "chaos"}. Used to prove an upgrade
actually MOVED the deployment (not a vacuous no-op — Adversary F1d-2), AND (Phase 1e HC1) that an
`abra app deploy --chaos` upgrade actually deployed the PR-head code under test.
- `version` = the `coop-cloud.<stack>.version` label (bumped per published recipe version).
- `image` = the running container image (usually bumps with a published version).
- `chaos` = the chaos label value (a chaos deploy stamps the recipe git commit/dirty state here)
— present after `abra app deploy --chaos`, absent on a clean pinned-tag deploy. For prev→PR-head
this is THE proof PR-head was deployed even when the version label is unbumped (HC1). The exact
chaos label key varies by abra version, so we capture any `coop-cloud.<stack>.*` label whose key
contains "chaos"."""
name = f"{_stack_name(domain)}_{service}"
proc = subprocess.run(
[
@ -265,15 +272,18 @@ def deployed_identity(domain: str, service: str = "app") -> tuple[str | None, st
)
out = proc.stdout.strip()
if "|" not in out:
return (None, None)
return {"version": None, "image": None, "chaos": None}
labels_json, _, image = out.partition("|")
ver = None
ver = chaos = None
with contextlib.suppress(ValueError, json.JSONDecodeError):
for k, v in json.loads(labels_json).items():
if k.startswith("coop-cloud.") and k.endswith(".version"):
if not k.startswith("coop-cloud."):
continue
if k.endswith(".version"):
ver = v
break
return (ver, image.strip() or None)
elif "chaos" in k:
chaos = v
return {"version": ver, "image": image.strip() or None, "chaos": chaos}
def upgrade_app(domain: str, version: str | None = None) -> None: