diff --git a/tests/custom-html-bkp-bad/ops.py b/tests/custom-html-bkp-bad/ops.py
new file mode 100644
index 0000000..f6db098
--- /dev/null
+++ b/tests/custom-html-bkp-bad/ops.py
@@ -0,0 +1,19 @@
+"""custom-html-bkp-bad — lifecycle ops for bad-backup/bad-restore RED canaries.
+
+Intentionally has NO pre_backup hook: the marker is never seeded before backup,
+so the backup snapshot has no ci-marker.txt. pre_restore writes "mutated", so even when
+restore DOES bring back the snapshot the marker is missing or still "mutated" → test fails.
+"""
+
+from __future__ import annotations
+
+from harness import lifecycle
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def pre_restore(domain: str, meta: dict) -> None:
+    """Overwrite the marker with 'mutated' before restore; the snapshot never held a marker, so it
+    ends up MISSING or 'mutated' afterwards → test_restore_returns_state FAILS → restore=RED."""
+    overwrite_marker = ["sh", "-c", f"echo mutated > {MARKER_PATH}"]
+    lifecycle.exec_in_app(domain, overwrite_marker)
diff --git a/tests/custom-html-bkp-bad/recipe_meta.py b/tests/custom-html-bkp-bad/recipe_meta.py
new file mode 100644
index 0000000..cff4007
--- /dev/null
+++ b/tests/custom-html-bkp-bad/recipe_meta.py
@@ -0,0 +1,5 @@
+# custom-html-bkp-bad — regression fixture for bad-backup canary.
+# This recipe is custom-html WITHOUT backupbot labels. Setting BACKUP_CAPABLE=True here forces the
+# harness to run the backup tier; the recipe itself has no backupbot service, so
+# `abra app backup create` produces no snapshot → test_backup_artifact fails → backup tier RED.
+BACKUP_CAPABLE = True
diff --git a/tests/custom-html-bkp-bad/test_backup.py b/tests/custom-html-bkp-bad/test_backup.py
new file mode 100644
index 0000000..373fae9
--- /dev/null
+++ b/tests/custom-html-bkp-bad/test_backup.py
@@ -0,0 +1,28 @@
+"""custom-html-bkp-bad — BACKUP assertion (bad-backup RED canary).
+
+This recipe has no ops.py::pre_backup, so ci-marker.txt is NEVER seeded before the backup.
+Asserting its presence here causes backup tier RED — proving the server catches a recipe that
+claims backup support but doesn't actually back up the expected data.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle # noqa: E402
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def test_backup_captures_state(live_app):
+    """Assert the pre-backup marker is present and equals 'original'.
+
+    custom-html-bkp-bad has no ops.py::pre_backup, so the marker does NOT exist at backup time:
+    the probe's `|| echo MISSING` fallback fires → assertion fails → backup tier RED (intended)."""
+    read_marker = ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
+    marker = lifecycle.exec_in_app(live_app, read_marker).strip()
+    assert marker == "original", (
+        f"backup did not capture the expected marker at {MARKER_PATH}: got {marker!r}. "
+        "Expected 'original' (seeded by pre_backup). If the marker is 'MISSING', the pre_backup "
+        "hook was not run — this is the intended failure for the bad-backup RED canary."
+    )
diff --git a/tests/custom-html-bkp-bad/test_restore.py b/tests/custom-html-bkp-bad/test_restore.py
new file mode 100644
index 0000000..ce1b924
--- /dev/null
+++ b/tests/custom-html-bkp-bad/test_restore.py
@@ -0,0 +1,25 @@
+"""custom-html-bkp-bad — RESTORE assertion (bad-restore RED canary).
+
+pre_restore seeds 'mutated' to ci-marker.txt. The backup snapshot has no ci-marker.txt
+(never seeded by pre_backup). After restore, the marker is either MISSING or 'mutated' —
+never 'original' — so this assertion FAILS → restore tier RED.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle # noqa: E402
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def test_restore_returns_state(live_app):
+    """Read the marker after restore (the probe prints MISSING if cat fails); require 'original'."""
+    read_marker = ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
+    marker = lifecycle.exec_in_app(live_app, read_marker).strip()
+    assert marker == "original", (
+        f"restore did not return the pre-mutation (backed-up) state: got {marker!r}. "
+        "Expected 'original'. The backup had no marker (not seeded by pre_backup), so "
+        "restore cannot recover it — this is the intended failure for the bad-restore RED canary."
+    )
diff --git a/tests/custom-html-rst-bad/ops.py b/tests/custom-html-rst-bad/ops.py
new file mode 100644
index 0000000..3f3b920
--- /dev/null
+++ b/tests/custom-html-rst-bad/ops.py
@@ -0,0 +1,15 @@
+"""custom-html-rst-bad — lifecycle ops for bad-restore RED canary.
+
+NO pre_backup hook: marker never seeded before backup → snapshot has no ci-marker.txt.
+pre_restore writes "mutated". After restore, marker stays "mutated" (not in snapshot) → FAIL.
+"""
+
+from __future__ import annotations
+
+from harness import lifecycle
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def pre_restore(domain: str, meta: dict) -> None:
+ lifecycle.exec_in_app(domain, ["sh", "-c", f"echo mutated > {MARKER_PATH}"])  # clobber the marker ahead of restore
diff --git a/tests/custom-html-rst-bad/recipe_meta.py b/tests/custom-html-rst-bad/recipe_meta.py
new file mode 100644
index 0000000..7e5bda7
--- /dev/null
+++ b/tests/custom-html-rst-bad/recipe_meta.py
@@ -0,0 +1,3 @@
+# custom-html-rst-bad — regression fixture for bad-restore canary.
+# BACKUP_CAPABLE=True forces the backup tier to run even though the recipe has no backupbot label.
+BACKUP_CAPABLE = True
diff --git a/tests/custom-html-rst-bad/test_restore.py b/tests/custom-html-rst-bad/test_restore.py
new file mode 100644
index 0000000..a361d65
--- /dev/null
+++ b/tests/custom-html-rst-bad/test_restore.py
@@ -0,0 +1,23 @@
+"""custom-html-rst-bad — RESTORE assertion (bad-restore RED canary).
+
+No pre_backup → backup snapshot has no ci-marker.txt. pre_restore writes "mutated".
+After restore: marker is "mutated" (restore can't recover "original" — wasn't backed up) → FAIL.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle # noqa: E402
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def test_restore_returns_state(live_app):
+    """Read the marker after restore (the probe prints MISSING if cat fails); require 'original'."""
+    read_marker = ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
+    marker = lifecycle.exec_in_app(live_app, read_marker).strip()
+    assert marker == "original", (
+        f"restore did not return the pre-mutation (backed-up) state: got {marker!r}. "
+        "Expected 'original'. The backup had no marker, so restore cannot recover it."
+    )
diff --git a/tests/regression/README.md b/tests/regression/README.md
new file mode 100644
index 0000000..0b02acd
--- /dev/null
+++ b/tests/regression/README.md
@@ -0,0 +1,136 @@
+# Regression canaries — E2E self-tests for the cc-ci server
+
+A standing pytest suite that drives the **real** cc-ci lifecycle harness against pinned canary
+recipes and verifies both halves of the server's job:
+
+1. **Good canaries** — healthy apps are reported GREEN (install + upgrade + backup/restore pass).
+2. **Bad canary** — broken apps are caught RED; a false-green makes the regression test itself fail.
+
+These tests run the full cold lifecycle on the live cc-ci server. They are **slow** (minutes per
+canary) and **opt-in** — kept out of the per-commit fast path by the `canary` marker.
+
+---
+
+## How to run
+
+Run on the cc-ci server (abra + Docker + Swarm required):
+
+```bash
+ssh cc-ci
+cd /root/cc-ci # or wherever the repo is checked out
+cc-ci-run python -m pytest tests/regression/ -m canary -v
+```
+
+Or a single canary:
+
+```bash
+cc-ci-run python -m pytest tests/regression/ -m canary -k good-simple -v
+```
+
+From the orchestrator:
+
+```bash
+ssh cc-ci "cd /root/cc-ci && cc-ci-run python -m pytest tests/regression/ -m canary -v"
+```
+
+---
+
+## Canaries
+
+| ID | Recipe | Purpose | Expected verdict |
+|----|--------|---------|-----------------|
+| `good-simple` | `custom-html-tiny` | Minimal static server — fast signal | GREEN |
+| `good-significant` | `lasuite-docs` | Multi-service (backend + Postgres + Collabora + OIDC) | GREEN |
+| `bad-false-green` | `custom-html` @ `v5-stale-docroot` | App is UP but serves wrong Content-Type — catches false-green | RED |
+
+### Why the bad canary exists
+
+The scariest regression is a **false-green**: the server reports PASS while the app is broken.
+We already saw a fabricated full-PASS during the build. The `bad-false-green` canary pins a known-
+broken fixture (`v5-stale-docroot`: nginx serves `.txt` as `application/octet-stream`). The
+harness's `test_content_type_html_and_txt` catches this and returns RED (build #75 was RED for
+exactly this fixture).
+
+The regression test asserts `rc != 0`. If the harness ever wrongly returns green for this fixture,
+that assert fires — false-green is caught before any merge.
+
+---
+
+## What each canary verifies
+
+### Per-tier semantic assertions (the "teeth")
+
+The tests assert MORE than the harness exit code: they check that **specific named assertions**
+ran and got the expected result. This guards against a different failure mode — a tier that
+nominally "passes" because the assertion was silently removed or made vacuous.
+
+| Stage | Test name | What it proves |
+|-------|-----------|---------------|
+| install | `test_serving` | Generic HTTP readiness check actually ran |
+| install | `test_serving_and_frontend` | Lasuite-docs frontend (SPA shell) actually loaded |
+| custom | `test_content_type` | Content-type assertion actually ran (bad canary only) |
+
+If a tier assertion is removed: the named test disappears from `results.json` → the semantic
+check fires → the regression suite catches the removal.
+
+### Additional structural assertions (good canaries)
+
+- `install` tier: "pass" (not fail, not skip)
+- No tier is "fail" (skips acceptable for recipes without backup/custom tests)
+- `flags.clean_teardown = True` (no leftover containers/volumes/secrets)
+- `flags.no_secret_leak = True` (no secret value in the results artifact)
+
+---
+
+## Cadence policy
+
+**Do NOT run on every commit or PR.** These are slow and resource-heavy. Run them:
+
+- Before a **release** of the cc-ci server (after a batch of server changes).
+- As a **polishing pass** or pre-merge check for significant server refactors.
+- On-demand when you suspect a regression: `pytest -m canary` (or `-m canary_fast` for only the per-tier RED canaries).
+
+They are NOT wired to the per-commit Drone pipeline. If adding a `!testme`-style trigger for the
+cc-ci repo, gate it behind a deliberate label (e.g. `run-canaries`) — not an automatic run on
+every push.
+
+---
+
+## How to add a canary
+
+1. Identify a recipe that is already deployable and has pinned version tags.
+2. Decide the expected verdict (GREEN or RED) and which tier assertions have teeth.
+3. Add an entry to `CANARIES` in `test_canaries.py`:
+
+```python
+{
+ "id": "good-myrecipe",
+ "recipe": "my-recipe",
+ "src": "recipe-maintainers/my-recipe",
+ "ref": "<full 40-char commit SHA>", # pin to a specific commit for stability
+ "expected_green": True,
+ "stage_pass_checks": [
+ ("install", "test_serving"), # verify this named test ran and passed
+ ],
+ "stage_fail_checks": [],
+}
+```
+
+4. Run the canary once to confirm it passes:
+ `cc-ci-run python -m pytest tests/regression/ -m canary -k good-myrecipe -v`
+
+5. Update the pin comment with the date and the recipe version it was pinned at.
+
+---
+
+## Pin maintenance
+
+Canary refs are pinned to specific SHAs for stability. When a recipe publishes a new release:
+
+1. Update the `"ref"` SHA in the canary definition (use the new main-branch HEAD).
+2. Update the pin comment with the new date/version.
+3. Re-run the canary to confirm GREEN before committing the pin update.
+
+The bad canary (`v5-stale-docroot`) is a stable fixture branch — update only if the branch is
+deleted. If deleted, recreate the pattern: an app that is up + passes lifecycle tiers but fails
+one functional assertion.
diff --git a/tests/regression/conftest.py b/tests/regression/conftest.py
new file mode 100644
index 0000000..519d3ee
--- /dev/null
+++ b/tests/regression/conftest.py
@@ -0,0 +1,106 @@
+"""Shared fixtures and helpers for E2E canary regression tests.
+
+The regression tests call the real cc-ci harness (run_recipe_ci.py) as a subprocess and assert on
+its outputs (exit code, results.json). They run ON the cc-ci server, not the orchestrator — abra,
+Docker, and Swarm must be present.
+
+Invoke: cc-ci-run python -m pytest tests/regression/ -m canary -v
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+import time
+
+ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def pytest_configure(config):
+    """Register the custom markers used by the regression suite."""
+    marker_lines = (
+        "canary: slow E2E canary test — drives the full cold CI lifecycle; run on-demand only.",
+        "canary_fast: fast per-tier RED canary (still tagged canary); subset for quick pre-merge checks.",
+    )
+    for marker_line in marker_lines:
+        # Same registration order as before: canary first, then canary_fast.
+        config.addinivalue_line("markers", marker_line)
+
+
+def run_recipe_ci(
+ recipe: str,
+ src: str,
+ ref: str,
+ pr: str = "0",
+ stages: str = "install,upgrade,backup,restore,custom",
+ runs_dir: str | None = None,
+ run_id_prefix: str = "regression",
+ timeout: int = 3600,
+) -> tuple[int, dict | None, str]:
+ """Run runner/run_recipe_ci.py as a subprocess, passing the canary params via environment vars.
+
+ Returns (rc, results_dict_or_None, run_artifact_dir); results is None when results.json is
+ absent. Output is not captured, so it streams live. subprocess.TimeoutExpired propagates.
+ """
+ ts = int(time.time())
+ run_id = f"{run_id_prefix}-{recipe}-{ref[:12]}-{ts}"  # also the artifact sub-directory name below
+ if runs_dir is None:
+ runs_dir = "/var/lib/cc-ci-runs"
+
+ env = dict(os.environ)
+ env.update(
+ {
+ "RECIPE": recipe,
+ "REF": ref,
+ "SRC": src,
+ "PR": pr,
+ "STAGES": stages,
+ "CCCI_RUN_ID": run_id,
+ "CCCI_RUNS_DIR": runs_dir,
+ "HOME": "/root",  # NOTE(review): overrides the caller's HOME — presumably the harness expects root's config; confirm
+ }
+ )
+ # os.environ is copied wholesale above, so PLAYWRIGHT env set by the outer cc-ci-run wrapper is kept.
+
+ script = os.path.join(ROOT, "runner", "run_recipe_ci.py")
+ result = subprocess.run(
+ [sys.executable, script],
+ env=env,
+ timeout=timeout,
+ )
+ rc = result.returncode
+
+ artifact_dir = os.path.join(runs_dir, run_id)
+ results_path = os.path.join(artifact_dir, "results.json")
+ results_data: dict | None = None
+ if os.path.exists(results_path):
+ with open(results_path) as f:
+ results_data = json.load(f)
+
+ return rc, results_data, artifact_dir
+
+
+def find_stage_tests(results: dict, stage_name: str) -> list[dict]:
+    """Return the "tests" list of the first stage named stage_name in results.json, or []."""
+    named = (entry for entry in results.get("stages", []) if entry.get("name") == stage_name)
+    hit = next(named, None)
+    # A matching stage without a "tests" key also yields [].
+    return [] if hit is None else hit.get("tests", [])
+
+
+def stage_has_passing_test(results: dict, stage_name: str, test_name_substr: str) -> bool:
+    """Report whether stage_name holds a "pass" test whose name contains test_name_substr."""
+    return any(
+        test_name_substr in entry.get("name", "") and entry.get("status") == "pass"
+        for entry in find_stage_tests(results, stage_name)
+    )
+
+
+def stage_has_failing_test(results: dict, stage_name: str, test_name_substr: str) -> bool:
+    """Report whether stage_name holds a "fail"/"error" test whose name contains test_name_substr."""
+    return any(
+        test_name_substr in entry.get("name", "") and entry.get("status") in ("fail", "error")
+        for entry in find_stage_tests(results, stage_name)
+    )
diff --git a/tests/regression/test_canaries.py b/tests/regression/test_canaries.py
new file mode 100644
index 0000000..361fabd
--- /dev/null
+++ b/tests/regression/test_canaries.py
@@ -0,0 +1,344 @@
+"""E2E canary regression tests — the server's standing self-test suite.
+
+Seven canaries prove both halves of the server's job:
+ 1. GREEN canaries — good apps are reported healthy (install+upgrade+backup/restore pass).
+ 2. RED canaries — broken apps are caught at the intended tier; a false-green makes THIS test fail.
+
+Fast subset (@pytest.mark.canary_fast): the four per-tier RED canaries (custom-html-tiny plus the
+custom-html-bkp-bad / custom-html-rst-bad fixtures). Run with `-m canary_fast` pre-merge.
+Full suite (-m canary): includes good-significant (lasuite-docs, 10-20 min).
+
+Run: cc-ci-run python -m pytest tests/regression/ -m canary -v
+Pin policy: canary refs are pinned to specific SHAs. Update only after confirming the new ref gives
+the expected verdict.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+import conftest as _reg # noqa: E402
+
+run_recipe_ci = _reg.run_recipe_ci
+stage_has_passing_test = _reg.stage_has_passing_test
+stage_has_failing_test = _reg.stage_has_failing_test
+
+# ---------------------------------------------------------------------------
+# Canary definitions
+# ---------------------------------------------------------------------------
+
+# Good canary 1: minimal static-file server — fast signal, few deps.
+_SIMPLE = {
+ "id": "good-simple",
+ "recipe": "custom-html-tiny",
+ "src": "recipe-maintainers/custom-html-tiny",
+ # Pin: main @ 2026-06-02 — update if the recipe publishes a new release and pin goes stale.
+ "ref": "435df8fc98ef7598084fcffcd6225470eca80053",
+ "expected_green": True,
+ # (stage, test-name substring) pairs that MUST match a "pass" entry — the semantic teeth.
+ # If the generic install assertion is removed/vacated, test_serving disappears → this fails.
+ "stage_pass_checks": [
+ ("install", "test_serving"),
+ ],
+ "stage_fail_checks": [],
+}
+
+# Good canary 2: multi-service stack — backend + Postgres + Collabora WOPI + OIDC.
+# Exercises real breadth. Slowest canary (~10-20 min full lifecycle).
+_SIGNIFICANT = {
+ "id": "good-significant",
+ "recipe": "lasuite-docs",
+ "src": "recipe-maintainers/lasuite-docs",
+ # Pin: main @ 2026-06-02 — re-pin only after confirming the new ref still gives GREEN.
+ "ref": "290a8ad72d06232f0b3f302d976af14bef0f3c53",
+ "expected_green": True,
+ "stage_pass_checks": [
+ ("install", "test_serving_and_frontend"),
+ ],
+ "stage_fail_checks": [],
+}
+
+# Bad canary: app is UP + passes all lifecycle tiers but the custom functional assertion detects a
+# semantic defect (wrong Content-Type for .txt files). The harness MUST report RED.
+# If the harness wrongly returns green for this fixture, assert rc != 0 fails → false-green caught.
+_BAD = {
+ "id": "bad-false-green",
+ "recipe": "custom-html",
+ "src": "recipe-maintainers/custom-html",
+ # Pin: v5-stale-docroot @ 71e7326 — serves .txt as application/octet-stream; build #75 was RED.
+ # Recreate pattern if branch disappears: app up + passes lifecycle, fails one content assertion.
+ "ref": "71e7326a99bbb69035a046fba8fa51859ca66115",
+ "expected_green": False,
+ # (stage, test-name substring) that must have FAILED, proving the content-type check has teeth.
+ # If the assertion is vacated and the test disappears, stage_has_failing_test() returns False
+ # → the assert in _assert_red fails → we detect that the guard was removed.
+ "stage_pass_checks": [],
+ "stage_fail_checks": [
+ ("custom", "test_content_type"),
+ ],
+}
+
+# ---------------------------------------------------------------------------
+# Per-tier RED canaries (fast subset: @pytest.mark.canary_fast)
+# Prove the server catches failure at EVERY lifecycle tier — false-green at any tier is caught.
+# Each uses custom-html-tiny (deploys in seconds) or a custom-html-*-bad fixture (fast nginx).
+# ---------------------------------------------------------------------------
+
+# Shared bad-image branch: deploy fails at prepull because the image doesn't exist on Docker Hub.
+# Used for install-RED (STAGES=install → chaos of HEAD with bad image → install=fail)
+# and upgrade-RED (STAGES=install,upgrade → prev-version install passes, upgrade chaos fails).
+_BAD_IMAGE_REF = "4ae8866100563204d40435c5aba00374aa5a8ed3" # regression-bad-image @ 2026-06-02
+
+_BAD_INSTALL = {
+ "id": "bad-install",
+ "recipe": "custom-html-tiny",
+ "src": "recipe-maintainers/custom-html-tiny",
+ "ref": _BAD_IMAGE_REF,
+ "expected_green": False,
+ # STAGES=install only → no upgrade tier → prev=None → chaos deploy of HEAD (bad image) → fails.
+ "stages": "install",
+ # Checked by _assert_red_at_tier: install must be the failing tier, with no tiers before it.
+ "failing_tier": "install",
+ "passing_tiers_before": [],
+ "stage_pass_checks": [],
+ "stage_fail_checks": [],
+}
+
+_BAD_UPGRADE = {
+ "id": "bad-upgrade",
+ "recipe": "custom-html-tiny",
+ "src": "recipe-maintainers/custom-html-tiny",
+ "ref": _BAD_IMAGE_REF,
+ "expected_green": False,
+ # Upgrade tier in STAGES → prev-version deploy (good image) → install=PASS; upgrade chaos (bad image) → FAIL.
+ "stages": "install,upgrade,custom",
+ "failing_tier": "upgrade",
+ "passing_tiers_before": ["install"],
+ "stage_pass_checks": [],
+ "stage_fail_checks": [],
+}
+
+_BAD_BACKUP = {
+ "id": "bad-backup",
+ "recipe": "custom-html-bkp-bad",
+ "src": "recipe-maintainers/custom-html-bkp-bad",
+ # Pin: custom-html-bkp-bad main @ 2026-06-02 — custom-html WITHOUT backupbot labels.
+ # cc-ci recipe_meta sets BACKUP_CAPABLE=True → harness runs backup tier.
+ # No backupbot.backup=true label → backup-bot-two finds no containers → no snapshot.
+ # parse_snapshot_id returns None → test_backup_artifact fails → backup tier RED.
+ "ref": "b6fe99de41601f9e51bc7ea5b6072f0c3f56cdc3",
+ "expected_green": False,
+ "stages": "install,upgrade,backup",
+ "failing_tier": "backup",
+ "passing_tiers_before": ["install"],
+ "stage_pass_checks": [],
+ "stage_fail_checks": [],
+}
+
+_BAD_RESTORE = {
+ "id": "bad-restore",
+ "recipe": "custom-html-rst-bad",
+ "src": "recipe-maintainers/custom-html-rst-bad",
+ # Pin: custom-html-rst-bad main @ 2026-06-02 (9a73a184).
+ # No pre_backup hook → backup snapshot has no ci-marker.txt.
+ # pre_restore writes "mutated". After restore: marker stays "mutated" → FAIL → restore=RED.
+ # install+backup PASS (no test_backup.py in cc-ci dir); upgrade=skip (no version tags).
+ "ref": "9a73a184e739691bc6a621a5f1e6efc799743c5b",
+ "expected_green": False,
+ "stages": "install,backup,restore,custom",
+ "failing_tier": "restore",
+ "passing_tiers_before": ["install", "backup"],
+ "stage_pass_checks": [],
+ "stage_fail_checks": [
+ ("restore", "test_restore_returns_state"),
+ ],
+}
+
+CANARIES = [_SIMPLE, _SIGNIFICANT, _BAD]  # parametrizes test_canary
+CANARIES_FAST = [_BAD_INSTALL, _BAD_UPGRADE, _BAD_BACKUP, _BAD_RESTORE]  # parametrizes test_canary_fast
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.canary
+@pytest.mark.parametrize("canary", CANARIES, ids=[c["id"] for c in CANARIES])
+def test_canary(canary, tmp_path):
+ """Drive the full cold CI lifecycle for a canary recipe and verify the outcome.
+
+ For GREEN canaries: proves the harness correctly reports a healthy app as healthy, and that
+ the per-tier semantic assertions actually ran (not vacuous).
+
+ For the RED canary: proves the harness catches a broken app — if the harness wrongly returned
+ green, `assert rc != 0` fails, catching the false-green.
+ """
+ stages = canary.get("stages", "install,upgrade,backup,restore,custom")  # no "stages" key → all tiers
+ rc, results, artifact_dir = run_recipe_ci(
+ recipe=canary["recipe"],
+ src=canary["src"],
+ ref=canary["ref"],
+ runs_dir=str(tmp_path),  # artifacts go to pytest's per-test tmp dir, not the default runs dir
+ stages=stages,
+ )
+
+ _note = f"artifact_dir={artifact_dir}" # appended to every assert message to locate the artifacts
+
+ if canary["expected_green"]:
+ _assert_green(rc, results, canary, _note)
+ else:
+ _assert_red(rc, results, canary, _note)
+
+
+@pytest.mark.canary
+@pytest.mark.canary_fast
+@pytest.mark.parametrize("canary", CANARIES_FAST, ids=[c["id"] for c in CANARIES_FAST])
+def test_canary_fast(canary, tmp_path):
+ """Fast per-tier RED canaries: each proves the server catches failure at a specific lifecycle tier.
+
+ Each canary is broken at exactly one tier; the test asserts:
+ - Overall verdict: RED (rc != 0)
+ - The intended failing tier has status "fail"
+ - Tiers BEFORE the intended failure have status "pass" (proving tier-specific detection, not
+ "fails somewhere")
+
+ These use fast recipes (custom-html-tiny and the custom-html-bkp-bad / custom-html-rst-bad
+ fixtures) and are intended as a pre-merge quick check alongside the full slow suite.
+ """
+ stages = canary.get("stages", "install,upgrade,backup,restore,custom")  # no "stages" key → all tiers
+ rc, results, artifact_dir = run_recipe_ci(
+ recipe=canary["recipe"],
+ src=canary["src"],
+ ref=canary["ref"],
+ runs_dir=str(tmp_path),  # artifacts go to pytest's per-test tmp dir, not the default runs dir
+ stages=stages,
+ )
+
+ _note = f"artifact_dir={artifact_dir}"  # appended to every assert message to locate the artifacts
+ _assert_red_at_tier(rc, results, canary, _note)
+
+
+def _assert_green(rc: int, results: dict | None, canary: dict, note: str) -> None:
+ """Assert a good-canary run is GREEN: rc, tier results, teardown/leak flags, named-test checks."""
+
+ # 1. Harness exit code must be 0 (GREEN).
+ assert rc == 0, f"[{canary['id']}] harness returned non-zero rc={rc} — expected GREEN. {note}"
+
+ assert (
+ results is not None
+ ), f"[{canary['id']}] results.json not written — harness may have crashed. {note}"
+
+ # 2. Install tier must have passed.
+ assert results.get("results", {}).get("install") == "pass", (
+ f"[{canary['id']}] install tier did not pass: " f"results={results.get('results')}. {note}"
+ )
+
+ # 3. No tier may have FAILED (skips are acceptable for recipes without backup or custom tests).
+ failed_tiers = [t for t, s in results.get("results", {}).items() if s == "fail"]
+ assert not failed_tiers, f"[{canary['id']}] tiers failed: {failed_tiers}. {note}"
+
+ # 4. Teardown must be clean (no leftover containers/volumes/secrets).
+ assert (
+ results.get("flags", {}).get("clean_teardown") is True
+ ), f"[{canary['id']}] clean_teardown=False — residual state left on server. {note}"
+
+ # 5. No secret values leaked into the results artifact.
+ assert (
+ results.get("flags", {}).get("no_secret_leak") is True
+ ), f"[{canary['id']}] no_secret_leak=False — a secret value appeared in results.json. {note}"
+
+ # 6. Semantic stage assertions — TEETH CHECK.
+ # Each (stage, substring) pair must match a test with status "pass" in that stage.
+ # If a tier assertion is removed or made vacuous, the named test disappears from results.json
+ # and this assert fires — proving the regression suite guards against silent test removal.
+ for stage_name, test_name_substr in canary.get("stage_pass_checks", []):
+ assert stage_has_passing_test(results, stage_name, test_name_substr), (
+ f"[{canary['id']}] expected a passing test containing {test_name_substr!r} in "
+ f"stage={stage_name!r}, but none found. "
+ f"Stage tests: {[t['name'] for t in _stage_tests(results, stage_name)]}. {note}"
+ )
+
+
+def _assert_red(rc: int, results: dict | None, canary: dict, note: str) -> None:
+    """Assert a bad-canary run is RED (false-green guard).
+
+    PRIMARY: rc != 0 — if the harness wrongly returns 0 (green) for this fixture, the suite fails
+    here. SECONDARY: results.json must exist and show every `stage_fail_checks` test as failed.
+    """
+
+    # PRIMARY: a zero exit code for a known-broken app is exactly the false-green we guard against.
+    assert rc != 0, (
+        f"[{canary['id']}] harness returned rc=0 (GREEN) for a KNOWN-BAD fixture — "
+        f"FALSE-GREEN detected. The harness failed to catch the broken app. {note}"
+    )
+
+    # SECONDARY: the named failing test must be present in results.json. A missing results.json
+    # (e.g. a harness crash) is not a verified RED, so it fails instead of skipping these checks.
+    assert results is not None, (
+        f"[{canary['id']}] results.json not written — cannot verify the failing test. {note}"
+    )
+    for stage_name, test_name_substr in canary.get("stage_fail_checks", []):
+        assert stage_has_failing_test(results, stage_name, test_name_substr), (
+            f"[{canary['id']}] expected a failing test containing {test_name_substr!r} in "
+            f"stage={stage_name!r}, but none found. "
+            f"The guard may have been removed or made vacuous. "
+            f"Stage tests: {[t['name'] for t in _stage_tests(results, stage_name)]}. {note}"
+        )
+
+
+def _assert_red_at_tier(rc: int, results: dict | None, canary: dict, note: str) -> None:
+    """Assert a per-tier RED canary: overall RED, failing_tier=fail, passing_tiers_before=pass.
+
+    Proves the server catches failure AT THE INTENDED TIER (not just "fails somewhere"), and that
+    the tiers before it still PASSED. rc == 0 is a false-green; a missing results.json also fails,
+    because without it the failing tier cannot be verified (e.g. the harness itself crashed).
+    """
+    failing_tier = canary.get("failing_tier")
+    passing_before = canary.get("passing_tiers_before", [])
+
+    # PRIMARY: harness must return non-zero.
+    assert rc != 0, (
+        f"[{canary['id']}] harness returned rc=0 (GREEN) for a KNOWN-BAD fixture at tier "
+        f"{failing_tier!r} — FALSE-GREEN. {note}"
+    )
+
+    assert results is not None, (
+        f"[{canary['id']}] results.json not written — cannot verify tier {failing_tier!r}. {note}"
+    )
+    tier_results = results.get("results", {})
+
+    # The intended failing tier must be "fail".
+    if failing_tier:
+        actual = tier_results.get(failing_tier)
+        assert actual == "fail", (
+            f"[{canary['id']}] expected tier {failing_tier!r}='fail', got {actual!r}. "
+            f"All tier results: {tier_results}. {note}"
+        )
+
+    # Tiers before the failing tier must have passed (no collateral damage from the fixture).
+    for tier in passing_before:
+        actual = tier_results.get(tier)
+        assert actual == "pass", (
+            f"[{canary['id']}] expected prior tier {tier!r}='pass' before failing at "
+            f"{failing_tier!r}, got {actual!r}. All results: {tier_results}. {note}"
+        )
+
+    # Optional: specific failing test name (for the restore-RED canary).
+    for stage_name, test_name_substr in canary.get("stage_fail_checks", []):
+        assert stage_has_failing_test(results, stage_name, test_name_substr), (
+            f"[{canary['id']}] expected a failing test containing {test_name_substr!r} in "
+            f"stage={stage_name!r}. "
+            f"Stage tests: {[t['name'] for t in _stage_tests(results, stage_name)]}. {note}"
+        )
+
+
+def _stage_tests(results: dict, stage_name: str) -> list[dict]:
+    """Return the per-test list for a named stage from results.json, or [].
+
+    Delegates to conftest.find_stage_tests so the stage lookup lives in one place."""
+    return _reg.find_stage_tests(results, stage_name)