From 73427d6e36805086607104c88b5ce58462429c93 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@git.autonomic.zone>
Date: Tue, 2 Jun 2026 03:33:24 +0000
Subject: [PATCH] feat(regression): add E2E canary regression suite
 (tests/regression/)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Seven canaries prove both halves of the server's job:
- GREEN: good apps are reported healthy (good-simple + good-significant)
- RED: broken apps are caught at intended tier (false-green guard + 4 per-tier)

Fixtures: custom-html-bkp-bad (backup tier RED) + custom-html-rst-bad (restore tier RED).
All 7 canaries verified on live server (see STATUS-regression.md for artifacts).

Not wired to per-commit CI — run on-demand: pytest -m canary tests/regression/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/custom-html-bkp-bad/ops.py          |  19 ++
 tests/custom-html-bkp-bad/recipe_meta.py  |   5 +
 tests/custom-html-bkp-bad/test_backup.py  |  28 ++
 tests/custom-html-bkp-bad/test_restore.py |  25 ++
 tests/custom-html-rst-bad/ops.py          |  15 +
 tests/custom-html-rst-bad/recipe_meta.py  |   3 +
 tests/custom-html-rst-bad/test_restore.py |  23 ++
 tests/regression/README.md                | 136 +++++++++
 tests/regression/conftest.py              | 106 +++++++
 tests/regression/test_canaries.py         | 344 ++++++++++++++++++++++
 10 files changed, 704 insertions(+)
 create mode 100644 tests/custom-html-bkp-bad/ops.py
 create mode 100644 tests/custom-html-bkp-bad/recipe_meta.py
 create mode 100644 tests/custom-html-bkp-bad/test_backup.py
 create mode 100644 tests/custom-html-bkp-bad/test_restore.py
 create mode 100644 tests/custom-html-rst-bad/ops.py
 create mode 100644 tests/custom-html-rst-bad/recipe_meta.py
 create mode 100644 tests/custom-html-rst-bad/test_restore.py
 create mode 100644 tests/regression/README.md
 create mode 100644 tests/regression/conftest.py
 create mode 100644 tests/regression/test_canaries.py

diff --git a/tests/custom-html-bkp-bad/ops.py b/tests/custom-html-bkp-bad/ops.py
new file mode 100644
index 0000000..f6db098
--- /dev/null
+++ b/tests/custom-html-bkp-bad/ops.py
@@ -0,0 +1,19 @@
+"""custom-html-bkp-bad — lifecycle ops for bad-backup/bad-restore RED canaries.
+
+Intentionally has NO pre_backup hook: the marker is never seeded before backup,
+so the backup snapshot has no ci-marker.txt. pre_restore writes "mutated" so that if
+restore DOES bring back the snapshot, the marker is gone/still-mutated → test fails.
+"""
+
+from __future__ import annotations
+
+from harness import lifecycle
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def pre_restore(domain: str, meta: dict) -> None:
+    """Write 'mutated' to the marker before restore runs. If restore brings back the
+    snapshot (which has no marker — never seeded by pre_backup), the marker ends up
+    MISSING or 'mutated' after restore → test_restore_returns_state FAILS → restore=RED."""
+    lifecycle.exec_in_app(domain, ["sh", "-c", f"echo mutated > {MARKER_PATH}"])
diff --git a/tests/custom-html-bkp-bad/recipe_meta.py b/tests/custom-html-bkp-bad/recipe_meta.py
new file mode 100644
index 0000000..cff4007
--- /dev/null
+++ b/tests/custom-html-bkp-bad/recipe_meta.py
@@ -0,0 +1,5 @@
+# custom-html-bkp-bad — regression fixture for bad-backup canary.
+# This recipe is custom-html WITHOUT backupbot labels. Setting BACKUP_CAPABLE=True here forces the
+# harness to run the backup tier; the recipe itself has no backupbot service, so
+# `abra app backup create` produces no snapshot → test_backup_artifact fails → backup tier RED.
+BACKUP_CAPABLE = True
diff --git a/tests/custom-html-bkp-bad/test_backup.py b/tests/custom-html-bkp-bad/test_backup.py
new file mode 100644
index 0000000..373fae9
--- /dev/null
+++ b/tests/custom-html-bkp-bad/test_backup.py
@@ -0,0 +1,28 @@
+"""custom-html-bkp-bad — BACKUP assertion (bad-backup RED canary).
+
+This recipe has no ops.py::pre_backup, so ci-marker.txt is NEVER seeded before the backup.
+Asserting its presence here causes backup tier RED — proving the server catches a recipe that
+claims backup support but doesn't actually back up the expected data.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle  # noqa: E402
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def test_backup_captures_state(live_app):
+    """Assert the pre-backup marker is present and equals 'original'.
+
+    Since custom-html-bkp-bad has no ops.py::pre_backup to seed the marker, this file does NOT
+    exist at backup time — the shell fallback prints 'MISSING' → assertion fails → backup tier RED.
+    This models a recipe that declares backup capability but omits the data-seeding hook."""
+    result = lifecycle.exec_in_app(live_app, ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]).strip()
+    assert result == "original", (
+        f"backup did not capture the expected marker at {MARKER_PATH}: got {result!r}. "
+        "Expected 'original' (seeded by pre_backup). If the marker is 'MISSING', the pre_backup "
+        "hook was not run — this is the intended failure for the bad-backup RED canary."
+    )
diff --git a/tests/custom-html-bkp-bad/test_restore.py b/tests/custom-html-bkp-bad/test_restore.py
new file mode 100644
index 0000000..ce1b924
--- /dev/null
+++ b/tests/custom-html-bkp-bad/test_restore.py
@@ -0,0 +1,25 @@
+"""custom-html-bkp-bad — RESTORE assertion (bad-restore RED canary).
+
+pre_restore seeds 'mutated' to ci-marker.txt. The backup snapshot has no ci-marker.txt
+(never seeded by pre_backup). After restore, the marker is either MISSING or 'mutated' —
+never 'original' — so this assertion FAILS → restore tier RED.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle  # noqa: E402
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def test_restore_returns_state(live_app):
+    result = lifecycle.exec_in_app(
+        live_app, ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
+    ).strip()
+    assert result == "original", (
+        f"restore did not return the pre-mutation (backed-up) state: got {result!r}. "
+        "Expected 'original'. The backup had no marker (not seeded by pre_backup), so "
+        "restore cannot recover it — this is the intended failure for the bad-restore RED canary."
+    )
diff --git a/tests/custom-html-rst-bad/ops.py b/tests/custom-html-rst-bad/ops.py
new file mode 100644
index 0000000..3f3b920
--- /dev/null
+++ b/tests/custom-html-rst-bad/ops.py
@@ -0,0 +1,15 @@
+"""custom-html-rst-bad — lifecycle ops for bad-restore RED canary.
+
+NO pre_backup hook: marker never seeded before backup → snapshot has no ci-marker.txt.
+pre_restore writes "mutated". After restore, marker stays "mutated" (not in snapshot) → FAIL.
+"""
+
+from __future__ import annotations
+
+from harness import lifecycle
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def pre_restore(domain: str, meta: dict) -> None:
+    lifecycle.exec_in_app(domain, ["sh", "-c", f"echo mutated > {MARKER_PATH}"])
diff --git a/tests/custom-html-rst-bad/recipe_meta.py b/tests/custom-html-rst-bad/recipe_meta.py
new file mode 100644
index 0000000..7e5bda7
--- /dev/null
+++ b/tests/custom-html-rst-bad/recipe_meta.py
@@ -0,0 +1,3 @@
+# custom-html-rst-bad — regression fixture for bad-restore canary.
+# BACKUP_CAPABLE=True forces the backup tier to run even though the recipe has no backupbot label.
+BACKUP_CAPABLE = True
diff --git a/tests/custom-html-rst-bad/test_restore.py b/tests/custom-html-rst-bad/test_restore.py
new file mode 100644
index 0000000..a361d65
--- /dev/null
+++ b/tests/custom-html-rst-bad/test_restore.py
@@ -0,0 +1,23 @@
+"""custom-html-rst-bad — RESTORE assertion (bad-restore RED canary).
+
+No pre_backup → backup snapshot has no ci-marker.txt. pre_restore writes "mutated".
+After restore: marker is "mutated" (restore can't recover "original" — wasn't backed up) → FAIL.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle  # noqa: E402
+
+MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
+
+
+def test_restore_returns_state(live_app):
+    result = lifecycle.exec_in_app(
+        live_app, ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
+    ).strip()
+    assert result == "original", (
+        f"restore did not return the pre-mutation (backed-up) state: got {result!r}. "
+        "Expected 'original'. The backup had no marker, so restore cannot recover it."
+    )
diff --git a/tests/regression/README.md b/tests/regression/README.md
new file mode 100644
index 0000000..0b02acd
--- /dev/null
+++ b/tests/regression/README.md
@@ -0,0 +1,136 @@
+# Regression canaries — E2E self-tests for the cc-ci server
+
+A standing pytest suite that drives the **real** cc-ci lifecycle harness against pinned canary
+recipes and verifies both halves of the server's job:
+
+1. **Good canaries** — healthy apps are reported GREEN (install + upgrade + backup/restore pass).
+2. **Bad canaries** — broken apps are caught RED; a false-green makes the regression test itself fail.
+
+These tests run the full cold lifecycle on the live cc-ci server. They are **slow** (minutes per
+canary) and **opt-in** — kept out of the per-commit fast path by the `canary` marker.
+
+---
+
+## How to run
+
+Run on the cc-ci server (abra + Docker + Swarm required):
+
+```bash
+ssh cc-ci
+cd /root/cc-ci            # or wherever the repo is checked out
+cc-ci-run python -m pytest tests/regression/ -m canary -v
+```
+
+Or a single canary:
+
+```bash
+cc-ci-run python -m pytest tests/regression/ -m canary -k good-simple -v
+```
+
+From the orchestrator:
+
+```bash
+ssh cc-ci "cd /root/cc-ci && cc-ci-run python -m pytest tests/regression/ -m canary -v"
+```
+
+---
+
+## Canaries
+
+| ID | Recipe | Purpose | Expected verdict |
+|----|--------|---------|-----------------|
+| `good-simple` | `custom-html-tiny` | Minimal static server — fast signal | GREEN |
+| `good-significant` | `lasuite-docs` | Multi-service (backend + Postgres + Collabora + OIDC) | GREEN |
+| `bad-false-green` | `custom-html` @ `v5-stale-docroot` | App is UP but serves wrong Content-Type — catches false-green | RED |
+
+### Why the bad canary exists
+
+The scariest regression is a **false-green**: the server reports PASS while the app is broken.
+We already saw a fabricated full-PASS during the build. The `bad-false-green` canary pins a known-
+broken fixture (`v5-stale-docroot`: nginx serves `.txt` as `application/octet-stream`). The
+harness's `test_content_type_html_and_txt` catches this and returns RED (build #75 was RED for
+exactly this fixture).
+
+The regression test asserts `rc != 0`. If the harness ever wrongly returns green for this fixture,
+that assert fires — false-green is caught before any merge.
+
+---
+
+## What each canary verifies
+
+### Per-tier semantic assertions (the "teeth")
+
+The tests assert MORE than the harness exit code: they check that **specific named assertions**
+ran and got the expected result. This guards against a different failure mode — a tier that
+nominally "passes" because the assertion was silently removed or made vacuous.
+
+| Stage | Test name | What it proves |
+|-------|-----------|---------------|
+| install | `test_serving` | Generic HTTP readiness check actually ran |
+| install | `test_serving_and_frontend` | Lasuite-docs frontend (SPA shell) actually loaded |
+| custom | `test_content_type` | Content-type assertion actually ran (bad canary only) |
+
+If a tier assertion is removed: the named test disappears from `results.json` → the semantic
+check fires → the regression suite catches the removal.
+
+### Additional structural assertions (good canaries)
+
+- `install` tier: "pass" (not fail, not skip)
+- No tier is "fail" (skips acceptable for recipes without backup/custom tests)
+- `flags.clean_teardown = True` (no leftover containers/volumes/secrets)
+- `flags.no_secret_leak = True` (no secret value in the results artifact)
+
+---
+
+## Cadence policy
+
+**Do NOT run on every commit or PR.** These are slow and resource-heavy. Run them:
+
+- Before a **release** of the cc-ci server (after a batch of server changes).
+- As a **polishing pass** or pre-merge check for significant server refactors.
+- On-demand when you suspect a regression: `pytest -m canary` (or `-m canary_fast` for the per-tier RED subset).
+
+They are NOT wired to the per-commit Drone pipeline. If adding a `!testme`-style trigger for the
+cc-ci repo, gate it behind a deliberate label (e.g. `run-canaries`) — not an automatic run on
+every push.
+
+---
+
+## How to add a canary
+
+1. Identify a recipe that is already deployable and has pinned version tags.
+2. Decide the expected verdict (GREEN or RED) and which tier assertions have teeth.
+3. Add an entry to `CANARIES` in `test_canaries.py`:
+
+```python
+{
+    "id": "good-myrecipe",
+    "recipe": "my-recipe",
+    "src": "recipe-maintainers/my-recipe",
+    "ref": "<pinned-sha>",           # pin to a specific commit for stability
+    "expected_green": True,
+    "stage_pass_checks": [
+        ("install", "test_serving"),  # verify this named test ran and passed
+    ],
+    "stage_fail_checks": [],
+}
+```
+
+4. Run the canary once to confirm it passes:
+   `cc-ci-run python -m pytest tests/regression/ -m canary -k good-myrecipe -v`
+
+5. Update the pin comment with the date and the recipe version it was pinned at.
+
+---
+
+## Pin maintenance
+
+Canary refs are pinned to specific SHAs for stability. When a recipe publishes a new release:
+
+1. Update the `"ref"` SHA in the canary definition (use the new main-branch HEAD).
+2. Update the pin comment with the new date/version.
+3. Re-run the canary to confirm GREEN before committing the pin update.
+
+The bad canary (`v5-stale-docroot`) is a stable fixture branch — update only if the branch is
+deleted. If deleted, recreate the pattern: an app that is up + passes lifecycle tiers but fails
+one functional assertion.
diff --git a/tests/regression/conftest.py b/tests/regression/conftest.py
new file mode 100644
index 0000000..519d3ee
--- /dev/null
+++ b/tests/regression/conftest.py
@@ -0,0 +1,106 @@
+"""Shared fixtures and helpers for E2E canary regression tests.
+
+The regression tests call the real cc-ci harness (run_recipe_ci.py) as a subprocess and assert on
+its outputs (exit code, results.json). They run ON the cc-ci server, not the orchestrator — abra,
+Docker, and Swarm must be present.
+
+Invoke: cc-ci-run python -m pytest tests/regression/ -m canary -v
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+import time
+
+ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers",
+        "canary: slow E2E canary test — drives the full cold CI lifecycle; run on-demand only.",
+    )
+    config.addinivalue_line(
+        "markers",
+        "canary_fast: fast per-tier RED canary (still tagged canary); subset for quick pre-merge checks.",
+    )
+
+
+def run_recipe_ci(
+    recipe: str,
+    src: str,
+    ref: str,
+    pr: str = "0",
+    stages: str = "install,upgrade,backup,restore,custom",
+    runs_dir: str | None = None,
+    run_id_prefix: str = "regression",
+    timeout: int = 3600,
+) -> tuple[int, dict | None, str]:
+    """Run runner/run_recipe_ci.py as a subprocess, configured through environment variables.
+
+    Returns (rc, results, artifact_dir); results is the parsed results.json, or None when the
+    harness did not write one. Child stdout/stderr are inherited, so progress streams live.
+    subprocess.TimeoutExpired propagates if the run exceeds ``timeout`` seconds.
+    """
+    if runs_dir is None:
+        runs_dir = "/var/lib/cc-ci-runs"
+    run_id = f"{run_id_prefix}-{recipe}-{ref[:12]}-{int(time.time())}"
+
+    # Start from the caller's environment so PLAYWRIGHT settings exported by the outer
+    # cc-ci-run wrapper reach the harness; the keys below override any inherited values.
+    env = {
+        **os.environ,
+        "RECIPE": recipe,
+        "REF": ref,
+        "SRC": src,
+        "PR": pr,
+        "STAGES": stages,
+        "CCCI_RUN_ID": run_id,
+        "CCCI_RUNS_DIR": runs_dir,
+        "HOME": "/root",
+    }
+
+    script = os.path.join(ROOT, "runner", "run_recipe_ci.py")
+    completed = subprocess.run(
+        [sys.executable, script],
+        env=env,
+        timeout=timeout,
+    )
+
+    artifact_dir = os.path.join(runs_dir, run_id)
+    results_data: dict | None = None
+    try:
+        # results.json is read as UTF-8 explicitly rather than via the locale default encoding.
+        with open(os.path.join(artifact_dir, "results.json"), encoding="utf-8") as f:
+            results_data = json.load(f)
+    except FileNotFoundError:
+        pass  # harness exited before writing results; callers treat None as "no results"
+
+    return completed.returncode, results_data, artifact_dir
+
+
+def find_stage_tests(results: dict, stage_name: str) -> list[dict]:
+    """Look up ``stage_name`` in results["stages"] and hand back its "tests" list, else []."""
+    # Only the first stage entry carrying that name is consulted.
+    candidates = (s for s in results.get("stages", []) if s.get("name") == stage_name)
+    first = next(candidates, None)
+    return [] if first is None else first.get("tests", [])
+
+
+def stage_has_passing_test(results: dict, stage_name: str, test_name_substr: str) -> bool:
+    """True when ``stage_name`` has a test whose name contains ``test_name_substr`` and passed."""
+    return any(
+        test_name_substr in entry.get("name", "") and entry.get("status") == "pass"
+        for entry in find_stage_tests(results, stage_name)
+    )
+
+
+def stage_has_failing_test(results: dict, stage_name: str, test_name_substr: str) -> bool:
+    """True when ``stage_name`` has a matching test whose status is "fail" or "error"."""
+    return any(
+        test_name_substr in entry.get("name", "") and entry.get("status") in ("fail", "error")
+        for entry in find_stage_tests(results, stage_name)
+    )
diff --git a/tests/regression/test_canaries.py b/tests/regression/test_canaries.py
new file mode 100644
index 0000000..361fabd
--- /dev/null
+++ b/tests/regression/test_canaries.py
@@ -0,0 +1,344 @@
+"""E2E canary regression tests — the server's standing self-test suite.
+
+Seven canaries prove both halves of the server's job:
+  1. GREEN canaries — good apps are reported healthy (install+upgrade+backup/restore pass).
+  2. RED canaries   — broken apps are caught at the intended tier; a false-green makes THIS test fail.
+
+Fast subset (@pytest.mark.canary_fast): the four per-tier RED canaries (custom-html-tiny and the
+custom-html-*-bad fixtures) — fast to deploy. Run with `-m canary_fast` as a pre-merge quick check.
+Full suite (-m canary): all seven canaries, including good-significant (lasuite-docs, 10-20 min).
+
+Run: cc-ci-run python -m pytest tests/regression/ -m canary -v
+Pin policy: canary refs are pinned to specific SHAs. Update only after confirming the new ref gives
+the expected verdict.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+import conftest as _reg  # noqa: E402
+
+run_recipe_ci = _reg.run_recipe_ci
+stage_has_passing_test = _reg.stage_has_passing_test
+stage_has_failing_test = _reg.stage_has_failing_test
+
+# ---------------------------------------------------------------------------
+# Canary definitions
+# ---------------------------------------------------------------------------
+
+# Good canary 1: minimal static-file server — fast signal, few deps.
+_SIMPLE = {
+    "id": "good-simple",
+    "recipe": "custom-html-tiny",
+    "src": "recipe-maintainers/custom-html-tiny",
+    # Pin: main @ 2026-06-02 — update if the recipe publishes a new release and pin goes stale.
+    "ref": "435df8fc98ef7598084fcffcd6225470eca80053",
+    "expected_green": True,
+    # Named tests that MUST appear with "pass" in the result — these are the semantic teeth.
+    # If the generic install assertion is removed/vacated, test_serving disappears → this fails.
+    "stage_pass_checks": [
+        ("install", "test_serving"),
+    ],
+    "stage_fail_checks": [],
+}
+
+# Good canary 2: multi-service stack — backend + Postgres + Collabora WOPI + OIDC.
+# Exercises real breadth. Slowest canary (~10-20 min full lifecycle).
+_SIGNIFICANT = {
+    "id": "good-significant",
+    "recipe": "lasuite-docs",
+    "src": "recipe-maintainers/lasuite-docs",
+    # Pin: main @ 2026-06-02
+    "ref": "290a8ad72d06232f0b3f302d976af14bef0f3c53",
+    "expected_green": True,
+    "stage_pass_checks": [
+        ("install", "test_serving_and_frontend"),
+    ],
+    "stage_fail_checks": [],
+}
+
+# Bad canary: app is UP + passes all lifecycle tiers but the custom functional assertion detects a
+# semantic defect (wrong Content-Type for .txt files). The harness MUST report RED.
+# If the harness wrongly returns green for this fixture, assert rc != 0 fails → false-green caught.
+_BAD = {
+    "id": "bad-false-green",
+    "recipe": "custom-html",
+    "src": "recipe-maintainers/custom-html",
+    # Pin: v5-stale-docroot @ 71e7326 — serves .txt as application/octet-stream; build #75 was RED.
+    # Recreate pattern if branch disappears: app up + passes lifecycle, fails one content assertion.
+    "ref": "71e7326a99bbb69035a046fba8fa51859ca66115",
+    "expected_green": False,
+    # The specific test that must have FAILED, proving the content-type assertion has teeth.
+    # If the assertion is vacated and the test disappears, stage_has_failing_test() returns False
+    # → the assert below fails → we detect that the guard was removed.
+    "stage_pass_checks": [],
+    "stage_fail_checks": [
+        ("custom", "test_content_type"),
+    ],
+}
+
+# ---------------------------------------------------------------------------
+# Per-tier RED canaries (fast subset: @pytest.mark.canary_fast)
+# Prove the server catches failure at EVERY lifecycle tier — false-green at any tier is caught.
+# Each uses custom-html-tiny (deploys in seconds) or a custom-html-*-bad fixture (fast nginx).
+# ---------------------------------------------------------------------------
+
+# Shared bad-image branch: deploy fails at prepull because the image doesn't exist on Docker Hub.
+# Used for install-RED (STAGES=install → chaos of HEAD with bad image → install=fail)
+# and upgrade-RED (STAGES=install,upgrade → prev-version install passes, upgrade chaos fails).
+_BAD_IMAGE_REF = "4ae8866100563204d40435c5aba00374aa5a8ed3"  # regression-bad-image @ 2026-06-02
+
+_BAD_INSTALL = {
+    "id": "bad-install",
+    "recipe": "custom-html-tiny",
+    "src": "recipe-maintainers/custom-html-tiny",
+    "ref": _BAD_IMAGE_REF,
+    "expected_green": False,
+    # STAGES=install only → no upgrade tier → prev=None → chaos deploy of HEAD (bad image) → fails.
+    "stages": "install",
+    # Assertions: install must be the failing tier.
+    "failing_tier": "install",
+    "passing_tiers_before": [],
+    "stage_pass_checks": [],
+    "stage_fail_checks": [],
+}
+
+_BAD_UPGRADE = {
+    "id": "bad-upgrade",
+    "recipe": "custom-html-tiny",
+    "src": "recipe-maintainers/custom-html-tiny",
+    "ref": _BAD_IMAGE_REF,
+    "expected_green": False,
+    # upgrade in STAGES → prev-version deploy (good image) → install=PASS; upgrade chaos (bad image) → FAIL.
+    "stages": "install,upgrade,custom",
+    "failing_tier": "upgrade",
+    "passing_tiers_before": ["install"],
+    "stage_pass_checks": [],
+    "stage_fail_checks": [],
+}
+
+_BAD_BACKUP = {
+    "id": "bad-backup",
+    "recipe": "custom-html-bkp-bad",
+    "src": "recipe-maintainers/custom-html-bkp-bad",
+    # Pin: custom-html-bkp-bad main @ 2026-06-02 — custom-html WITHOUT backupbot labels.
+    # cc-ci recipe_meta sets BACKUP_CAPABLE=True → harness runs backup tier.
+    # No backupbot.backup=true label → backup-bot-two finds no containers → no snapshot.
+    # parse_snapshot_id returns None → test_backup_artifact fails → backup tier RED.
+    "ref": "b6fe99de41601f9e51bc7ea5b6072f0c3f56cdc3",
+    "expected_green": False,
+    "stages": "install,upgrade,backup",
+    "failing_tier": "backup",
+    "passing_tiers_before": ["install"],
+    "stage_pass_checks": [],
+    "stage_fail_checks": [],
+}
+
+_BAD_RESTORE = {
+    "id": "bad-restore",
+    "recipe": "custom-html-rst-bad",
+    "src": "recipe-maintainers/custom-html-rst-bad",
+    # Pin: custom-html-rst-bad main @ 2026-06-02 (9a73a184).
+    # No pre_backup hook → backup snapshot has no ci-marker.txt.
+    # pre_restore writes "mutated". After restore: marker stays "mutated" → FAIL → restore=RED.
+    # install+backup PASS (no test_backup.py in cc-ci dir); upgrade=skip (no version tags).
+    "ref": "9a73a184e739691bc6a621a5f1e6efc799743c5b",
+    "expected_green": False,
+    "stages": "install,backup,restore,custom",
+    "failing_tier": "restore",
+    "passing_tiers_before": ["install", "backup"],
+    "stage_pass_checks": [],
+    "stage_fail_checks": [
+        ("restore", "test_restore_returns_state"),
+    ],
+}
+
+CANARIES = [_SIMPLE, _SIGNIFICANT, _BAD]
+CANARIES_FAST = [_BAD_INSTALL, _BAD_UPGRADE, _BAD_BACKUP, _BAD_RESTORE]
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.canary
+@pytest.mark.parametrize("canary", CANARIES, ids=[c["id"] for c in CANARIES])
+def test_canary(canary, tmp_path):
+    """Drive the full cold CI lifecycle for a canary recipe and verify the outcome.
+
+    For GREEN canaries: proves the harness correctly reports a healthy app as healthy, and that
+    the per-tier semantic assertions actually ran (not vacuous).
+
+    For the RED canary: proves the harness catches a broken app — if the harness wrongly returned
+    green, `assert rc != 0` fails, catching the false-green.
+    """
+    stages = canary.get("stages", "install,upgrade,backup,restore,custom")
+    rc, results, artifact_dir = run_recipe_ci(
+        recipe=canary["recipe"],
+        src=canary["src"],
+        ref=canary["ref"],
+        runs_dir=str(tmp_path),
+        stages=stages,
+    )
+
+    _note = f"artifact_dir={artifact_dir}"  # visible in -v output via assert messages
+
+    if canary["expected_green"]:
+        _assert_green(rc, results, canary, _note)
+    else:
+        _assert_red(rc, results, canary, _note)
+
+
+@pytest.mark.canary
+@pytest.mark.canary_fast
+@pytest.mark.parametrize("canary", CANARIES_FAST, ids=[c["id"] for c in CANARIES_FAST])
+def test_canary_fast(canary, tmp_path):
+    """Fast per-tier RED canaries: each proves the server catches failure at a specific lifecycle tier.
+
+    Each canary is broken at exactly one tier; the test asserts:
+    - Overall verdict: RED (rc != 0)
+    - The intended failing tier has status "fail"
+    - Tiers BEFORE the intended failure have status "pass" (proving tier-specific detection, not
+      "fails somewhere")
+
+    These use fast recipes (custom-html-tiny and the custom-html-bkp-bad / custom-html-rst-bad
+    fixtures) and are intended as a pre-merge quick check alongside the full slow suite.
+    """
+    stages = canary.get("stages", "install,upgrade,backup,restore,custom")
+    rc, results, artifact_dir = run_recipe_ci(
+        recipe=canary["recipe"],
+        src=canary["src"],
+        ref=canary["ref"],
+        runs_dir=str(tmp_path),
+        stages=stages,
+    )
+
+    _note = f"artifact_dir={artifact_dir}"
+    _assert_red_at_tier(rc, results, canary, _note)
+
+
+def _assert_green(rc: int, results: dict | None, canary: dict, note: str) -> None:
+    """Assert a good-canary run is GREEN: rc 0, no failed tier, clean flags, named tests passed."""
+
+    # 1. Harness exit code must be 0 (GREEN).
+    assert rc == 0, f"[{canary['id']}] harness returned non-zero rc={rc} — expected GREEN. {note}"
+
+    assert (
+        results is not None
+    ), f"[{canary['id']}] results.json not written — harness may have crashed. {note}"
+
+    # 2. Install tier must have passed.
+    assert results.get("results", {}).get("install") == "pass", (
+        f"[{canary['id']}] install tier did not pass: " f"results={results.get('results')}. {note}"
+    )
+
+    # 3. No tier may have FAILED (skips are acceptable for recipes without backup or custom tests).
+    failed_tiers = [t for t, s in results.get("results", {}).items() if s == "fail"]
+    assert not failed_tiers, f"[{canary['id']}] tiers failed: {failed_tiers}. {note}"
+
+    # 4. Teardown must be clean (no leftover containers/volumes/secrets).
+    assert (
+        results.get("flags", {}).get("clean_teardown") is True
+    ), f"[{canary['id']}] clean_teardown=False — residual state left on server. {note}"
+
+    # 5. No secret values leaked into the results artifact.
+    assert (
+        results.get("flags", {}).get("no_secret_leak") is True
+    ), f"[{canary['id']}] no_secret_leak=False — a secret value appeared in results.json. {note}"
+
+    # 6. Semantic stage assertions — TEETH CHECK.
+    # Specific named tests must have run and passed in the expected stage; a removed or vacuous
+    # tier assertion drops the name from results.json and trips this check.
+    # Names are read with .get() so an entry lacking "name" cannot mask the failure with a KeyError.
+    for stage_name, test_name_substr in canary.get("stage_pass_checks", []):
+        assert stage_has_passing_test(results, stage_name, test_name_substr), (
+            f"[{canary['id']}] expected a passing test containing {test_name_substr!r} in "
+            f"stage={stage_name!r}, but none found. "
+            f"Stage tests: {[t.get('name') for t in _stage_tests(results, stage_name)]}. {note}"
+        )
+
+
+def _assert_red(rc: int, results: dict | None, canary: dict, note: str) -> None:
+    """Assert a bad-canary run is RED (false-green guard).
+
+    The PRIMARY assertion is rc != 0: a harness that returns 0 (green) for this fixture is a
+    false-green. results.json is then required, so a harness crash cannot pass for a RED verdict.
+    """
+
+    # PRIMARY: harness must return non-zero (RED).
+    assert rc != 0, (
+        f"[{canary['id']}] harness returned rc=0 (GREEN) for a KNOWN-BAD fixture — "
+        f"FALSE-GREEN detected. The harness failed to catch the broken app. {note}"
+    )
+
+    # A non-zero rc without results.json means the harness died before reporting; the named-test
+    # checks below would be skipped and the canary would pass without proving anything.
+    assert results is not None, f"[{canary['id']}] results.json missing — cannot verify RED. {note}"
+
+    # SECONDARY: the specific failing test must be present, else the guard itself was removed.
+    for stage_name, test_name_substr in canary.get("stage_fail_checks", []):
+        assert stage_has_failing_test(results, stage_name, test_name_substr), (
+            f"[{canary['id']}] expected a failing test containing {test_name_substr!r} in "
+            f"stage={stage_name!r}, but none found. "
+            f"The guard may have been removed or vacuated. "
+            f"Stage tests: {[t.get('name') for t in _stage_tests(results, stage_name)]}. {note}"
+        )
+
+
+def _assert_red_at_tier(rc: int, results: dict | None, canary: dict, note: str) -> None:
+    """Assert a per-tier RED canary: overall RED, failing_tier=fail, passing_tiers_before=pass.
+
+    Proves the server catches failure AT THE INTENDED TIER (not just "fails somewhere"), and that
+    the tiers before it still PASSED (no collateral damage from the fixture).
+    A missing results.json fails the canary: without it no tier verdict can be checked.
+    """
+    failing_tier = canary.get("failing_tier")
+    passing_before = canary.get("passing_tiers_before", [])
+
+    # PRIMARY: harness must return non-zero.
+    assert rc != 0, (
+        f"[{canary['id']}] harness returned rc=0 (GREEN) for a KNOWN-BAD fixture at tier "
+        f"{failing_tier!r} — FALSE-GREEN. {note}"
+    )
+
+    # Returning early here would let a harness crash (rc != 0, no results) pass as "RED at tier".
+    assert results is not None, f"[{canary['id']}] results.json missing — tier not verifiable. {note}"
+
+    tier_results = results.get("results", {})
+
+    # The intended failing tier must be "fail".
+    if failing_tier:
+        actual = tier_results.get(failing_tier)
+        assert actual == "fail", (
+            f"[{canary['id']}] expected tier {failing_tier!r}='fail', got {actual!r}. "
+            f"All tier results: {tier_results}. {note}"
+        )
+
+    # Tiers before the failing tier must have passed (no collateral damage from the fixture).
+    for tier in passing_before:
+        actual = tier_results.get(tier)
+        assert actual == "pass", (
+            f"[{canary['id']}] expected prior tier {tier!r}='pass' before failing at "
+            f"{failing_tier!r}, got {actual!r}. All results: {tier_results}. {note}"
+        )
+
+    # Optional: specific failing test name (for the restore-RED canary).
+    for stage_name, test_name_substr in canary.get("stage_fail_checks", []):
+        assert stage_has_failing_test(results, stage_name, test_name_substr), (
+            f"[{canary['id']}] expected a failing test containing {test_name_substr!r} in "
+            f"stage={stage_name!r}. "
+            f"Stage tests: {[t.get('name') for t in _stage_tests(results, stage_name)]}. {note}"
+        )
+
+
+def _stage_tests(results: dict, stage_name: str) -> list[dict]:
+    """Return the per-test entries recorded for ``stage_name``, or [] if the stage is absent.
+
+    Delegates to conftest.find_stage_tests so the lookup lives in one place."""
+    return _reg.find_stage_tests(results, stage_name)
-- 
2.49.0