Compare commits

...

1 Commits

Author SHA1 Message Date
73427d6e36 feat(regression): add E2E canary regression suite (tests/regression/)
Some checks failed
continuous-integration/drone/push Build is failing
Seven canaries prove both halves of the server's job:
- GREEN: good apps are reported healthy (good-simple + good-significant)
- RED: broken apps are caught at intended tier (false-green guard + 4 per-tier)

Fixtures: custom-html-bkp-bad (backup tier RED) + custom-html-rst-bad (restore tier RED).
All 7 canaries verified on live server (see STATUS-regression.md for artifacts).

Not wired to per-commit CI — run on-demand: pytest -m canary tests/regression/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 03:33:24 +00:00
10 changed files with 704 additions and 0 deletions

View File

@ -0,0 +1,19 @@
"""custom-html-bkp-bad — lifecycle ops for bad-backup/bad-restore RED canaries.
Intentionally has NO pre_backup hook: the marker is never seeded before backup,
so the backup snapshot has no ci-marker.txt. pre_restore writes "mutated", so after restore the
marker is either gone (snapshot restored) or still "mutated" (file untouched) — never "original" → test fails.
"""
from __future__ import annotations
from harness import lifecycle
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
def pre_restore(domain: str, meta: dict) -> None:
    """Overwrite the marker file with 'mutated' right before the restore step.

    The backup snapshot never contained the marker (this recipe has no pre_backup hook), so
    once restore has run the marker is either MISSING or still 'mutated' — never 'original'.
    That makes test_restore_returns_state FAIL, which is the intended restore=RED outcome.

    ``meta`` is part of the hook signature but is not used here.
    """
    shell_cmd = f"echo mutated > {MARKER_PATH}"
    lifecycle.exec_in_app(domain, ["sh", "-c", shell_cmd])

View File

@ -0,0 +1,5 @@
# custom-html-bkp-bad — regression fixture for bad-backup canary.
# This recipe is custom-html WITHOUT backupbot labels. Setting BACKUP_CAPABLE=True here forces the
# harness to run the backup tier; the recipe itself has no backupbot service, so
# `abra app backup create` produces no snapshot → test_backup_artifact fails → backup tier RED.
BACKUP_CAPABLE = True

View File

@ -0,0 +1,28 @@
"""custom-html-bkp-bad — BACKUP assertion (bad-backup RED canary).
This recipe has no ops.py::pre_backup, so ci-marker.txt is NEVER seeded before the backup.
Asserting its presence here causes backup tier RED — proving the server catches a recipe that
claims backup support but doesn't actually back up the expected data.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle # noqa: E402
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
def test_backup_captures_state(live_app):
    """Check that the marker file in the live app holds exactly 'original'.

    custom-html-bkp-bad ships no ops.py::pre_backup, so nothing seeds the marker before the
    backup. The shell command below then falls back to printing MISSING, the comparison fails,
    and the backup tier goes RED — modelling a recipe that declares backup capability but
    omits the data-seeding hook.
    """
    read_marker = f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"
    marker = lifecycle.exec_in_app(live_app, ["sh", "-c", read_marker]).strip()
    assert marker == "original", (
        f"backup did not capture the expected marker at {MARKER_PATH}: got {marker!r}. "
        "Expected 'original' (seeded by pre_backup). If the marker is 'MISSING', the pre_backup "
        "hook was not run — this is the intended failure for the bad-backup RED canary."
    )

View File

@ -0,0 +1,25 @@
"""custom-html-bkp-bad — RESTORE assertion (bad-restore RED canary).
pre_restore seeds 'mutated' to ci-marker.txt. The backup snapshot has no ci-marker.txt
(never seeded by pre_backup). After restore, the marker is either MISSING or 'mutated' —
never 'original' — so this assertion FAILS → restore tier RED.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle # noqa: E402
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
def test_restore_returns_state(live_app):
    """Check that the marker file reads 'original' after restore.

    For this fixture the check is expected to FAIL: the snapshot never held the marker, so the
    value read back is MISSING or 'mutated', which turns the restore tier RED.
    """
    read_marker = f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"
    marker = lifecycle.exec_in_app(live_app, ["sh", "-c", read_marker]).strip()
    assert marker == "original", (
        f"restore did not return the pre-mutation (backed-up) state: got {marker!r}. "
        "Expected 'original'. The backup had no marker (not seeded by pre_backup), so "
        "restore cannot recover it — this is the intended failure for the bad-restore RED canary."
    )

View File

@ -0,0 +1,15 @@
"""custom-html-rst-bad — lifecycle ops for bad-restore RED canary.
NO pre_backup hook: marker never seeded before backup → snapshot has no ci-marker.txt.
pre_restore writes "mutated". After restore, marker stays "mutated" (not in snapshot) → FAIL.
"""
from __future__ import annotations
from harness import lifecycle
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
def pre_restore(domain: str, meta: dict) -> None:
    """Write 'mutated' into the marker file before the restore step runs.

    ``meta`` is part of the hook signature but is not used here.
    """
    shell_cmd = f"echo mutated > {MARKER_PATH}"
    lifecycle.exec_in_app(domain, ["sh", "-c", shell_cmd])

View File

@ -0,0 +1,3 @@
# custom-html-rst-bad — regression fixture for bad-restore canary.
# BACKUP_CAPABLE=True forces the backup tier to run even though the recipe has no backupbot label.
BACKUP_CAPABLE = True

View File

@ -0,0 +1,23 @@
"""custom-html-rst-bad — RESTORE assertion (bad-restore RED canary).
No pre_backup → backup snapshot has no ci-marker.txt. pre_restore writes "mutated".
After restore: marker is "mutated" (restore can't recover "original" — wasn't backed up) → FAIL.
"""
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle # noqa: E402
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
def test_restore_returns_state(live_app):
    """Check that the marker file reads 'original' after restore.

    Expected to FAIL for this fixture: the marker was never backed up, so restore cannot bring
    back 'original'.
    """
    read_marker = f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"
    marker = lifecycle.exec_in_app(live_app, ["sh", "-c", read_marker]).strip()
    assert marker == "original", (
        f"restore did not return the pre-mutation (backed-up) state: got {marker!r}. "
        "Expected 'original'. The backup had no marker, so restore cannot recover it."
    )

136
tests/regression/README.md Normal file
View File

@ -0,0 +1,136 @@
# Regression canaries — E2E self-tests for the cc-ci server
A standing pytest suite that drives the **real** cc-ci lifecycle harness against pinned canary
recipes and verifies both halves of the server's job:
1. **Good canaries** — healthy apps are reported GREEN (install + upgrade + backup/restore pass).
2. **Bad canaries** — broken apps are caught RED at the intended tier; a false-green makes the regression test itself fail.
These tests run the full cold lifecycle on the live cc-ci server. They are **slow** (minutes per
canary) and **opt-in** — kept out of the per-commit fast path by the `canary` marker.
---
## How to run
Run on the cc-ci server (abra + Docker + Swarm required):
```bash
ssh cc-ci
cd /root/cc-ci # or wherever the repo is checked out
cc-ci-run python -m pytest tests/regression/ -m canary -v
```
Or a single canary:
```bash
cc-ci-run python -m pytest tests/regression/ -m canary -k good-simple -v
```
From the orchestrator:
```bash
ssh cc-ci "cd /root/cc-ci && cc-ci-run python -m pytest tests/regression/ -m canary -v"
```
---
## Canaries
| ID | Recipe | Purpose | Expected verdict |
|----|--------|---------|-----------------|
| `good-simple` | `custom-html-tiny` | Minimal static server — fast signal | GREEN |
| `good-significant` | `lasuite-docs` | Multi-service (backend + Postgres + Collabora + OIDC) | GREEN |
| `bad-false-green` | `custom-html` @ `v5-stale-docroot` | App is UP but serves wrong Content-Type — catches false-green | RED |
### Why the bad canary exists
The scariest regression is a **false-green**: the server reports PASS while the app is broken.
We already saw a fabricated full-PASS during the build. The `bad-false-green` canary pins a known-
broken fixture (`v5-stale-docroot`: nginx serves `.txt` as `application/octet-stream`). The
harness's `test_content_type_html_and_txt` catches this and returns RED (build #75 was RED for
exactly this fixture).
The regression test asserts `rc != 0`. If the harness ever wrongly returns green for this fixture,
that assert fires — false-green is caught before any merge.
---
## What each canary verifies
### Per-tier semantic assertions (the "teeth")
The tests assert MORE than the harness exit code: they check that **specific named assertions**
ran and got the expected result. This guards against a different failure mode — a tier that
nominally "passes" because the assertion was silently removed or made vacuous.
| Stage | Test name | What it proves |
|-------|-----------|---------------|
| install | `test_serving` | Generic HTTP readiness check actually ran |
| install | `test_serving_and_frontend` | Lasuite-docs frontend (SPA shell) actually loaded |
| custom | `test_content_type` | Content-type assertion actually ran and FAILED (bad canary only) |
If a tier assertion is removed: the named test disappears from `results.json` → the semantic
check fires → the regression suite catches the removal.
### Additional structural assertions (good canaries)
- `install` tier: "pass" (not fail, not skip)
- No tier is "fail" (skips acceptable for recipes without backup/custom tests)
- `flags.clean_teardown = True` (no leftover containers/volumes/secrets)
- `flags.no_secret_leak = True` (no secret value in the results artifact)
---
## Cadence policy
**Do NOT run on every commit or PR.** These are slow and resource-heavy. Run them:
- Before a **release** of the cc-ci server (after a batch of server changes).
- As a **polishing pass** or pre-merge check for significant server refactors.
- On-demand when you suspect a regression: `pytest -m canary`.
They are NOT wired to the per-commit Drone pipeline. If adding a `!testme`-style trigger for the
cc-ci repo, gate it behind a deliberate label (e.g. `run-canaries`) — not an automatic run on
every push.
---
## How to add a canary
1. Identify a recipe that is already deployable and has pinned version tags.
2. Decide the expected verdict (GREEN or RED) and which tier assertions have teeth.
3. Add an entry to `CANARIES` in `test_canaries.py`:
```python
{
"id": "good-myrecipe",
"recipe": "my-recipe",
"src": "recipe-maintainers/my-recipe",
"ref": "<pinned-sha>", # pin to a specific commit for stability
"expected_green": True,
"stage_pass_checks": [
("install", "test_serving"), # verify this named test ran and passed
],
"stage_fail_checks": [],
}
```
4. Run the canary once to confirm it passes:
`cc-ci-run python -m pytest tests/regression/ -m canary -k good-myrecipe -v`
5. Update the pin comment with the date and the recipe version it was pinned at.
---
## Pin maintenance
Canary refs are pinned to specific SHAs for stability. When a recipe publishes a new release:
1. Update the `"ref"` SHA in the canary definition (use the new main-branch HEAD).
2. Update the pin comment with the new date/version.
3. Re-run the canary to confirm GREEN before committing the pin update.
The bad canary (`v5-stale-docroot`) is a stable fixture branch — update only if the branch is
deleted. If deleted, recreate the pattern: an app that is up + passes lifecycle tiers but fails
one functional assertion.

View File

@ -0,0 +1,106 @@
"""Shared fixtures and helpers for E2E canary regression tests.
The regression tests call the real cc-ci harness (run_recipe_ci.py) as a subprocess and assert on
its outputs (exit code, results.json). They run ON the cc-ci server, not the orchestrator — abra,
Docker, and Swarm must be present.
Invoke: cc-ci-run python -m pytest tests/regression/ -m canary -v
"""
from __future__ import annotations
import json
import os
import subprocess
import sys
import time
ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def pytest_configure(config):
    """Register the ``canary`` and ``canary_fast`` markers on the pytest config."""
    marker_lines = (
        "canary: slow E2E canary test — drives the full cold CI lifecycle; run on-demand only.",
        "canary_fast: fast per-tier RED canary (still tagged canary); subset for quick pre-merge checks.",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)
def run_recipe_ci(
    recipe: str,
    src: str,
    ref: str,
    pr: str = "0",
    stages: str = "install,upgrade,backup,restore,custom",
    runs_dir: str | None = None,
    run_id_prefix: str = "regression",
    timeout: int = 3600,
) -> tuple[int, dict | None, str]:
    """Invoke run_recipe_ci.py as a subprocess with the given canary params.

    The parameters are handed to the harness through environment variables (RECIPE, REF, SRC,
    PR, STAGES, CCCI_RUN_ID, CCCI_RUNS_DIR); HOME is forced to /root. The child's stdout/stderr
    are not captured, so they stream live and a human can follow progress.

    Args:
        recipe: Recipe name (RECIPE).
        src: Recipe source path (SRC).
        ref: Git ref to test (REF); its first 12 characters become part of the run id.
        pr: Value for PR.
        stages: Comma-separated stage list (STAGES).
        runs_dir: Directory under which the run's artifact directory is expected; defaults
            to /var/lib/cc-ci-runs.
        run_id_prefix: Prefix of the generated run id.
        timeout: Seconds to wait for the harness before giving up.

    Returns:
        ``(rc, results, artifact_dir)`` — the harness exit code, the parsed results.json (or
        None when the file was not written), and the run's artifact directory.

    Raises:
        subprocess.TimeoutExpired: the harness ran longer than ``timeout`` seconds.
        json.JSONDecodeError: results.json exists but is not valid JSON.
    """
    if runs_dir is None:
        runs_dir = "/var/lib/cc-ci-runs"
    run_id = f"{run_id_prefix}-{recipe}-{ref[:12]}-{int(time.time())}"

    # Start from the caller's environment so the PLAYWRIGHT settings from the outer cc-ci-run
    # wrapper (already in os.environ if running under it) reach the harness.
    env = {
        **os.environ,
        "RECIPE": recipe,
        "REF": ref,
        "SRC": src,
        "PR": pr,
        "STAGES": stages,
        "CCCI_RUN_ID": run_id,
        "CCCI_RUNS_DIR": runs_dir,
        "HOME": "/root",
    }

    script = os.path.join(ROOT, "runner", "run_recipe_ci.py")
    # No check=True: a non-zero exit code is a legitimate (RED) outcome the caller asserts on.
    completed = subprocess.run([sys.executable, script], env=env, timeout=timeout)

    artifact_dir = os.path.join(runs_dir, run_id)
    results_path = os.path.join(artifact_dir, "results.json")
    results_data: dict | None
    try:
        # Decode explicitly as UTF-8 rather than with the locale's default encoding.
        with open(results_path, encoding="utf-8") as fh:
            results_data = json.load(fh)
    except FileNotFoundError:
        # The harness did not write results.json; callers treat None as "no report".
        results_data = None
    return completed.returncode, results_data, artifact_dir
def find_stage_tests(results: dict, stage_name: str) -> list[dict]:
    """Return the ``tests`` list of the first stage in results.json named ``stage_name``.

    An empty list is returned when no stage matches or the matching stage has no ``tests`` key.
    """
    candidates = (s for s in results.get("stages", []) if s.get("name") == stage_name)
    match = next(candidates, None)
    if match is None:
        return []
    return match.get("tests", [])
def stage_has_passing_test(results: dict, stage_name: str, test_name_substr: str) -> bool:
    """Report whether ``stage_name`` holds a test matching ``test_name_substr`` with status "pass".

    Matching is by substring on the test's ``name``.
    """
    return any(
        test_name_substr in entry.get("name", "") and entry.get("status") == "pass"
        for entry in find_stage_tests(results, stage_name)
    )
def stage_has_failing_test(results: dict, stage_name: str, test_name_substr: str) -> bool:
    """Report whether ``stage_name`` holds a failed test matching ``test_name_substr``.

    Matching is by substring on the test's ``name``; both "fail" and "error" count as failing.
    """
    failing_statuses = ("fail", "error")
    return any(
        test_name_substr in entry.get("name", "") and entry.get("status") in failing_statuses
        for entry in find_stage_tests(results, stage_name)
    )

View File

@ -0,0 +1,344 @@
"""E2E canary regression tests — the server's standing self-test suite.
Seven canaries prove both halves of the server's job:
1. GREEN canaries — good apps are reported healthy (install+upgrade+backup/restore pass).
2. RED canaries — broken apps are caught at the intended tier; a false-green makes THIS test fail.
Fast subset (@pytest.mark.canary_fast): the four per-tier RED canaries (custom-html-tiny and the
custom-html-bkp-bad / custom-html-rst-bad fixtures) — fast because these recipes deploy in seconds.
Run with `-m canary_fast` as a pre-merge quick check.
Full suite (-m canary): includes good-significant (lasuite-docs, 10-20 min).
Run: cc-ci-run python -m pytest tests/regression/ -m canary -v
Pin policy: canary refs are pinned to specific SHAs. Update only after confirming the new ref gives
the expected verdict.
"""
from __future__ import annotations
import os
import sys
import pytest
sys.path.insert(0, os.path.dirname(__file__))
import conftest as _reg # noqa: E402
run_recipe_ci = _reg.run_recipe_ci
stage_has_passing_test = _reg.stage_has_passing_test
stage_has_failing_test = _reg.stage_has_failing_test
# ---------------------------------------------------------------------------
# Canary definitions
# ---------------------------------------------------------------------------
# Good canary 1: minimal static-file server — fast signal, few deps.
_SIMPLE = {
    # Used as the pytest parametrize id and as the prefix of every assertion message.
    "id": "good-simple",
    # recipe / src / ref are forwarded unchanged to run_recipe_ci().
    "recipe": "custom-html-tiny",
    "src": "recipe-maintainers/custom-html-tiny",
    # Pin: main @ 2026-06-02 — update if the recipe publishes a new release and pin goes stale.
    "ref": "435df8fc98ef7598084fcffcd6225470eca80053",
    # True → test_canary verifies the run with _assert_green (False → _assert_red).
    "expected_green": True,
    # No "stages" key: test_canary falls back to "install,upgrade,backup,restore,custom".
    # Named tests that MUST appear with "pass" in the result — these are the semantic teeth.
    # If the generic install assertion is removed/vacated, test_serving disappears → this fails.
    # Each entry is (stage name, substring of the test name).
    "stage_pass_checks": [
        ("install", "test_serving"),
    ],
    # Not read by _assert_green; present so every canary has the same keys.
    "stage_fail_checks": [],
}
# Good canary 2: multi-service stack — backend + Postgres + Collabora WOPI + OIDC.
# Exercises real breadth. Slowest canary (~10-20 min full lifecycle).
_SIGNIFICANT = {
    "id": "good-significant",
    "recipe": "lasuite-docs",
    "src": "recipe-maintainers/lasuite-docs",
    # Pin: main @ 2026-06-02
    "ref": "290a8ad72d06232f0b3f302d976af14bef0f3c53",
    # True → test_canary verifies the run with _assert_green.
    "expected_green": True,
    # No "stages" key: test_canary falls back to "install,upgrade,backup,restore,custom".
    # (stage name, test-name substring) pairs that must show up as passing tests.
    "stage_pass_checks": [
        ("install", "test_serving_and_frontend"),
    ],
    "stage_fail_checks": [],
}
# Bad canary: app is UP + passes all lifecycle tiers but the custom functional assertion detects a
# semantic defect (wrong Content-Type for .txt files). The harness MUST report RED.
# If the harness wrongly returns green for this fixture, assert rc != 0 fails → false-green caught.
_BAD = {
    "id": "bad-false-green",
    "recipe": "custom-html",
    "src": "recipe-maintainers/custom-html",
    # Pin: v5-stale-docroot @ 71e7326 — serves .txt as application/octet-stream; build #75 was RED.
    # Recreate pattern if branch disappears: app up + passes lifecycle, fails one content assertion.
    "ref": "71e7326a99bbb69035a046fba8fa51859ca66115",
    # False → test_canary verifies the run with _assert_red.
    "expected_green": False,
    # The specific test that must have FAILED, proving the content-type assertion has teeth.
    # If the assertion is vacated and the test disappears, stage_has_failing_test() returns False
    # → the assert below fails → we detect that the guard was removed.
    # Not read by _assert_red; present so every canary has the same keys.
    "stage_pass_checks": [],
    # (stage name, test-name substring); a status of "fail" or "error" counts as failing.
    "stage_fail_checks": [
        ("custom", "test_content_type"),
    ],
}
# ---------------------------------------------------------------------------
# Per-tier RED canaries (fast subset: @pytest.mark.canary_fast)
# Prove the server catches failure at EVERY lifecycle tier — false-green at any tier is caught.
# Each uses custom-html-tiny (deploys in seconds) or custom-html (fast nginx, has backup support).
# ---------------------------------------------------------------------------
# Shared bad-image branch: deploy fails at prepull because the image doesn't exist on Docker Hub.
# Used for install-RED (STAGES=install → chaos of HEAD with bad image → install=fail)
# and upgrade-RED (STAGES=install,upgrade → prev-version install passes, upgrade chaos fails).
_BAD_IMAGE_REF = "4ae8866100563204d40435c5aba00374aa5a8ed3" # regression-bad-image @ 2026-06-02
_BAD_INSTALL = {
    "id": "bad-install",
    "recipe": "custom-html-tiny",
    "src": "recipe-maintainers/custom-html-tiny",
    "ref": _BAD_IMAGE_REF,
    "expected_green": False,
    # STAGES=install only → no upgrade tier → prev=None → chaos deploy of HEAD (bad image) → fails.
    "stages": "install",
    # Assertions: install must be the failing tier.
    # failing_tier / passing_tiers_before are read by _assert_red_at_tier: the former must be
    # "fail" in results["results"], every tier listed in the latter must be "pass".
    "failing_tier": "install",
    "passing_tiers_before": [],
    "stage_pass_checks": [],
    "stage_fail_checks": [],
}
_BAD_UPGRADE = {
    "id": "bad-upgrade",
    "recipe": "custom-html-tiny",
    "src": "recipe-maintainers/custom-html-tiny",
    "ref": _BAD_IMAGE_REF,
    "expected_green": False,
    # Explicit stage list (NOT the default — backup/restore are left out):
    # prev-version deploy (good image) → install=PASS; upgrade chaos (bad image) → FAIL.
    # "custom" is requested but its result is not asserted on by _assert_red_at_tier.
    "stages": "install,upgrade,custom",
    "failing_tier": "upgrade",
    "passing_tiers_before": ["install"],
    "stage_pass_checks": [],
    "stage_fail_checks": [],
}
_BAD_BACKUP = {
    "id": "bad-backup",
    "recipe": "custom-html-bkp-bad",
    "src": "recipe-maintainers/custom-html-bkp-bad",
    # Pin: custom-html-bkp-bad main @ 2026-06-02 — custom-html WITHOUT backupbot labels.
    # cc-ci recipe_meta sets BACKUP_CAPABLE=True → harness runs backup tier.
    # No backupbot.backup=true label → backup-bot-two finds no containers → no snapshot.
    # parse_snapshot_id returns None → test_backup_artifact fails → backup tier RED.
    # NOTE(review): the fixture's own test_backup.py asserts via test_backup_captures_state;
    # no named failing test is pinned below, so only the tier status is checked — confirm
    # which of the two tests is expected to fail.
    "ref": "b6fe99de41601f9e51bc7ea5b6072f0c3f56cdc3",
    "expected_green": False,
    # "upgrade" is requested, but only "install" is asserted to pass before the backup failure.
    "stages": "install,upgrade,backup",
    "failing_tier": "backup",
    "passing_tiers_before": ["install"],
    "stage_pass_checks": [],
    "stage_fail_checks": [],
}
_BAD_RESTORE = {
    "id": "bad-restore",
    "recipe": "custom-html-rst-bad",
    "src": "recipe-maintainers/custom-html-rst-bad",
    # Pin: custom-html-rst-bad main @ 2026-06-02 (9a73a184).
    # No pre_backup hook → backup snapshot has no ci-marker.txt.
    # pre_restore writes "mutated". After restore: marker stays "mutated" → FAIL → restore=RED.
    # install+backup PASS (no test_backup.py in cc-ci dir); upgrade=skip (no version tags).
    "ref": "9a73a184e739691bc6a621a5f1e6efc799743c5b",
    "expected_green": False,
    "stages": "install,backup,restore,custom",
    "failing_tier": "restore",
    "passing_tiers_before": ["install", "backup"],
    "stage_pass_checks": [],
    # The only fast canary that also pins a named failing test, checked by
    # _assert_red_at_tier via stage_has_failing_test().
    "stage_fail_checks": [
        ("restore", "test_restore_returns_state"),
    ],
}
# Parametrize test_canary: the two GREEN canaries plus the false-green guard.
CANARIES = [_SIMPLE, _SIGNIFICANT, _BAD]
# Parametrize test_canary_fast: one RED canary per lifecycle tier.
CANARIES_FAST = [_BAD_INSTALL, _BAD_UPGRADE, _BAD_BACKUP, _BAD_RESTORE]
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
@pytest.mark.canary
@pytest.mark.parametrize("canary", CANARIES, ids=[c["id"] for c in CANARIES])
def test_canary(canary, tmp_path):
    """Run one canary through the real harness and check the verdict it reports.

    GREEN canaries must come back healthy, with their named per-tier assertions present and
    passing (so a vacuous tier cannot hide). The RED canary must come back non-zero — if the
    harness wrongly returned green, ``assert rc != 0`` fails and the false-green is caught.
    """
    exit_code, report, run_dir = run_recipe_ci(
        recipe=canary["recipe"],
        src=canary["src"],
        ref=canary["ref"],
        runs_dir=str(tmp_path),
        stages=canary.get("stages", "install,upgrade,backup,restore,custom"),
    )
    # Carried into every assertion message so the artifact location shows up on failure.
    note = f"artifact_dir={run_dir}"
    verify = _assert_green if canary["expected_green"] else _assert_red
    verify(exit_code, report, canary, note)
@pytest.mark.canary
@pytest.mark.canary_fast
@pytest.mark.parametrize("canary", CANARIES_FAST, ids=[c["id"] for c in CANARIES_FAST])
def test_canary_fast(canary, tmp_path):
    """Run one per-tier RED canary and check that it fails at the intended lifecycle tier.

    Every canary here is broken at exactly one tier. The checks are:
      - overall verdict is RED (rc != 0);
      - the intended failing tier has status "fail";
      - the tiers BEFORE it have status "pass" (tier-specific detection, not "fails somewhere").

    These use fast recipes (custom-html-tiny deploys in seconds, custom-html is similarly fast)
    and are meant as a pre-merge quick check alongside the full slow suite.
    """
    exit_code, report, run_dir = run_recipe_ci(
        recipe=canary["recipe"],
        src=canary["src"],
        ref=canary["ref"],
        runs_dir=str(tmp_path),
        stages=canary.get("stages", "install,upgrade,backup,restore,custom"),
    )
    note = f"artifact_dir={run_dir}"
    _assert_red_at_tier(exit_code, report, canary, note)
def _assert_green(rc: int, results: dict | None, canary: dict, note: str) -> None:
"""Assert a good-canary run is GREEN with real semantic assertions."""
# 1. Harness exit code must be 0 (GREEN).
assert rc == 0, f"[{canary['id']}] harness returned non-zero rc={rc} — expected GREEN. {note}"
assert (
results is not None
), f"[{canary['id']}] results.json not written — harness may have crashed. {note}"
# 2. Install tier must have passed.
assert results.get("results", {}).get("install") == "pass", (
f"[{canary['id']}] install tier did not pass: " f"results={results.get('results')}. {note}"
)
# 3. No tier may have FAILED (skips are acceptable for recipes without backup or custom tests).
failed_tiers = [t for t, s in results.get("results", {}).items() if s == "fail"]
assert not failed_tiers, f"[{canary['id']}] tiers failed: {failed_tiers}. {note}"
# 4. Teardown must be clean (no leftover containers/volumes/secrets).
assert (
results.get("flags", {}).get("clean_teardown") is True
), f"[{canary['id']}] clean_teardown=False — residual state left on server. {note}"
# 5. No secret values leaked into the results artifact.
assert (
results.get("flags", {}).get("no_secret_leak") is True
), f"[{canary['id']}] no_secret_leak=False — a secret value appeared in results.json. {note}"
# 6. Semantic stage assertions — TEETH CHECK.
# These verify that specific named tests actually ran and passed in the expected stage.
# If a tier assertion is removed or made vacuous, the named test disappears from results.json
# and this assert fires — proving the regression suite guards against silent test removal.
for stage_name, test_name_substr in canary.get("stage_pass_checks", []):
assert stage_has_passing_test(results, stage_name, test_name_substr), (
f"[{canary['id']}] expected a passing test containing {test_name_substr!r} in "
f"stage={stage_name!r}, but none found. "
f"Stage tests: {[t['name'] for t in _stage_tests(results, stage_name)]}. {note}"
)
def _assert_red(rc: int, results: dict | None, canary: dict, note: str) -> None:
"""Assert a bad-canary run is RED (false-green guard).
The PRIMARY assertion is rc != 0. If the harness wrongly returns 0 (green) for this fixture,
this assert fails → the regression suite catches the false-green. This is the core guard.
"""
# PRIMARY: harness must return non-zero (RED).
# If the harness returns 0 for a broken app, the regression suite fails here — false-green caught.
assert rc != 0, (
f"[{canary['id']}] harness returned rc=0 (GREEN) for a KNOWN-BAD fixture — "
f"FALSE-GREEN detected. The harness failed to catch the broken app. {note}"
)
# SECONDARY: verify the specific failing test is present in results.json.
# If the content-type assertion is removed/vacuated, stage_has_failing_test() returns False here
# → this assert fires → we detect that the guard itself was removed (a meta-failure).
if results is not None:
for stage_name, test_name_substr in canary.get("stage_fail_checks", []):
assert stage_has_failing_test(results, stage_name, test_name_substr), (
f"[{canary['id']}] expected a failing test containing {test_name_substr!r} in "
f"stage={stage_name!r}, but none found. "
f"The guard may have been removed or vacuated. "
f"Stage tests: {[t['name'] for t in _stage_tests(results, stage_name)]}. {note}"
)
def _assert_red_at_tier(rc: int, results: dict | None, canary: dict, note: str) -> None:
"""Assert a per-tier RED canary: overall RED, failing_tier=fail, passing_tiers_before=pass.
Proves the server catches failure AT THE INTENDED TIER (not just "fails somewhere"), and that
the tiers before it still PASSED (no collateral damage from the fixture).
If the harness returns 0 for any of these fixtures, false-green is detected at the primary assert.
"""
failing_tier = canary.get("failing_tier")
passing_before = canary.get("passing_tiers_before", [])
# PRIMARY: harness must return non-zero.
assert rc != 0, (
f"[{canary['id']}] harness returned rc=0 (GREEN) for a KNOWN-BAD fixture at tier "
f"{failing_tier!r} — FALSE-GREEN. {note}"
)
if results is None:
return
tier_results = results.get("results", {})
# The intended failing tier must be "fail".
if failing_tier:
actual = tier_results.get(failing_tier)
assert actual == "fail", (
f"[{canary['id']}] expected tier {failing_tier!r}='fail', got {actual!r}. "
f"All tier results: {tier_results}. {note}"
)
# Tiers before the failing tier must have passed (no collateral damage from the fixture).
for tier in passing_before:
actual = tier_results.get(tier)
assert actual == "pass", (
f"[{canary['id']}] expected prior tier {tier!r}='pass' before failing at "
f"{failing_tier!r}, got {actual!r}. All results: {tier_results}. {note}"
)
# Optional: specific failing test name (for the restore-RED canary).
for stage_name, test_name_substr in canary.get("stage_fail_checks", []):
assert stage_has_failing_test(results, stage_name, test_name_substr), (
f"[{canary['id']}] expected a failing test containing {test_name_substr!r} in "
f"stage={stage_name!r}. "
f"Stage tests: {[t['name'] for t in _stage_tests(results, stage_name)]}. {note}"
)
def _stage_tests(results: dict, stage_name: str) -> list[dict]:
for stage in results.get("stages", []):
if stage.get("name") == stage_name:
return stage.get("tests", [])
return []