feat(harness): declare intentional N/A tiers + custom-html-tiny functional test

Two changes the operator asked for after noticing custom-html-tiny PR #6 has no backup/restore or functional coverage: 1) Intentional-vs-accidental N/A. A recipe can now declare recipe_meta.EXPECTED_NA = {rung: reason} to mark a tier as deliberately not applicable (e.g. a stateless static server has no backup surface). N/A still caps the level — the harness never claims a rung it did not verify — but the run is now annotated 'intentional · <reason>' instead of being indistinguishable from a forgotten test. An *undeclared* N/A on a gap-sensitive rung (backup_restore, functional) is surfaced as a 'possible coverage gap', and a stale EXPECTED_NA (declared N/A but actually exercised) is surfaced too. All non-blocking (R7): results.json gains level_cap_intent + an 'na' block, the summary card shows the clause, and the CI log prints the gap/stale warnings. (results.classify_na/cap_intent are pure + unit-tested; level.py untouched.) custom-html-tiny declares backup_restore intentionally N/A. 2) custom-html-tiny functional test: writes a random file into the served content volume (via the volume mountpoint, like install_steps.sh, since the SWS image is shell-less), asserts exact-byte round-trip + a real 404 on a missing path — proving the static-web-server actually serves the volume, not a 200-everything fallback. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 01:59:28 +00:00
parent f5a6f7196f
commit 3b0a3d14ea
6 changed files with 279 additions and 3 deletions
--- a/runner/harness/card.py
+++ b/runner/harness/card.py
@ -116,7 +116,9 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
    recipe = html.escape(str(data.get("recipe", "?")))
    version = html.escape(str(data.get("version") or data.get("ref") or ""))
    level = int(data.get("level", 0))
-    cap = html.escape(str(data.get("level_cap_reason") or ""))
+    cap_reason = str(data.get("level_cap_reason") or "")
+    cap_intent = str(data.get("level_cap_intent") or "")
+    cap = html.escape(cap_reason + (f" · {cap_intent}" if cap_intent else ""))
    color = level_color(level)
    flags = data.get("flags", {}) or {}
    flag_bits = []
--- a/runner/harness/results.py
+++ b/runner/harness/results.py
@ -200,6 +200,56 @@ def derive_rungs(
    return rungs


+# Rungs where an *undeclared* N/A is suspicious — it usually means a recipe SHOULD have this coverage
+# but nobody added it (a backup label, a functional test), i.e. an accidental gap rather than a real
+# property of the recipe. For these, an undeclared N/A is surfaced as a "possible coverage gap" unless
+# the recipe declares it intentional via recipe_meta.EXPECTED_NA. The other rungs (upgrade — only one
+# published version; integration — no SSO surface; recipe_local — no repo-local tests) are
+# *structurally* optional: an N/A there is the normal case and is not flagged.
+GAP_SENSITIVE_RUNGS = ("backup_restore", "functional")
+
+
+def classify_na(rungs: dict[str, str], expected_na: dict | None) -> dict:
+    """Distinguish *intentionally* N/A rungs from *accidentally* missing ones (operator request).
+
+    A recipe declares intentional N/A in `recipe_meta.EXPECTED_NA = {rung: reason}`. N/A always caps
+    the level either way (the harness never inflates — a rung that wasn't verified wasn't verified);
+    this only EXPLAINS the cap so a reviewer can tell "this recipe legitimately has no backup surface"
+    from "someone forgot to add the backup test". Returns:
+      { "rungs": {rung: {"intent": "declared"|"undeclared", "reason": str}},  # one per N/A rung
+        "gaps": [rung, ...],            # gap-sensitive rungs that are N/A and NOT declared
+        "stale_declared": [rung, ...] } # rungs declared N/A but actually exercised (stale opt-out)
+    """
+    expected = {str(k): str(v) for k, v in (expected_na or {}).items()}
+    na: dict[str, dict] = {}
+    for rung, st in rungs.items():
+        if st != "na":
+            continue
+        if rung in expected:
+            na[rung] = {"intent": "declared", "reason": expected[rung]}
+        else:
+            na[rung] = {"intent": "undeclared", "reason": ""}
+    gaps = [r for r in GAP_SENSITIVE_RUNGS if na.get(r, {}).get("intent") == "undeclared"]
+    stale = sorted(r for r in expected if rungs.get(r) not in (None, "na"))
+    return {"rungs": na, "gaps": gaps, "stale_declared": stale}
+
+
+def cap_intent(rungs: dict[str, str], level: int, cap_reason: str, na_info: dict) -> str:
+    """A short clause explaining the level cap when the capping rung is N/A: the declared reason if
+    intentional, a 'possible coverage gap' note if it's an undeclared gap-sensitive rung, else ''."""
+    if not cap_reason:
+        return ""
+    capped = level_mod.RUNGS[level] if 0 <= level < len(level_mod.RUNGS) else None
+    if not capped or rungs.get(capped) != "na":
+        return ""
+    entry = na_info["rungs"].get(capped, {})
+    if entry.get("intent") == "declared":
+        return f"intentional · {entry['reason']}"
+    if capped in GAP_SENSITIVE_RUNGS:
+        return "undeclared N/A — possible coverage gap (add a test or declare EXPECTED_NA)"
+    return ""
+
+
 def build_results(
    *,
    recipe: str,
@ -217,9 +267,12 @@ def build_results(
    finished_ts: float | None,
    screenshot: str | None = None,
    summary_card: str | None = None,
+    expected_na: dict | None = None,
 ) -> dict:
    """Assemble the full results.json dict (no I/O). `finished_ts` is passed in (the orchestrator
-    stamps it) so this stays pure and deterministic for unit tests."""
+    stamps it) so this stays pure and deterministic for unit tests. `expected_na` is the recipe's
+    declared intentional-N/A map (recipe_meta.EXPECTED_NA) used to distinguish a deliberate skip from
+    accidentally-missing coverage."""
    stages = collect_stages(records)
    has_custom = any(r["tier"] == "custom" for r in records)
    rungs = derive_rungs(
@ -233,6 +286,8 @@ def build_results(
        repo_local_passed=_repo_local_passed(records),
    )
    lvl, cap_reason = level_mod.compute_level(rungs)
+    na_info = classify_na(rungs, expected_na)
+    intent = cap_intent(rungs, lvl, cap_reason, na_info)
    return {
        "schema": 1,
        "run_id": run_id(),
@ -243,7 +298,9 @@ def build_results(
        "finished": finished_ts,
        "level": lvl,
        "level_cap_reason": cap_reason,
+        "level_cap_intent": intent,
        "rungs": rungs,
+        "na": na_info,
        "stages": stages,
        "results": results,
        "flags": {
--- a/runner/run_recipe_ci.py
+++ b/runner/run_recipe_ci.py
@ -200,6 +200,7 @@ def _load_meta(recipe: str) -> dict:
        for k in list(meta) + [
            "BACKUP_CAPABLE",
            "SKIP_GENERIC",
+            "EXPECTED_NA",
            "OIDC_AT_INSTALL",
            "READY_PROBE",
            "UPGRADE_BASE_VERSION",
@ -1241,6 +1242,7 @@ def main() -> int:
            no_secret_leak=True,  # narrowed below by an actual scan of the serialised artifact
            screenshot=screenshot_rel,  # Phase 3 U1 (R4): relative PNG name iff capture succeeded
            finished_ts=time.time(),
+            expected_na=meta.get("EXPECTED_NA"),  # declared intentional-N/A map (recipe_meta)
        )
        # Real (if narrow) leak check: no known infra-secret value may appear in the artifact (R7).
        blob = json.dumps(data)
@ -1252,11 +1254,29 @@ def main() -> int:
                file=sys.stderr,
            )
        path = results_mod.write_results(data)
+        intent = data.get("level_cap_intent") or ""
        print(
            f"results.json written: {path} (level={data['level']}"
-            f"{' — ' + data['level_cap_reason'] if data['level_cap_reason'] else ''})",
+            f"{' — ' + data['level_cap_reason'] if data['level_cap_reason'] else ''}"
+            f"{' [' + intent + ']' if intent else ''})",
            flush=True,
        )
+        # Surface the intentional-vs-accidental N/A signal in the CI log (non-blocking, R7): a
+        # gap-sensitive rung that is N/A but undeclared is a possible coverage hole; a stale
+        # EXPECTED_NA declares a tier N/A that actually ran.
+        na = data.get("na", {})
+        for rung in na.get("gaps", []):
+            print(
+                f"⚠ coverage: rung '{rung}' is N/A but not declared intentional — add a test or "
+                f"declare it in tests/{recipe}/recipe_meta.py EXPECTED_NA = {{'{rung}': '<why>'}}.",
+                flush=True,
+            )
+        for rung in na.get("stale_declared", []):
+            print(
+                f"⚠ stale EXPECTED_NA: rung '{rung}' is declared N/A but was actually exercised "
+                f"(status={data['rungs'].get(rung)}) — remove it from recipe_meta.EXPECTED_NA.",
+                flush=True,
+            )
    except Exception as e:  # noqa: BLE001 — results assembly is cosmetic; never fail a run on it (R7)
        print(
            f"!! results.json assembly failed (non-fatal, verdict unaffected): {_scrub(str(e))}",
--- a/tests/custom-html-tiny/functional/test_serves_content.py
+++ b/tests/custom-html-tiny/functional/test_serves_content.py
@ -0,0 +1,87 @@
+"""custom-html-tiny — recipe-specific functional test (static-web-server).
+
+Proves the deployed static-web-server is *actually serving files from its `content` volume* with real
+file-server semantics, not merely returning 200 from a Traefik fallback or a generic stub:
+
+  1. exact-byte round-trip — write a uniquely-named file with random content into the served volume,
+     fetch it over HTTPS, and assert the bytes come back verbatim. Non-vacuous: the content is random
+     per run, so only a server that reads this file off the volume can pass.
+  2. real 404 — a random non-existent path returns 404, proving directory/file semantics (a
+     200-everything stub or mis-routed host would not 404).
+
+The recipe's image (joseluisq/static-web-server) is shell-less (scratch-based) and its content volume
+is seeded via the install_steps.sh host-mountpoint mechanism — so this test writes its probe file the
+same way (resolve the swarm volume's mountpoint with `docker volume inspect`, write directly) rather
+than `docker exec`-ing in a container that has no shell.
+
+Runs in the custom tier against the shared post-install deployment (the `live_app` fixture is its
+per-run domain). Mirrors install_steps.sh: the app's content volume is named `<stack>_content`, where
+`stack` is the domain with dots replaced by underscores; HTTP_SUBDIR is empty, so the volume root is
+served at `/`.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import os
+import ssl
+import subprocess
+import urllib.error
+import urllib.request
+import uuid
+
+
+def _served_dir(domain: str) -> str:
+    """Host mountpoint of the app's served `content` volume (same naming as install_steps.sh)."""
+    vol = f"{domain.replace('.', '_')}_content"
+    out = subprocess.run(
+        ["docker", "volume", "inspect", vol, "--format", "{{.Mountpoint}}"],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    mountpoint = out.stdout.strip()
+    assert mountpoint, f"could not resolve mountpoint for volume {vol!r}"
+    return mountpoint
+
+
+def _get(url: str) -> tuple[int, bytes]:
+    """GET the URL; return (status, body). A 4xx/5xx is returned, not raised (we assert on the code).
+    TLS verification is relaxed: the served wildcard cert is validated separately by the infra check;
+    here we care only about the app's response."""
+    ctx = ssl.create_default_context()
+    ctx.check_hostname = False
+    ctx.verify_mode = ssl.CERT_NONE
+    try:
+        with urllib.request.urlopen(url, timeout=20, context=ctx) as resp:
+            return resp.status, resp.read()
+    except urllib.error.HTTPError as e:
+        return e.code, e.read()
+
+
+def test_static_file_roundtrip_and_404(live_app):
+    """Write a random file into the served volume → fetch it → bytes match; and a missing path 404s."""
+    served = _served_dir(live_app)
+    token = uuid.uuid4().hex
+    name = f"ccci-probe-{token}.txt"
+    body = f"cc-ci-functional-{token}\n".encode()
+    path = os.path.join(served, name)
+    with open(path, "wb") as fh:
+        fh.write(body)
+    try:
+        status, got = _get(f"https://{live_app}/{name}")
+        assert status == 200, f"served probe file returned {status} (expected 200)"
+        assert got == body, (
+            f"content round-trip mismatch: served {got!r}, wrote {body!r} "
+            "(static-web-server not serving the content volume?)"
+        )
+
+        # A random non-existent path must 404 — proves real static-file semantics, distinguishing a
+        # working server from a 200-everything stub or a mis-routed Traefik fallback.
+        miss_status, _ = _get(f"https://{live_app}/ccci-missing-{uuid.uuid4().hex}.txt")
+        assert miss_status == 404, (
+            f"missing path returned {miss_status} (expected 404 — generic 200-returner / mis-route?)"
+        )
+    finally:
+        with contextlib.suppress(OSError):
+            os.remove(path)
--- a/tests/custom-html-tiny/recipe_meta.py
+++ b/tests/custom-html-tiny/recipe_meta.py
@ -3,3 +3,15 @@
 # (DG5) is detected quickly instead of waiting the default 300s HTTP timeout.
 DEPLOY_TIMEOUT = 120
 HTTP_TIMEOUT = 90
+
+# Intentionally-N/A tiers (reviewed opt-out, NOT a coverage gap). custom-html-tiny is a stateless
+# static-web-server: it serves an ephemeral `content` volume that the harness seeds at deploy time
+# (install_steps.sh) and holds no persistent or user data, so there is nothing to back up or restore.
+# The recipe therefore declares no `backupbot.backup` label and the L3 backup/restore rung is N/A.
+# Declaring it here marks that N/A as deliberate, so the run is annotated "intentional" instead of
+# being flagged as a possible missing-coverage gap. (N/A still caps the level — the harness never
+# claims a rung it did not verify; this only explains *why* the cap is expected.)
+EXPECTED_NA = {
+    "backup_restore": "stateless static file server: serves an ephemeral content volume seeded at "
+    "deploy, with no persistent/user data to back up or restore (no backupbot.backup label)",
+}
--- a/tests/unit/test_results.py
+++ b/tests/unit/test_results.py
@ -257,6 +257,104 @@ def test_build_results_capped_at_L1_on_upgrade_fail(tmp_path):
    assert "L2" in data["level_cap_reason"]


+# ---- classify_na / cap_intent: intentional-vs-accidental N/A (operator request) ----
+
+
+def _rungs(**kw):
+    base = {
+        "install": "pass",
+        "upgrade": "pass",
+        "backup_restore": "pass",
+        "functional": "pass",
+        "integration": "na",
+        "recipe_local": "na",
+    }
+    base.update(kw)
+    return base
+
+
+def test_classify_na_declared_vs_undeclared():
+    rungs = _rungs(backup_restore="na", functional="na")
+    info = R.classify_na(rungs, {"backup_restore": "stateless static server"})
+    # backup_restore is declared intentional; functional is an undeclared gap-sensitive N/A.
+    assert info["rungs"]["backup_restore"] == {
+        "intent": "declared",
+        "reason": "stateless static server",
+    }
+    assert info["rungs"]["functional"]["intent"] == "undeclared"
+    assert info["gaps"] == ["functional"]  # backup_restore declared → not a gap
+    assert info["stale_declared"] == []
+    # structurally-optional N/A (integration, recipe_local) are recorded but never flagged as gaps.
+    assert info["rungs"]["integration"]["intent"] == "undeclared"
+    assert "integration" not in info["gaps"]
+
+
+def test_classify_na_stale_declaration():
+    # backup_restore actually ran (pass) but is declared N/A → stale opt-out, surfaced.
+    rungs = _rungs(backup_restore="pass")
+    info = R.classify_na(rungs, {"backup_restore": "stale reason"})
+    assert info["stale_declared"] == ["backup_restore"]
+    assert "backup_restore" not in info["rungs"]  # not N/A, so not in the per-rung N/A map
+
+
+def test_cap_intent_declared_explains_cap():
+    # install+upgrade pass, backup_restore declared-N/A → caps at L2 with an intentional clause.
+    rungs = _rungs(backup_restore="na")
+    info = R.classify_na(rungs, {"backup_restore": "no persistent data"})
+    intent = R.cap_intent(rungs, 2, "L3 backup/restore (data integrity) N/A", info)
+    assert intent == "intentional · no persistent data"
+
+
+def test_cap_intent_undeclared_gap():
+    rungs = _rungs(backup_restore="na")
+    info = R.classify_na(rungs, None)
+    intent = R.cap_intent(rungs, 2, "L3 backup/restore (data integrity) N/A", info)
+    assert "possible coverage gap" in intent
+
+
+def test_cap_intent_blank_when_not_capped_on_na():
+    rungs = _rungs()  # full clean climb, capped only at integration (na, structurally optional)
+    info = R.classify_na(rungs, None)
+    # capping rung is integration (level 4) — structurally optional, so no intent clause.
+    assert R.cap_intent(rungs, 4, "L5 integration N/A", info) == ""
+    # and no cap at all → blank.
+    assert R.cap_intent(rungs, 6, "", info) == ""
+
+
+def test_build_results_threads_expected_na(tmp_path):
+    recs = [
+        {
+            "tier": "install",
+            "source": "generic",
+            "file": "g/test_install.py",
+            "rc": 0,
+            "junit": _write(tmp_path, "i.xml", JUNIT_PASS),
+        }
+    ]
+    data = R.build_results(
+        recipe="custom-html-tiny",
+        version="1.1.0",
+        pr="0",
+        ref=None,
+        records=recs,
+        results=_results(backup="skip", restore="skip", custom="skip"),
+        backup_capable=False,  # no backupbot label → backup_restore N/A
+        declared=[],
+        deps_ready=True,
+        sso_unverified=False,
+        clean_teardown=True,
+        no_secret_leak=True,
+        finished_ts=0.0,
+        expected_na={"backup_restore": "stateless static file server"},
+    )
+    # N/A still caps at L2 (never inflates), but now annotated intentional rather than flagged.
+    assert data["level"] == 2
+    assert "L3" in data["level_cap_reason"]
+    assert data["level_cap_intent"] == "intentional · stateless static file server"
+    assert data["na"]["rungs"]["backup_restore"]["intent"] == "declared"
+    assert data["na"]["gaps"] == []
+
+
 def test_write_results_roundtrip(tmp_path):
    data = {"run_id": "42", "level": 3, "stages": []}
    path = R.write_results(data, runs_dir_override=str(tmp_path))