feat(harness): declare intentional N/A tiers + custom-html-tiny functional test

Two changes the operator asked for after noticing custom-html-tiny PR #6 has no backup/restore or functional coverage: 1) Intentional-vs-accidental N/A. A recipe can now declare recipe_meta.EXPECTED_NA = {rung: reason} to mark a tier as deliberately not applicable (e.g. a stateless static server has no backup surface). N/A still caps the level — the harness never claims a rung it did not verify — but the run is now annotated 'intentional · <reason>' instead of being indistinguishable from a forgotten test. An *undeclared* N/A on a gap-sensitive rung (backup_restore, functional) is surfaced as a 'possible coverage gap', and a stale EXPECTED_NA (declared N/A but actually exercised) is surfaced too. All non-blocking (R7): results.json gains level_cap_intent + an 'na' block, the summary card shows the clause, and the CI log prints the gap/stale warnings. (results.classify_na/cap_intent are pure + unit-tested; level.py untouched.) custom-html-tiny declares backup_restore intentionally N/A. 2) custom-html-tiny functional test: writes a random file into the served content volume (via the volume mountpoint, like install_steps.sh, since the SWS image is shell-less), asserts exact-byte round-trip + a real 404 on a missing path — proving the static-web-server actually serves the volume, not a 200-everything fallback. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 01:59:28 +00:00
parent f5a6f7196f
commit 3b0a3d14ea
6 changed files with 279 additions and 3 deletions
--- a/runner/harness/card.py
+++ b/runner/harness/card.py
@ -116,7 +116,9 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
    recipe = html.escape(str(data.get("recipe", "?")))
    version = html.escape(str(data.get("version") or data.get("ref") or ""))
    level = int(data.get("level", 0))
-    cap = html.escape(str(data.get("level_cap_reason") or ""))
+    cap_reason = str(data.get("level_cap_reason") or "")
+    cap_intent = str(data.get("level_cap_intent") or "")
+    cap = html.escape(cap_reason + (f" · {cap_intent}" if cap_intent else ""))
    color = level_color(level)
    flags = data.get("flags", {}) or {}
    flag_bits = []
--- a/runner/harness/results.py
+++ b/runner/harness/results.py
@ -200,6 +200,56 @@ def derive_rungs(
    return rungs


+# Rungs where an *undeclared* N/A is suspicious — it usually means a recipe SHOULD have this coverage
+# but nobody added it (a backup label, a functional test), i.e. an accidental gap rather than a real
+# property of the recipe. For these, an undeclared N/A is surfaced as a "possible coverage gap" unless
+# the recipe declares it intentional via recipe_meta.EXPECTED_NA. The other rungs (upgrade — only one
+# published version; integration — no SSO surface; recipe_local — no repo-local tests) are
+# *structurally* optional: an N/A there is the normal case and is not flagged.
+GAP_SENSITIVE_RUNGS = ("backup_restore", "functional")
+
+
+def classify_na(rungs: dict[str, str], expected_na: dict | None) -> dict:
+    """Distinguish *intentionally* N/A rungs from *accidentally* missing ones (operator request).
+
+    A recipe declares intentional N/A in `recipe_meta.EXPECTED_NA = {rung: reason}`. N/A always caps
+    the level either way (the harness never inflates — a rung that wasn't verified wasn't verified);
+    this only EXPLAINS the cap so a reviewer can tell "this recipe legitimately has no backup surface"
+    from "someone forgot to add the backup test". Returns:
+      { "rungs": {rung: {"intent": "declared"|"undeclared", "reason": str}},  # one per N/A rung
+        "gaps": [rung, ...],            # gap-sensitive rungs that are N/A and NOT declared
+        "stale_declared": [rung, ...] } # rungs declared N/A but actually exercised (stale opt-out)
+    """
+    expected = {str(k): str(v) for k, v in (expected_na or {}).items()}
+    na: dict[str, dict] = {}
+    for rung, st in rungs.items():
+        if st != "na":
+            continue
+        if rung in expected:
+            na[rung] = {"intent": "declared", "reason": expected[rung]}
+        else:
+            na[rung] = {"intent": "undeclared", "reason": ""}
+    gaps = [r for r in GAP_SENSITIVE_RUNGS if na.get(r, {}).get("intent") == "undeclared"]
+    stale = sorted(r for r in expected if rungs.get(r) not in (None, "na"))
+    return {"rungs": na, "gaps": gaps, "stale_declared": stale}
+
+
+def cap_intent(rungs: dict[str, str], level: int, cap_reason: str, na_info: dict) -> str:
+    """A short clause explaining the level cap when the capping rung is N/A: the declared reason if
+    intentional, a 'possible coverage gap' note if it's an undeclared gap-sensitive rung, else ''."""
+    if not cap_reason:
+        return ""
+    capped = level_mod.RUNGS[level] if 0 <= level < len(level_mod.RUNGS) else None
+    if not capped or rungs.get(capped) != "na":
+        return ""
+    entry = na_info["rungs"].get(capped, {})
+    if entry.get("intent") == "declared":
+        return f"intentional · {entry['reason']}"
+    if capped in GAP_SENSITIVE_RUNGS:
+        return "undeclared N/A — possible coverage gap (add a test or declare EXPECTED_NA)"
+    return ""
+
+
 def build_results(
    *,
    recipe: str,
@ -217,9 +267,12 @@ def build_results(
    finished_ts: float | None,
    screenshot: str | None = None,
    summary_card: str | None = None,
+    expected_na: dict | None = None,
 ) -> dict:
    """Assemble the full results.json dict (no I/O). `finished_ts` is passed in (the orchestrator
-    stamps it) so this stays pure and deterministic for unit tests."""
+    stamps it) so this stays pure and deterministic for unit tests. `expected_na` is the recipe's
+    declared intentional-N/A map (recipe_meta.EXPECTED_NA) used to distinguish a deliberate skip from
+    accidentally-missing coverage."""
    stages = collect_stages(records)
    has_custom = any(r["tier"] == "custom" for r in records)
    rungs = derive_rungs(
@ -233,6 +286,8 @@ def build_results(
        repo_local_passed=_repo_local_passed(records),
    )
    lvl, cap_reason = level_mod.compute_level(rungs)
+    na_info = classify_na(rungs, expected_na)
+    intent = cap_intent(rungs, lvl, cap_reason, na_info)
    return {
        "schema": 1,
        "run_id": run_id(),
@ -243,7 +298,9 @@ def build_results(
        "finished": finished_ts,
        "level": lvl,
        "level_cap_reason": cap_reason,
+        "level_cap_intent": intent,
        "rungs": rungs,
+        "na": na_info,
        "stages": stages,
        "results": results,
        "flags": {