refactor: simplify to a list of intentionally-skipped rungs

Per operator: drop the gap-sensitivity / cap-intent-clause / stale-detection machinery. Model is now dead simple — recipe_meta.EXPECTED_NA = {rung: reason} lists the rungs a recipe intentionally skips; ANY rung skipped (N/A) and not in that list is unintentional. results.json: replace the 'na' block + level_cap_intent with skips: { intentional: {rung: reason}, unintentional: [rung] } plus level_cap_rung (which rung capped). Badge/card derive intentional-vs- unintentional from whether the capping rung is in the intentional list. Skips still cap the level (never inflate). custom-html-tiny lists all three rungs it intentionally skips (backup_restore, integration, recipe_local). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 02:36:53 +00:00
parent d733e2c4ca
commit b3ab68a9dd
6 changed files with 105 additions and 146 deletions
--- a/runner/harness/card.py
+++ b/runner/harness/card.py
@ -79,27 +79,27 @@ def render_badge_svg(label: str, message: str, color: str) -> str:
    )


-# Third-segment colours for the level badge: amber = an UNEXPECTED skip (undeclared gap-sensitive
-# N/A — likely missing coverage) capped the climb; muted = an EXPECTED skip (declared intentional
-# N/A — reviewed, nothing to fix). Font-safe text labels (no emoji) so the SVG renders anywhere.
+# Third-segment colours for the level badge: amber = an UNINTENTIONAL skip (a rung skipped but not
+# in the recipe's intentional list — likely missing coverage) capped the climb; muted = an
+# INTENTIONAL skip (declared in recipe_meta.EXPECTED_NA — nothing to fix). Font-safe text labels
+# (no emoji) so the SVG renders anywhere.
 GAP_COLOR = "#d29922"
 EXPECT_COLOR = "#6e7681"


-def level_badge_svg(level: int, cap_reason: str = "", cap_intent: str = "") -> str:
+def level_badge_svg(level: int, cap_reason: str = "", cap_skip: str = "") -> str:
    """Per-recipe/-run LEVEL badge: 'cc-ci | level N' coloured by level (R6), with a THIRD segment
-    that differentiates *why* the climb stopped when an N/A capped it:
-      - undeclared gap-sensitive N/A (an UNEXPECTED skip — likely missing coverage): amber 'gap?'.
-      - declared intentional N/A (an EXPECTED skip — reviewed, nothing to fix): muted 'expected'.
-      - clean cap / full climb / a real failure: no third segment (the level + card carry it).
-    Derived from `cap_intent` (results.level_cap_intent) so the badge never inflates — it only
-    annotates the cap the level already reflects."""
+    that differentiates *why* the climb stopped when a SKIP capped it (`cap_skip`):
+      - "unintentional" (a rung skipped but not in the recipe's intentional list): amber 'gap?'.
+      - "intentional"   (a skip declared in recipe_meta.EXPECTED_NA): muted 'expected'.
+      - "" (clean cap / full climb / a real failure): no third segment (the level + card carry it).
+    The badge never inflates — it only annotates the cap the level already reflects."""
    label, msg = "cc-ci", f"level {int(level)}"
    lw, mw = _text_width(label), _text_width(msg)
    third: tuple[str, str] | None = None
-    if cap_intent.startswith("undeclared"):
+    if cap_skip == "unintentional":
        third = ("gap?", GAP_COLOR)
-    elif cap_intent.startswith("intentional"):
+    elif cap_skip == "intentional":
        third = ("expected", EXPECT_COLOR)
    if third is None:
        return render_badge_svg(label, msg, level_color(level))
@ -151,8 +151,15 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
    version = html.escape(str(data.get("version") or data.get("ref") or ""))
    level = int(data.get("level", 0))
    cap_reason = str(data.get("level_cap_reason") or "")
-    cap_intent = str(data.get("level_cap_intent") or "")
-    cap = html.escape(cap_reason + (f" · {cap_intent}" if cap_intent else ""))
+    # Annotate the cap line by whether the capping rung was an intentional skip (declared, with its
+    # reason) or an unintentional one (skipped but not declared).
+    capped = data.get("level_cap_rung")
+    sk = data.get("skips", {}) or {}
+    if capped and capped in (sk.get("intentional") or {}):
+        cap_reason += f" · intentional: {sk['intentional'][capped]}"
+    elif capped and capped in (sk.get("unintentional") or []):
+        cap_reason += " · unintentional skip (no EXPECTED_NA — add a test or declare it)"
+    cap = html.escape(cap_reason)
    color = level_color(level)
    flags = data.get("flags", {}) or {}
    flag_bits = []
--- a/runner/harness/results.py
+++ b/runner/harness/results.py
@ -2,7 +2,14 @@

 Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
  { recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
-    level, level_cap_reason, rungs, flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
+    level, level_cap_reason, level_cap_rung, rungs,
+    skips:{intentional:{rung:reason}, unintentional:[rung]},
+    flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
+
+`skips` splits the N/A (skipped) rungs by a simple rule: a skip is INTENTIONAL iff the recipe lists
+it (with a reason) in `recipe_meta.EXPECTED_NA = {rung: reason}`; any rung skipped but not listed is
+UNINTENTIONAL (a coverage gap to fill or declare). Skips still cap the level either way — the harness
+never claims a rung it did not verify; this only labels *why* a skip happened.

 The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
 parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
@ -200,54 +207,23 @@ def derive_rungs(
    return rungs


-# Rungs where an *undeclared* N/A is suspicious — it usually means a recipe SHOULD have this coverage
-# but nobody added it (a backup label, a functional test), i.e. an accidental gap rather than a real
-# property of the recipe. For these, an undeclared N/A is surfaced as a "possible coverage gap" unless
-# the recipe declares it intentional via recipe_meta.EXPECTED_NA. The other rungs (upgrade — only one
-# published version; integration — no SSO surface; recipe_local — no repo-local tests) are
-# *structurally* optional: an N/A there is the normal case and is not flagged.
-GAP_SENSITIVE_RUNGS = ("backup_restore", "functional")
+def skips(rungs: dict[str, str], expected_na: dict | None) -> dict:
+    """Split the SKIPPED (N/A) rungs into intentional vs unintentional (operator model).

-
-def classify_na(rungs: dict[str, str], expected_na: dict | None) -> dict:
-    """Distinguish *intentionally* N/A rungs from *accidentally* missing ones (operator request).
-
-    A recipe declares intentional N/A in `recipe_meta.EXPECTED_NA = {rung: reason}`. N/A always caps
-    the level either way (the harness never inflates — a rung that wasn't verified wasn't verified);
-    this only EXPLAINS the cap so a reviewer can tell "this recipe legitimately has no backup surface"
-    from "someone forgot to add the backup test". Returns:
-      { "rungs": {rung: {"intent": "declared"|"undeclared", "reason": str}},  # one per N/A rung
-        "gaps": [rung, ...],            # gap-sensitive rungs that are N/A and NOT declared
-        "stale_declared": [rung, ...] } # rungs declared N/A but actually exercised (stale opt-out)
+    A recipe lists the rungs it intentionally skips, each with a reason, in
+    `recipe_meta.EXPECTED_NA = {rung: reason}`. The rule is dead simple: a skipped rung is
+    **intentional** iff it is in that list; any rung that is skipped and NOT in the list is
+    **unintentional** (a coverage gap someone should either fill or declare). N/A still caps the
+    level either way — the harness never claims a rung it did not verify — this only labels *why* a
+    skip happened. Returns:
+      { "intentional": {rung: reason, ...},   # skipped AND declared in EXPECTED_NA
+        "unintentional": [rung, ...] }         # skipped but NOT declared
    """
    expected = {str(k): str(v) for k, v in (expected_na or {}).items()}
-    na: dict[str, dict] = {}
-    for rung, st in rungs.items():
-        if st != "na":
-            continue
-        if rung in expected:
-            na[rung] = {"intent": "declared", "reason": expected[rung]}
-        else:
-            na[rung] = {"intent": "undeclared", "reason": ""}
-    gaps = [r for r in GAP_SENSITIVE_RUNGS if na.get(r, {}).get("intent") == "undeclared"]
-    stale = sorted(r for r in expected if rungs.get(r) not in (None, "na"))
-    return {"rungs": na, "gaps": gaps, "stale_declared": stale}
-
-
-def cap_intent(rungs: dict[str, str], level: int, cap_reason: str, na_info: dict) -> str:
-    """A short clause explaining the level cap when the capping rung is N/A: the declared reason if
-    intentional, a 'possible coverage gap' note if it's an undeclared gap-sensitive rung, else ''."""
-    if not cap_reason:
-        return ""
-    capped = level_mod.RUNGS[level] if 0 <= level < len(level_mod.RUNGS) else None
-    if not capped or rungs.get(capped) != "na":
-        return ""
-    entry = na_info["rungs"].get(capped, {})
-    if entry.get("intent") == "declared":
-        return f"intentional · {entry['reason']}"
-    if capped in GAP_SENSITIVE_RUNGS:
-        return "undeclared N/A — possible coverage gap (add a test or declare EXPECTED_NA)"
-    return ""
+    na = [r for r, st in rungs.items() if st == "na"]
+    intentional = {r: expected[r] for r in na if r in expected}
+    unintentional = sorted(r for r in na if r not in expected)
+    return {"intentional": intentional, "unintentional": unintentional}


 def build_results(
@ -286,8 +262,9 @@ def build_results(
        repo_local_passed=_repo_local_passed(records),
    )
    lvl, cap_reason = level_mod.compute_level(rungs)
-    na_info = classify_na(rungs, expected_na)
-    intent = cap_intent(rungs, lvl, cap_reason, na_info)
+    # The rung that capped the climb (lowest non-pass), or None on a full climb — lets a consumer
+    # (card/badge) tell whether the cap was an intentional skip, an unintentional one, or a failure.
+    capped = level_mod.RUNGS[lvl] if cap_reason else None
    return {
        "schema": 1,
        "run_id": run_id(),
@ -298,9 +275,9 @@ def build_results(
        "finished": finished_ts,
        "level": lvl,
        "level_cap_reason": cap_reason,
-        "level_cap_intent": intent,
+        "level_cap_rung": capped,
        "rungs": rungs,
-        "na": na_info,
+        "skips": skips(rungs, expected_na),
        "stages": stages,
        "results": results,
        "flags": {
--- a/runner/run_recipe_ci.py
+++ b/runner/run_recipe_ci.py
@ -1254,27 +1254,18 @@ def main() -> int:
                file=sys.stderr,
            )
        path = results_mod.write_results(data)
-        intent = data.get("level_cap_intent") or ""
        print(
            f"results.json written: {path} (level={data['level']}"
-            f"{' — ' + data['level_cap_reason'] if data['level_cap_reason'] else ''}"
-            f"{' [' + intent + ']' if intent else ''})",
+            f"{' — ' + data['level_cap_reason'] if data['level_cap_reason'] else ''})",
            flush=True,
        )
-        # Surface the intentional-vs-accidental N/A signal in the CI log (non-blocking, R7): a
-        # gap-sensitive rung that is N/A but undeclared is a possible coverage hole; a stale
-        # EXPECTED_NA declares a tier N/A that actually ran.
-        na = data.get("na", {})
-        for rung in na.get("gaps", []):
+        # Surface UNINTENTIONAL skips in the CI log (non-blocking, R7): a rung that was skipped (N/A)
+        # but is not in the recipe's intentional list — either add the missing coverage or declare it.
+        for rung in data.get("skips", {}).get("unintentional", []):
            print(
-                f"⚠ coverage: rung '{rung}' is N/A but not declared intentional — add a test or "
-                f"declare it in tests/{recipe}/recipe_meta.py EXPECTED_NA = {{'{rung}': '<why>'}}.",
-                flush=True,
-            )
-        for rung in na.get("stale_declared", []):
-            print(
-                f"⚠ stale EXPECTED_NA: rung '{rung}' is declared N/A but was actually exercised "
-                f"(status={data['rungs'].get(rung)}) — remove it from recipe_meta.EXPECTED_NA.",
+                f"⚠ coverage: rung '{rung}' was skipped (N/A) but is not declared intentional — add "
+                f"the missing test/label, or list it in tests/{recipe}/recipe_meta.py "
+                f"EXPECTED_NA = {{'{rung}': '<why>'}}.",
                flush=True,
            )
    except Exception as e:  # noqa: BLE001 — results assembly is cosmetic; never fail a run on it (R7)
@ -1295,12 +1286,17 @@ def main() -> int:
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot")))
            png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png"))
+            capped = data.get("level_cap_rung")
+            sk = data.get("skips", {})
+            cap_skip = (
+                "intentional" if capped in (sk.get("intentional") or {})
+                else "unintentional" if capped in (sk.get("unintentional") or [])
+                else ""
+            )
            with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f:
                f.write(
                    card_mod.level_badge_svg(
-                        data["level"],
-                        data.get("level_cap_reason", ""),
-                        data.get("level_cap_intent", ""),
+                        data["level"], data.get("level_cap_reason", ""), cap_skip
                    )
                )
            print(