feat(harness): intentional skips + custom-html-tiny functional test; 4-rung ladder (#6)

Declare intentional skips + custom-html-tiny functional test; 4-rung level ladder

- recipe_meta.EXPECTED_NA = {rung: reason} lists intentionally-skipped rungs; any essential rung skipped and not listed is unintentional. Skips still cap the level (never inflate). results.json: skips:{intentional,unintentional} + level_cap_rung.
- Level ladder = the four essential rungs (install, upgrade, backup/restore, functional; top = L4). integration & recipe-local are optional, not leveled (SSO still enforced for the run verdict, unchanged).
- Card shows skipped rungs as INTENTIONAL SKIP (green, reason below) / UNINTENTIONAL SKIP (amber); level badge gains an expected/gap? third segment.
- custom-html-tiny: functional serve test (exact-byte round-trip + 404); declares backup_restore intentionally skipped (stateless static server).

Independently verified by the adversary: 138 unit tests pass cold; live full-stage run on custom-html-tiny green (upgrade tier ran; level 2; correct skips/badge); clean teardown.
2026-06-09 03:12:11 +00:00
parent f5a6f7196f
commit c51cd84159
10 changed files with 392 additions and 187 deletions
--- a/runner/harness/results.py
+++ b/runner/harness/results.py
@@ -2,7 +2,14 @@

 Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
  { recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
-    level, level_cap_reason, rungs, flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
+    level, level_cap_reason, level_cap_rung, rungs,
+    skips:{intentional:{rung:reason}, unintentional:[rung]},
+    flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
+
+`skips` splits the N/A (skipped) rungs by a simple rule: a skip is INTENTIONAL iff the recipe lists
+it (with a reason) in `recipe_meta.EXPECTED_NA = {rung: reason}`; any rung skipped but not listed is
+UNINTENTIONAL (a coverage gap to fill or declare). Skips still cap the level either way — the harness
+never claims a rung it did not verify; this only labels *why* a skip happened.

 The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
 parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
@@ -127,41 +134,24 @@ def collect_stages(records: list[dict]) -> list[dict]:
    return stages


-def _has_repo_local(records: list[dict]) -> bool:
-    return any(r.get("source") == "repo-local" for r in records)
-
-
-def _repo_local_passed(records: list[dict]) -> bool:
-    repo = [r for r in records if r.get("source") == "repo-local"]
-    return bool(repo) and all(r.get("rc", 1) == 0 for r in repo)
-
-
 def derive_rungs(
    results: dict[str, str],
    *,
    backup_capable: bool,
-    declared: list[str] | None,
-    deps_ready: bool,
-    sso_unverified: bool,
    has_custom: bool,
-    has_repo_local: bool,
-    repo_local_passed: bool,
 ) -> dict[str, str]:
-    """Translate the orchestrator's tier results + deps/SSO signals into the rung-status dict
-    harness.level consumes. Documented in DECISIONS.md (Phase 3). Conservative by design — never
-    reports a rung 'pass' it can't substantiate (cardinal guardrail: presentation never inflates).
+    """Translate the orchestrator's tier results into the rung-status dict harness.level consumes —
+    the FOUR essential rungs only. Conservative by design — never reports a rung 'pass' it can't
+    substantiate (cardinal guardrail: presentation never inflates).

      L1 install    : install tier pass.
      L2 upgrade    : upgrade tier (skip → N/A: only one published version).
      L3 backup/res : backup AND restore tiers pass (N/A if not backup-capable).
-      L4 functional : the recipe-specific functional (non-deps) tests pass — the custom tier, minus
-                      its SSO/integration tests. N/A if the recipe has no custom tests at all.
-      L5 integration: SSO/OIDC + cross-app. Applies ONLY if the recipe declares deps (else N/A — the
-                      "no integration surface caps at L4" rule, §4.1). pass iff deps wired
-                      (deps_ready) and not sso_unverified and the custom tier didn't fail.
-      L6 recipe-loc : the recipe repo's own tests/ (repo-local source) ran and passed (N/A if none).
+      L4 functional : recipe-specific functional tests pass — the custom tier. N/A if none ran.
+
+    Integration (SSO/OIDC) and recipe-local are OPTIONAL and intentionally NOT rungs here — they
+    never cap the level (SSO is still enforced for the run VERDICT in run_recipe_ci.py).
    """
-    declared = declared or []
    rungs: dict[str, str] = {}
    rungs["install"] = level_mod.tier_to_rung(results.get("install"))
    rungs["upgrade"] = level_mod.tier_to_rung(results.get("upgrade"))
@@ -170,36 +160,34 @@ def derive_rungs(
    )

    custom = results.get("custom")
-    # Functional rung (L4): the non-deps custom tests.
    if not has_custom or custom == "skip" or custom is None:
        rungs["functional"] = "na"
    elif custom == "fail":
-        # A custom test failed. With declared deps we cannot cheaply tell functional-vs-SSO apart, so
-        # conservatively fail the functional rung (caps at L3) — never inflate.
        rungs["functional"] = "fail"
    else:  # custom == "pass"
        rungs["functional"] = "pass"
-
-    # Integration rung (L5): only recipes with an SSO/integration surface (declared deps) can climb.
-    if not declared:
-        rungs["integration"] = "na"
-    elif sso_unverified or not deps_ready or custom == "fail":
-        # SSO not wired/verified, or a custom test failed → integration not verified.
-        rungs["integration"] = "fail"
-    elif custom == "pass":
-        rungs["integration"] = "pass"
-    else:
-        # declared deps but no custom tests ran — can't claim integration verified
-        rungs["integration"] = "na"
-
-    # Recipe-local rung (L6).
-    if not has_repo_local:
-        rungs["recipe_local"] = "na"
-    else:
-        rungs["recipe_local"] = "pass" if repo_local_passed else "fail"
    return rungs


+def skips(rungs: dict[str, str], expected_na: dict | None) -> dict:
+    """Split the SKIPPED (N/A) rungs into intentional vs unintentional (operator model).
+
+    A recipe lists the rungs it intentionally skips, each with a reason, in
+    `recipe_meta.EXPECTED_NA = {rung: reason}`. The rule is dead simple: a skipped rung is
+    **intentional** iff it is in that list; any rung that is skipped and NOT in the list is
+    **unintentional** (a coverage gap someone should either fill or declare). N/A still caps the
+    level either way — the harness never claims a rung it did not verify — this only labels *why* a
+    skip happened. Returns:
+      { "intentional": {rung: reason, ...},   # skipped AND declared in EXPECTED_NA
+        "unintentional": [rung, ...] }         # skipped but NOT declared
+    """
+    expected = {str(k): str(v) for k, v in (expected_na or {}).items()}  # None -> {}; str keys
+    na = [r for r, st in rungs.items() if st == "na"]  # skipped rungs, in rungs' iteration order
+    intentional = {r: expected[r] for r in na if r in expected}  # declared skips, with reasons
+    unintentional = sorted(r for r in na if r not in expected)  # undeclared skips, alphabetical
+    return {"intentional": intentional, "unintentional": unintentional}
+
+
 def build_results(
    *,
    recipe: str,
@@ -209,30 +197,24 @@ def build_results(
    records: list[dict],
    results: dict[str, str],
    backup_capable: bool,
-    declared: list[str] | None,
-    deps_ready: bool,
-    sso_unverified: bool,
    clean_teardown: bool,
    no_secret_leak: bool,
    finished_ts: float | None,
    screenshot: str | None = None,
    summary_card: str | None = None,
+    expected_na: dict | None = None,
 ) -> dict:
    """Assemble the full results.json dict (no I/O). `finished_ts` is passed in (the orchestrator
-    stamps it) so this stays pure and deterministic for unit tests."""
+    stamps it) so this stays pure and deterministic for unit tests. `expected_na` is the recipe's
+    declared intentional-skip map (recipe_meta.EXPECTED_NA) used to distinguish a deliberate skip from
+    accidentally-missing coverage."""
    stages = collect_stages(records)
    has_custom = any(r["tier"] == "custom" for r in records)
-    rungs = derive_rungs(
-        results,
-        backup_capable=backup_capable,
-        declared=declared,
-        deps_ready=deps_ready,
-        sso_unverified=sso_unverified,
-        has_custom=has_custom,
-        has_repo_local=_has_repo_local(records),
-        repo_local_passed=_repo_local_passed(records),
-    )
+    rungs = derive_rungs(results, backup_capable=backup_capable, has_custom=has_custom)
    lvl, cap_reason = level_mod.compute_level(rungs)
+    # The rung that capped the climb (lowest non-pass), or None on a full climb — lets a consumer
+    # (card/badge) tell whether the cap was an intentional skip, an unintentional one, or a failure.
+    capped = level_mod.RUNGS[lvl] if cap_reason else None
    return {
        "schema": 1,
        "run_id": run_id(),
@@ -243,7 +225,9 @@ def build_results(
        "finished": finished_ts,
        "level": lvl,
        "level_cap_reason": cap_reason,
+        "level_cap_rung": capped,
        "rungs": rungs,
+        "skips": skips(rungs, expected_na),
        "stages": stages,
        "results": results,
        "flags": {