feat(harness): intentional skips + custom-html-tiny functional test; 4-rung ladder (#6)

Declare intentional skips + custom-html-tiny functional test; 4-rung level ladder

- recipe_meta.EXPECTED_NA = {rung: reason} lists intentionally-skipped rungs; any essential rung skipped and not listed is unintentional. Skips still cap the level (never inflate). results.json gains skips:{intentional,unintentional} + level_cap_rung.
- Level ladder = the four essential rungs (install, upgrade, backup/restore, functional; top = L4). Integration & recipe-local are optional, not leveled (SSO still enforced for the run verdict, unchanged).
- Card shows skipped rungs as INTENTIONAL SKIP (green, reason below) / UNINTENTIONAL SKIP (amber); the level badge gains a third segment ("expected" / "gap?").
- custom-html-tiny: functional serve test (exact-byte round-trip + 404); declares backup_restore intentionally skipped (stateless static server).

Independently verified by the adversary: 138 unit tests pass cold; live full-stage run on custom-html-tiny green (upgrade tier ran; level 2; correct skips/badge); clean teardown.
2026-06-09 03:12:11 +00:00
parent f5a6f7196f
commit c51cd84159
10 changed files with 392 additions and 187 deletions
--- a/runner/harness/card.py
+++ b/runner/harness/card.py
@ -79,10 +79,44 @@ def render_badge_svg(label: str, message: str, color: str) -> str:
    )
-def level_badge_svg(level: int, cap_reason: str = "") -> str:
+# Third-segment colours for the level badge: amber = an UNINTENTIONAL skip (a rung skipped but not
-    """Per-recipe/-run LEVEL badge: 'cc-ci | level N'. Colour by level (R6)."""
+# in the recipe's intentional list — likely missing coverage) capped the climb; muted = an
-    msg = f"level {int(level)}"
+# INTENTIONAL skip (declared in recipe_meta.EXPECTED_NA — nothing to fix). Font-safe text labels
-    return render_badge_svg("cc-ci", msg, level_color(level))
+# (no emoji) so the SVG renders anywhere.
 # Amber: marks an UNINTENTIONAL skip (the badge's 'gap?' third segment and the card's skip row).
 GAP_COLOR = "#d29922"
 # Muted: marks an INTENTIONAL (declared) skip in the badge's 'expected' third segment.
 EXPECT_COLOR = "#6e7681"
 def level_badge_svg(level: int, cap_reason: str = "", cap_skip: str = "") -> str:
    """Render the 'cc-ci | level N' badge as an SVG string, coloured by `level_color(level)`.
    `cap_skip` selects an optional THIRD box that says why a skip capped the climb:
      - "unintentional": amber 'gap?' box (GAP_COLOR).
      - "intentional":   muted 'expected' box (EXPECT_COLOR).
      - anything else:   no third box; the plain two-box badge from `render_badge_svg` is returned.
    `cap_reason` is accepted for call compatibility but is not read here. The level text is taken
    from `level` as-is; the third box only annotates it and never changes the number shown."""
    label = "cc-ci"
    msg = f"level {int(level)}"
    label_w = _text_width(label)
    msg_w = _text_width(msg)
    # Pick the annotation for the third box, or fall back to the ordinary two-box badge.
    if cap_skip == "unintentional":
        note, note_fill = "gap?", GAP_COLOR
    elif cap_skip == "intentional":
        note, note_fill = "expected", EXPECT_COLOR
    else:
        return render_badge_svg(label, msg, level_color(level))
    note_w = _text_width(note)
    note_x = label_w + msg_w  # the third box starts where the message box ends
    total_w = note_x + note_w
    pieces = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{total_w}" height="20" role="img" ',
        f'aria-label="{html.escape(label)}: {html.escape(msg)} ({html.escape(note)})">',
        f'<rect width="{label_w}" height="20" fill="#555"/>',
        f'<rect x="{label_w}" width="{msg_w}" height="20" fill="{level_color(level)}"/>',
        f'<rect x="{note_x}" width="{note_w}" height="20" fill="{note_fill}"/>',
        '<g fill="#fff" font-family="Verdana,Geneva,sans-serif" font-size="11">',
        f'<text x="6" y="14">{html.escape(label)}</text>',
        f'<text x="{label_w + 6}" y="14">{html.escape(msg)}</text>',
        f'<text x="{note_x + 6}" y="14">{html.escape(note)}</text></g></svg>',
    ]
    return "".join(pieces)
 def _stage_rows(stages: list[dict]) -> str:
@ -107,6 +141,41 @@ def _stage_rows(stages: list[dict]) -> str:
    return "\n".join(rows) or '<tr><td colspan="3">no stages</td></tr>'
 # Friendly rung labels for the skip rows (the four essential rungs).
 RUNG_LABEL = {
    "install": "install",
    "upgrade": "upgrade",
    "backup_restore": "backup/restore",
    "functional": "functional",
 }
 SKIP_GREEN = "#57ab5a"  # muted green — an intentional skip reads like a pass (but labelled, never inflating)
 def _skip_rows(skips: dict) -> str:
    """Render SKIPPED rungs as stage-like rows. An intentional (declared) skip looks like a pass row
    but its status says 'INTENTIONAL SKIP' (muted green) with the declared reason on the line below;
    an unintentional skip is amber 'UNINTENTIONAL SKIP' with a prompt to add a test or declare it."""
    rows = []
    for rung, reason in (skips.get("intentional") or {}).items():
        rows.append(
            f'<tr class="stage"><td colspan="2"><span class="mark" style="color:{SKIP_GREEN}">⊘</span>'
            f'<b>{html.escape(RUNG_LABEL.get(rung, rung))}</b></td>'
            f'<td class="st" style="color:{SKIP_GREEN}">intentional skip</td></tr>'
        )
        rows.append(f'<tr class="skipreason"><td></td><td colspan="2">{html.escape(reason)}</td></tr>')
    for rung in skips.get("unintentional") or []:
        rows.append(
            f'<tr class="stage"><td colspan="2"><span class="mark" style="color:{GAP_COLOR}">⊘</span>'
            f'<b>{html.escape(RUNG_LABEL.get(rung, rung))}</b></td>'
            f'<td class="st" style="color:{GAP_COLOR}">unintentional skip</td></tr>'
        )
        rows.append(
            '<tr class="skipreason"><td></td><td colspan="2">not declared in EXPECTED_NA — add the '
            "missing test/label, or declare the skip with a reason</td></tr>"
        )
    return "\n".join(rows)
 def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png") -> str:
    """Build the summary-card HTML from a results.json dict. `screenshot_rel` is the relative path to
    the screenshot PNG (same dir as the card) — omitted from the card if None / absent.
@ -116,7 +185,9 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
    recipe = html.escape(str(data.get("recipe", "?")))
    version = html.escape(str(data.get("version") or data.get("ref") or ""))
    level = int(data.get("level", 0))
-    cap = html.escape(str(data.get("level_cap_reason") or ""))
+    cap_reason = str(data.get("level_cap_reason") or "")
    cap = html.escape(cap_reason)
    sk = data.get("skips", {}) or {}
    color = level_color(level)
    flags = data.get("flags", {}) or {}
    flag_bits = []
@ -132,7 +203,7 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
        if show_shot
        else '<div class="shot noshot">no screenshot</div>'
    )
-    rows = _stage_rows(data.get("stages", []))
+    rows = _stage_rows(data.get("stages", [])) + "\n" + _skip_rows(sk)
    return f"""<!doctype html><html><head><meta charset="utf-8"><style>
 *{{box-sizing:border-box}}
 body{{margin:0;font-family:system-ui,-apple-system,Segoe UI,sans-serif;background:#0d1117;color:#c9d1d9}}
@ -157,6 +228,7 @@ tr.stage td{{padding-top:.5rem;border-bottom:1px solid #30363d}}
 .test .tmark{{width:1.4rem;text-align:center}}
 .test .tname{{color:#c9d1d9;font-family:ui-monospace,monospace;font-size:.8rem}}
 .test .tms{{text-align:right;color:#8b949e;font-size:.74rem;width:5rem}}
 tr.skipreason td{{color:#8b949e;font-size:.78rem;font-style:italic;padding-top:0;padding-bottom:.45rem;border-bottom:1px solid #21262d}}
 .shot{{width:360px;flex:none;border:1px solid #30363d;border-radius:8px;overflow:hidden;background:#0d1117}}
 .shot img{{width:100%;display:block}}
 .shot.noshot{{display:flex;align-items:center;justify-content:center;height:225px;color:#8b949e;font-size:.85rem}}
@ -167,7 +239,7 @@ tr.stage td{{padding-top:.5rem;border-bottom:1px solid #30363d}}
 <div class="hd">{FLOWER_SVG}
 <div class="title"><h1>{recipe}</h1><span class="ver">{version}</span></div>
 <div class="lvl"><span class="num">{level}</span><span class="lbl">level</span></div></div>
-<div class="cap">{("<b>capped:</b> " + cap) if cap else "<b>full clean climb</b> — top level (6)"}</div>
+<div class="cap">{("<b>capped:</b> " + cap) if cap else "<b>full clean climb</b> — top level (4)"}</div>
 <div class="body"><div class="tbl"><table>{rows}</table></div>{shot_html}</div>
 <div class="flags">{"".join(flag_bits)}</div>
 </div></body></html>"""
--- a/runner/harness/level.py
+++ b/runner/harness/level.py
@ -5,37 +5,39 @@ YunoHost semantics: **a gap caps the level** — you only earn level L if every
 PASS. The first rung that is not a clean PASS (a real FAIL *or* genuinely N/A for this recipe) stops
 the climb; `cap_reason` records why. This is deliberately conservative: presentation must NEVER make
 a run look greener than its tests (plan §6 cardinal guardrail), so an N/A rung caps just like a fail
-(the L5 example in §4.1 — "recipes with no integration surface cap at L4 by definition" — is exactly
+— with a recorded reason so the level is *fair*, not inflated.
 this: N/A caps, with a recorded reason so the level is *fair*, not inflated).
-The ladder (§4.1):
+The ladder is the FOUR essential rungs every recipe is held to:
  L0 — install failed / app never became healthy.
  L1 — Installs: deploys + passes health/readiness.
  L2 — Upgrades: previous published version → PR version, stays healthy, data intact.
  L3 — Backup/restore: seeded data survives backup → wipe → restore.
  L4 — Functional: recipe-specific functional tests pass.
-  L5 — Integration: SSO/OIDC + cross-app integration tests pass.
+
-  L6 — Recipe-local: the recipe repo's own tests/ (D4) pass and are merged.
+Integration (SSO/OIDC + cross-app) and recipe-local (the recipe repo's own tests/) are **OPTIONAL**
 capabilities — they are NOT part of the level ladder and never cap it. They still run when present
 (and SSO is still enforced for the run VERDICT via the deps/SSO checks in run_recipe_ci.py), but a
 recipe without an SSO surface or without repo-local tests is simply not penalised on the level.
 This module is PURE (no I/O) so it is cheaply unit-testable and the Adversary can re-run the unit
 test cold (`cc-ci-run -m pytest tests/unit/test_level.py -q`). The orchestrator
-(`run_recipe_ci.py`) is responsible for translating its raw per-tier results + deps/SSO signals into
+(`run_recipe_ci.py`) is responsible for translating its raw per-tier results into the rung-status
-the rung-status dict this function consumes; that mapping is documented in DECISIONS.md (Phase 3).
+dict this function consumes; that mapping is documented in DECISIONS.md (Phase 3).
 Rung status vocabulary (each rung ∈ these three):
  "pass" — the rung was exercised and passed.
  "fail" — the rung was exercised and failed.
  "na"   — the rung does not apply to this recipe (e.g. only one published version → no upgrade;
-           not backup-capable; no SSO/integration surface; no recipe-local tests). N/A is NOT a
+           not backup-capable). N/A is NOT a failure, but it DOES cap the climb (with a distinct
-           failure, but it DOES cap the climb (with a distinct cap_reason) so the level never
+           cap_reason) so the level never overstates what was actually verified.
           overstates what was actually verified.
 """
 from __future__ import annotations
 # The climbable rungs in ascending order. install (L1) is the foundation; L0 means install itself
-# did not pass. Each later rung requires every earlier rung to be a clean PASS.
+# did not pass. Each later rung requires every earlier rung to be a clean PASS. These four are the
-RUNGS = ("install", "upgrade", "backup_restore", "functional", "integration", "recipe_local")
+# ESSENTIAL rungs — integration/recipe-local are optional and deliberately NOT in this tuple.
 RUNGS = ("install", "upgrade", "backup_restore", "functional")
 # Human-readable label per rung level, for cap_reason + the summary card.
 RUNG_LABEL = {
@ -43,22 +45,20 @@ RUNG_LABEL = {
    2: "upgrade (prev published → PR)",
    3: "backup/restore (data integrity)",
    4: "functional (recipe-specific tests)",
    5: "integration (SSO/OIDC + cross-app)",
    6: "recipe-local (recipe repo tests/)",
 }
 VALID = {"pass", "fail", "na"}
 def compute_level(rungs: dict[str, str]) -> tuple[int, str]:
-    """Map a rung-status dict → (level 0..6, cap_reason).
+    """Map a rung-status dict → (level 0..4, cap_reason).
    `rungs` must contain a status in {"pass","fail","na"} for every name in RUNGS. The level is the
    highest L such that rungs[1..L] are all "pass"; the first non-"pass" rung caps the climb. L0 is
    returned when the install rung itself is not "pass" (install failed / never healthy).
    cap_reason explains where the climb stopped:
-      - "" (empty) when the recipe earned the top rung (L6, full clean climb).
+      - "" (empty) when the recipe earned the top rung (L4, full clean climb).
      - "L<k> <label> FAILED" when a rung was exercised and failed.
      - "L<k> <label> N/A" when a rung does not apply to this recipe.
    Returns the reason for the FIRST rung that stopped the climb (the binding constraint).
--- a/runner/harness/results.py
+++ b/runner/harness/results.py
@ -2,7 +2,14 @@
 Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
  { recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
-    level, level_cap_reason, rungs, flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
+    level, level_cap_reason, level_cap_rung, rungs,
    skips:{intentional:{rung:reason}, unintentional:[rung]},
    flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
 `skips` splits the N/A (skipped) rungs by a simple rule: a skip is INTENTIONAL iff the recipe lists
 it (with a reason) in `recipe_meta.EXPECTED_NA = {rung: reason}`; any rung skipped but not listed is
 UNINTENTIONAL (a coverage gap to fill or declare). Skips still cap the level either way — the harness
 never claims a rung it did not verify; this only labels *why* a skip happened.
 The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
 parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
@ -127,41 +134,24 @@ def collect_stages(records: list[dict]) -> list[dict]:
    return stages
 def _has_repo_local(records: list[dict]) -> bool:
    return any(r.get("source") == "repo-local" for r in records)
 def _repo_local_passed(records: list[dict]) -> bool:
    repo = [r for r in records if r.get("source") == "repo-local"]
    return bool(repo) and all(r.get("rc", 1) == 0 for r in repo)
 def derive_rungs(
    results: dict[str, str],
    *,
    backup_capable: bool,
    declared: list[str] | None,
    deps_ready: bool,
    sso_unverified: bool,
    has_custom: bool,
    has_repo_local: bool,
    repo_local_passed: bool,
 ) -> dict[str, str]:
-    """Translate the orchestrator's tier results + deps/SSO signals into the rung-status dict
+    """Translate the orchestrator's tier results into the rung-status dict harness.level consumes —
-    harness.level consumes. Documented in DECISIONS.md (Phase 3). Conservative by design — never
+    the FOUR essential rungs only. Conservative by design — never reports a rung 'pass' it can't
-    reports a rung 'pass' it can't substantiate (cardinal guardrail: presentation never inflates).
+    substantiate (cardinal guardrail: presentation never inflates).
      L1 install    : install tier pass.
      L2 upgrade    : upgrade tier (skip → N/A: only one published version).
      L3 backup/res : backup AND restore tiers pass (N/A if not backup-capable).
-      L4 functional : the recipe-specific functional (non-deps) tests pass — the custom tier, minus
+      L4 functional : recipe-specific functional tests pass — the custom tier. N/A if none ran.
-                      its SSO/integration tests. N/A if the recipe has no custom tests at all.
+
-      L5 integration: SSO/OIDC + cross-app. Applies ONLY if the recipe declares deps (else N/A — the
+    Integration (SSO/OIDC) and recipe-local are OPTIONAL and intentionally NOT rungs here — they
-                      "no integration surface caps at L4" rule, §4.1). pass iff deps wired
+    never cap the level (SSO is still enforced for the run VERDICT in run_recipe_ci.py).
                      (deps_ready) and not sso_unverified and the custom tier didn't fail.
      L6 recipe-loc : the recipe repo's own tests/ (repo-local source) ran and passed (N/A if none).
    """
    declared = declared or []
    rungs: dict[str, str] = {}
    rungs["install"] = level_mod.tier_to_rung(results.get("install"))
    rungs["upgrade"] = level_mod.tier_to_rung(results.get("upgrade"))
@ -170,36 +160,34 @@ def derive_rungs(
    )
    custom = results.get("custom")
    # Functional rung (L4): the non-deps custom tests.
    if not has_custom or custom == "skip" or custom is None:
        rungs["functional"] = "na"
    elif custom == "fail":
        # A custom test failed. With declared deps we cannot cheaply tell functional-vs-SSO apart, so
        # conservatively fail the functional rung (caps at L3) — never inflate.
        rungs["functional"] = "fail"
    else:  # custom == "pass"
        rungs["functional"] = "pass"
    # Integration rung (L5): only recipes with an SSO/integration surface (declared deps) can climb.
    if not declared:
        rungs["integration"] = "na"
    elif sso_unverified or not deps_ready or custom == "fail":
        # SSO not wired/verified, or a custom test failed → integration not verified.
        rungs["integration"] = "fail"
    elif custom == "pass":
        rungs["integration"] = "pass"
    else:
        # declared deps but no custom tests ran — can't claim integration verified
        rungs["integration"] = "na"
    # Recipe-local rung (L6).
    if not has_repo_local:
        rungs["recipe_local"] = "na"
    else:
        rungs["recipe_local"] = "pass" if repo_local_passed else "fail"
    return rungs
 def skips(rungs: dict[str, str], expected_na: dict | None) -> dict:
    """Classify every SKIPPED (status "na") rung as intentional or unintentional.
    `expected_na` is the recipe's `recipe_meta.EXPECTED_NA = {rung: reason}` map (or None); its
    keys and values are stringified before use. A skipped rung found in that map is
    **intentional** and carries the declared reason; a skipped rung missing from it is
    **unintentional**. Rungs whose status is not "na" are ignored, as are declared rungs that were
    not actually skipped. This only labels skips — it does not alter the rung statuses.
    Returns:
      { "intentional": {rung: reason, ...},   # in the iteration order of `rungs`
        "unintentional": [rung, ...] }         # sorted alphabetically
    """
    declared: dict[str, str] = {}
    for key, why in (expected_na or {}).items():
        declared[str(key)] = str(why)
    deliberate: dict[str, str] = {}
    gaps: list[str] = []
    for name, status in rungs.items():
        if status != "na":
            continue
        if name in declared:
            deliberate[name] = declared[name]
        else:
            gaps.append(name)
    gaps.sort()
    return {"intentional": deliberate, "unintentional": gaps}
 def build_results(
    *,
    recipe: str,
@ -209,30 +197,24 @@ def build_results(
    records: list[dict],
    results: dict[str, str],
    backup_capable: bool,
    declared: list[str] | None,
    deps_ready: bool,
    sso_unverified: bool,
    clean_teardown: bool,
    no_secret_leak: bool,
    finished_ts: float | None,
    screenshot: str | None = None,
    summary_card: str | None = None,
    expected_na: dict | None = None,
 ) -> dict:
    """Assemble the full results.json dict (no I/O). `finished_ts` is passed in (the orchestrator
-    stamps it) so this stays pure and deterministic for unit tests."""
+    stamps it) so this stays pure and deterministic for unit tests. `expected_na` is the recipe's
    declared intentional-skip map (recipe_meta.EXPECTED_NA) used to distinguish a deliberate skip from
    accidentally-missing coverage."""
    stages = collect_stages(records)
    has_custom = any(r["tier"] == "custom" for r in records)
-    rungs = derive_rungs(
+    rungs = derive_rungs(results, backup_capable=backup_capable, has_custom=has_custom)
        results,
        backup_capable=backup_capable,
        declared=declared,
        deps_ready=deps_ready,
        sso_unverified=sso_unverified,
        has_custom=has_custom,
        has_repo_local=_has_repo_local(records),
        repo_local_passed=_repo_local_passed(records),
    )
    lvl, cap_reason = level_mod.compute_level(rungs)
    # The rung that capped the climb (lowest non-pass), or None on a full climb — lets a consumer
    # (card/badge) tell whether the cap was an intentional skip, an unintentional one, or a failure.
    capped = level_mod.RUNGS[lvl] if cap_reason else None
    return {
        "schema": 1,
        "run_id": run_id(),
@ -243,7 +225,9 @@ def build_results(
        "finished": finished_ts,
        "level": lvl,
        "level_cap_reason": cap_reason,
        "level_cap_rung": capped,
        "rungs": rungs,
        "skips": skips(rungs, expected_na),
        "stages": stages,
        "results": results,
        "flags": {
--- a/runner/run_recipe_ci.py
+++ b/runner/run_recipe_ci.py
@ -200,6 +200,7 @@ def _load_meta(recipe: str) -> dict:
        for k in list(meta) + [
            "BACKUP_CAPABLE",
            "SKIP_GENERIC",
            "EXPECTED_NA",
            "OIDC_AT_INSTALL",
            "READY_PROBE",
            "UPGRADE_BASE_VERSION",
@ -1224,7 +1225,6 @@ def main() -> int:
    # a failure here NEVER changes `overall` (R7 — cosmetics never block the pipeline). ----
    data: dict | None = None
    try:
        sso_unverified = sso_dep_unverified(declared, deps_ready, requires_deps_skipped)
        clean_teardown = (deploy_count == expected_deploy_count) and not dep_teardown_error
        data = results_mod.build_results(
            recipe=recipe,
@ -1234,13 +1234,11 @@ def main() -> int:
            records=records,
            results=results,
            backup_capable=backup_cap,
            declared=declared,
            deps_ready=deps_ready,
            sso_unverified=sso_unverified,
            clean_teardown=clean_teardown,
            no_secret_leak=True,  # narrowed below by an actual scan of the serialised artifact
            screenshot=screenshot_rel,  # Phase 3 U1 (R4): relative PNG name iff capture succeeded
            finished_ts=time.time(),
            expected_na=meta.get("EXPECTED_NA"),  # declared intentional-skip map (recipe_meta)
        )
        # Real (if narrow) leak check: no known infra-secret value may appear in the artifact (R7).
        blob = json.dumps(data)
@ -1257,6 +1255,15 @@ def main() -> int:
            f"{' — ' + data['level_cap_reason'] if data['level_cap_reason'] else ''})",
            flush=True,
        )
        # Surface UNINTENTIONAL skips in the CI log (non-blocking, R7): a rung that was skipped (N/A)
        # but is not in the recipe's intentional list — either add the missing coverage or declare it.
        for rung in data.get("skips", {}).get("unintentional", []):
            print(
                f"⚠ coverage: rung '{rung}' was skipped (N/A) but is not declared intentional — add "
                f"the missing test/label, or list it in tests/{recipe}/recipe_meta.py "
                f"EXPECTED_NA = {{'{rung}': '<why>'}}.",
                flush=True,
            )
    except Exception as e:  # noqa: BLE001 — results assembly is cosmetic; never fail a run on it (R7)
        print(
            f"!! results.json assembly failed (non-fatal, verdict unaffected): {_scrub(str(e))}",
@ -1275,8 +1282,19 @@ def main() -> int:
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot")))
            png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png"))
            capped = data.get("level_cap_rung")
            sk = data.get("skips", {})
            cap_skip = (
                "intentional" if capped in (sk.get("intentional") or {})
                else "unintentional" if capped in (sk.get("unintentional") or [])
                else ""
            )
            with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f:
-                f.write(card_mod.level_badge_svg(data["level"], data.get("level_cap_reason", "")))
+                f.write(
                    card_mod.level_badge_svg(
                        data["level"], data.get("level_cap_reason", ""), cap_skip
                    )
                )
            print(
                f"summary card {'rendered ' + png if png else '(PNG render unavailable)'} + "
                f"badge.svg written into {run_artifact_dir}",
--- a/tests/custom-html-tiny/functional/test_serves_content.py
+++ b/tests/custom-html-tiny/functional/test_serves_content.py
@ -0,0 +1,87 @@
 """custom-html-tiny — recipe-specific functional test (static-web-server).
 Proves the deployed static-web-server is *actually serving files from its `content` volume* with real
 file-server semantics, not merely returning 200 from a Traefik fallback or a generic stub:
  1. exact-byte round-trip — write a uniquely-named file with random content into the served volume,
     fetch it over HTTPS, and assert the bytes come back verbatim. Non-vacuous: the content is random
     per run, so only a server that reads this file off the volume can pass.
  2. real 404 — a random non-existent path returns 404, proving directory/file semantics (a
     200-everything stub or mis-routed host would not 404).
 The recipe's image (joseluisq/static-web-server) is shell-less (scratch-based) and its content volume
 is seeded via the install_steps.sh host-mountpoint mechanism — so this test writes its probe file the
 same way (resolve the swarm volume's mountpoint with `docker volume inspect`, write directly) rather
 than `docker exec`-ing in a container that has no shell.
 Runs in the custom tier against the shared post-install deployment (the `live_app` fixture is its
 per-run domain). Mirrors install_steps.sh: the app's content volume is named `<stack>_content`, where
 `stack` is the domain with dots replaced by underscores; HTTP_SUBDIR is empty, so the volume root is
 served at `/`.
 """
 from __future__ import annotations
 import contextlib
 import os
 import ssl
 import subprocess
 import urllib.error
 import urllib.request
 import uuid
 def _served_dir(domain: str) -> str:
    """Return the host mountpoint of the app's `content` volume.
    The volume name is the domain with dots replaced by underscores plus `_content`; the mountpoint
    is read from `docker volume inspect`. A non-zero docker exit raises CalledProcessError
    (check=True); an empty mountpoint fails the assertion."""
    volume = domain.replace(".", "_") + "_content"
    cmd = ["docker", "volume", "inspect", volume, "--format", "{{.Mountpoint}}"]
    proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
    where = proc.stdout.strip()
    assert where, f"could not resolve mountpoint for volume {volume!r}"
    return where
 def _get(url: str) -> tuple[int, bytes]:
    """Fetch `url` with a 20s timeout and return `(status, body)`.
    An HTTP error response (urllib's HTTPError) is returned as its code and body instead of being
    raised, so callers can assert on the status. Certificate and hostname verification are
    switched off on purpose: this helper only cares about what the app answers."""
    insecure = ssl.create_default_context()
    insecure.check_hostname = False
    insecure.verify_mode = ssl.CERT_NONE
    try:
        with urllib.request.urlopen(url, timeout=20, context=insecure) as reply:
            status, payload = reply.status, reply.read()
    except urllib.error.HTTPError as err:
        status, payload = err.code, err.read()
    return status, payload
 def test_static_file_roundtrip_and_404(live_app):
    """Drop a per-run random probe file into the served volume, fetch it back byte-for-byte, then
    check that a random missing path answers 404. The probe file is removed afterwards."""
    volume_root = _served_dir(live_app)
    nonce = uuid.uuid4().hex
    probe_name = f"ccci-probe-{nonce}.txt"
    payload = f"cc-ci-functional-{nonce}\n".encode()
    probe_path = os.path.join(volume_root, probe_name)
    with open(probe_path, "wb") as out:
        out.write(payload)
    try:
        code, served = _get(f"https://{live_app}/{probe_name}")
        assert code == 200, f"served probe file returned {code} (expected 200)"
        assert served == payload, (
            f"content round-trip mismatch: served {served!r}, wrote {payload!r} "
            "(static-web-server not serving the content volume?)"
        )
        # The 404 check separates a real file server from a stub that answers 200 to everything
        # or a request that was mis-routed to a Traefik fallback.
        absent_url = f"https://{live_app}/ccci-missing-{uuid.uuid4().hex}.txt"
        absent_code, _ = _get(absent_url)
        assert absent_code == 404, (
            f"missing path returned {absent_code} (expected 404 — generic 200-returner / mis-route?)"
        )
    finally:
        # Best-effort cleanup: a probe that is already gone (or not removable) must not fail the test.
        with contextlib.suppress(OSError):
            os.remove(probe_path)
--- a/tests/custom-html-tiny/recipe_meta.py
+++ b/tests/custom-html-tiny/recipe_meta.py
@ -3,3 +3,14 @@
 # (DG5) is detected quickly instead of waiting the default 300s HTTP timeout.
 DEPLOY_TIMEOUT = 120
 HTTP_TIMEOUT = 90
 # Rungs this recipe INTENTIONALLY skips, each with a reason. Any essential rung skipped (N/A) and NOT
 # listed here is reported as an *unintentional* skip (a coverage gap to fill or declare). A skip still
 # caps the level either way — the harness never claims a rung it did not verify; this only records
 # that the skip is deliberate. (The level ladder is the four essential rungs install/upgrade/
 # backup_restore/functional; integration + recipe-local are optional and not leveled.)
 # custom-html-tiny is a stateless static-web-server, so it has no backup surface:
 EXPECTED_NA = {
    "backup_restore": "stateless static file server: serves an ephemeral content volume seeded at "
    "deploy, with no persistent/user data to back up or restore (no backupbot.backup label)",
 }
--- a/tests/unit/test_card.py
+++ b/tests/unit/test_card.py
@ -14,7 +14,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")
 from harness import card as C  # noqa: E402
-def _data(level=4, cap="L5 integration (SSO/OIDC + cross-app) N/A"):
+def _data(level=3, cap="L4 functional (recipe-specific tests) N/A"):
    return {
        "recipe": "uptime-kuma",
        "version": "1.23.0",
@ -51,6 +51,35 @@ def test_badge_svg_wellformed():
    assert svg.startswith("<svg") and svg.endswith("</svg>")
    assert "level 4" in svg
    assert C.level_color(4) in svg
    # plain cap (no intent) → two-box badge, no third segment
    assert "expected" not in svg and "gap?" not in svg
 def test_badge_svg_differentiates_intentional_vs_unintentional_skip():
    cap = "L3 backup/restore N/A"
    # Declared (intentional) skip capped the climb: muted "expected" third segment, no "gap?".
    declared = C.level_badge_svg(2, cap, "intentional")
    assert "level 2" in declared
    assert "expected" in declared
    assert C.EXPECT_COLOR in declared
    assert "gap?" not in declared
    # Undeclared (unintentional) skip: amber "gap?" third segment, no "expected".
    undeclared = C.level_badge_svg(2, cap, "unintentional")
    assert "level 2" in undeclared
    assert "gap?" in undeclared
    assert C.GAP_COLOR in undeclared
    assert "expected" not in undeclared
 def test_skip_rows_intentional_and_unintentional():
    sample = {
        "intentional": {"backup_restore": "no persistent data"},
        "unintentional": ["functional"],
    }
    rendered = C._skip_rows(sample)
    # Intentional skip: a labelled muted-green row, then the declared reason on its own row.
    assert "intentional skip" in rendered
    assert C.SKIP_GREEN in rendered
    assert "backup/restore" in rendered
    assert "no persistent data" in rendered
    # Unintentional skip: an amber row, then the prompt to declare it or add coverage.
    assert "unintentional skip" in rendered
    assert C.GAP_COLOR in rendered
    assert "functional" in rendered
    assert "EXPECTED_NA" in rendered
 def test_skip_rows_empty_when_no_skips():
    # With nothing skipped, no rows are rendered at all.
    nothing_skipped = {"intentional": {}, "unintentional": []}
    assert C._skip_rows(nothing_skipped) == ""
 def test_card_html_reports_level_verbatim():
--- a/tests/unit/test_dashboard.py
+++ b/tests/unit/test_dashboard.py
@ -24,7 +24,7 @@ import dashboard  # noqa: E402
 def _row(**kw):
    base = {
        "recipe": "custom-html", "status": "success", "number": 4, "ref": "db9a9502",
-        "version": "db9a95024e9d", "level": 4, "level_cap_reason": "L5 integration N/A",
+        "version": "db9a95024e9d", "level": 4, "level_cap_reason": "",
        "has_screenshot": True, "flags": {"clean_teardown": True, "no_secret_leak": True},
        "finished": 0, "url": "https://drone.x/cc-ci/4",
    }
--- a/tests/unit/test_level.py
+++ b/tests/unit/test_level.py
@ -19,33 +19,23 @@ def _rungs(
    upgrade="pass",
    backup_restore="pass",
    functional="pass",
    integration="pass",
    recipe_local="pass",
 ):
    return {
        "install": install,
        "upgrade": upgrade,
        "backup_restore": backup_restore,
        "functional": functional,
        "integration": integration,
        "recipe_local": recipe_local,
    }
-# ---- the U0 gate: L4-pass and L2-cap ----
+# ---- the ladder: four essential rungs, top is L4 (functional) ----
-def test_full_clean_climb_to_L6():
+def test_full_clean_climb_to_L4():
    # All four essential rungs pass → L4 (the top; integration/recipe-local are optional, not leveled).
    lvl, reason = L.compute_level(_rungs())
    assert lvl == 4
    assert reason == ""
 def test_climbs_through_L4_then_no_integration_surface_caps_at_L4():
    # GATE: a recipe whose functional tests pass but has no SSO/integration surface caps at L4.
    lvl, reason = L.compute_level(_rungs(integration="na", recipe_local="na"))
    assert lvl == 4
-    assert "L5" in reason and "N/A" in reason
+    assert reason == ""
 def test_fails_at_L2_capped_at_L1():
@ -69,34 +59,27 @@ def test_install_fail_is_L0():
 def test_higher_pass_does_not_rescue_lower_na():
    # backup/restore N/A (stateless app) caps at L2 even though functional would pass.
-    lvl, reason = L.compute_level(_rungs(backup_restore="na", functional="pass", integration="na"))
+    lvl, reason = L.compute_level(_rungs(backup_restore="na", functional="pass"))
    assert lvl == 2
    assert "L3" in reason and "N/A" in reason
 def test_upgrade_na_caps_at_L1():
-    # only one published version → no upgrade possible → N/A caps at L1.
+    # only one published version → no upgrade possible → N/A caps at L1 (upgrade is essential).
    lvl, reason = L.compute_level(_rungs(upgrade="na"))
    assert lvl == 1
    assert "L2" in reason and "N/A" in reason
-def test_integration_fail_caps_at_L4():
+def test_functional_na_caps_at_L3():
-    # SSO declared but unverified (failed) → integration rung fails → cap at L4.
+    # no recipe-specific functional tests → functional N/A caps at L3.
-    lvl, reason = L.compute_level(_rungs(integration="fail", recipe_local="na"))
+    lvl, reason = L.compute_level(_rungs(functional="na"))
-    assert lvl == 4
+    assert lvl == 3
-    assert "L5" in reason and "FAILED" in reason
+    assert "L4" in reason and "N/A" in reason
 def test_recipe_local_na_caps_at_L5():
    # Expects compute_level to stop at level 5 with an "L6 ... N/A" reason when only the
    # recipe_local rung is "na" and every other rung passes.
    # NOTE(review): this assumes an L5/L6 ladder, while the neighbouring tests in this file treat
    # L4 (functional) as the top rung and expect recipe_local="na" to leave the level uncapped.
    # Presumably this is a removed test whose diff marker was lost in this listing — confirm
    # before relying on it.
    # SSO passes but no recipe-local tests → cap at L5 (L6 N/A).
    lvl, reason = L.compute_level(_rungs(recipe_local="na"))
    assert lvl == 5
    assert "L6" in reason and "N/A" in reason
 def test_functional_fail_caps_at_L3():
-    lvl, reason = L.compute_level(_rungs(functional="fail", integration="na"))
+    lvl, reason = L.compute_level(_rungs(functional="fail"))
    assert lvl == 3
    assert "L4" in reason and "FAILED" in reason
--- a/tests/unit/test_results.py
+++ b/tests/unit/test_results.py
@ -105,83 +105,31 @@ def _results(**kw):
    return base
-def test_derive_rungs_full_stateful_sso():
+def test_derive_rungs_full_climb_four_essential():
-    rungs = R.derive_rungs(
+    rungs = R.derive_rungs(_results(), backup_capable=True, has_custom=True)
-        _results(),
+    # only the four essential rungs — integration/recipe-local are optional, not produced here.
        backup_capable=True,
        declared=["keycloak"],
        deps_ready=True,
        sso_unverified=False,
        has_custom=True,
        has_repo_local=False,
        repo_local_passed=False,
    )
    assert rungs == {
        "install": "pass",
        "upgrade": "pass",
        "backup_restore": "pass",
        "functional": "pass",
        "integration": "pass",
        "recipe_local": "na",
    }
-def test_derive_rungs_no_sso_surface_is_integration_na():
+def test_derive_rungs_stateless_backup_and_functional_na():
    rungs = R.derive_rungs(
        _results(),
        backup_capable=True,
        declared=[],
        deps_ready=True,
        sso_unverified=False,
        has_custom=True,
        has_repo_local=False,
        repo_local_passed=False,
    )
    assert rungs["integration"] == "na"
    assert rungs["functional"] == "pass"
 def test_derive_rungs_stateless_backup_na():
    # Stateless recipe: backup, restore and custom results are all "skip", with
    # backup_capable=False and has_custom=False. The test expects backup_restore and functional
    # to derive to "na", and the optional integration / recipe_local rungs to be absent.
    # NOTE(review): the keyword arguments declared, deps_ready, sso_unverified, has_repo_local and
    # repo_local_passed are not passed by the two-keyword derive_rungs calls elsewhere in this
    # file — presumably leftovers of removed lines in this diff listing; confirm that
    # derive_rungs still accepts them.
    rungs = R.derive_rungs(
        _results(backup="skip", restore="skip", custom="skip"),
        backup_capable=False,
        declared=[],
        deps_ready=True,
        sso_unverified=False,
        has_custom=False,
        has_repo_local=False,
        repo_local_passed=False,
    )
    assert rungs["backup_restore"] == "na"
    assert rungs["functional"] == "na"
    assert "integration" not in rungs and "recipe_local" not in rungs
-def test_derive_rungs_sso_unverified_is_integration_fail():
+def test_derive_rungs_functional_fail():
-    rungs = R.derive_rungs(
+    rungs = R.derive_rungs(_results(custom="fail"), backup_capable=True, has_custom=True)
-        _results(),
+    assert rungs["functional"] == "fail"
        backup_capable=True,
        declared=["keycloak"],
        deps_ready=False,
        sso_unverified=True,
        has_custom=True,
        has_repo_local=False,
        repo_local_passed=False,
    )
    assert rungs["integration"] == "fail"
 def test_derive_rungs_repo_local_pass():
    # Expects a recipe whose repo-local tests exist and passed to report recipe_local == "pass".
    # NOTE(review): another test in this file asserts that "recipe_local" is not a key of the
    # derived rungs at all, so this looks like a removed test whose diff markers were lost in
    # this listing — confirm before relying on it.
    rungs = R.derive_rungs(
        _results(),
        backup_capable=True,
        declared=[],
        deps_ready=True,
        sso_unverified=False,
        has_custom=True,
        has_repo_local=True,
        repo_local_passed=True,
    )
    assert rungs["recipe_local"] == "pass"
 # ---- build_results: end-to-end incl level + flags ----
@ -212,16 +160,13 @@ def test_build_results_level_and_flags(tmp_path):
        records=recs,
        results=_results(),
        backup_capable=True,
        declared=[],
        deps_ready=True,
        sso_unverified=False,
        clean_teardown=True,
        no_secret_leak=True,
        finished_ts=1234.0,
    )
-    # stateful, functional pass, no SSO surface, no repo-local → caps at L4
+    # all four essential rungs pass → full climb to L4 (the top), no cap
    assert data["level"] == 4
-    assert "L5" in data["level_cap_reason"]
+    assert data["level_cap_reason"] == ""
    assert data["recipe"] == "hedgedoc"
    assert data["ref"] == "deadbeefcafe"
    assert data["flags"] == {"clean_teardown": True, "no_secret_leak": True}
@ -246,9 +191,6 @@ def test_build_results_capped_at_L1_on_upgrade_fail(tmp_path):
        records=recs,
        results=_results(upgrade="fail"),
        backup_capable=True,
        declared=[],
        deps_ready=True,
        sso_unverified=False,
        clean_teardown=True,
        no_secret_leak=True,
        finished_ts=0.0,
@ -257,6 +199,85 @@ def test_build_results_capped_at_L1_on_upgrade_fail(tmp_path):
    assert "L2" in data["level_cap_reason"]
 # ---- skips: intentional (declared) vs unintentional (everything else skipped) ----
 def _rungs(**kw):
    # Rung-status fixture: the four essential rungs all default to "pass"; keyword arguments
    # override individual rungs (or append extra keys after the essential four).
    essential = ("install", "upgrade", "backup_restore", "functional")
    return {**dict.fromkeys(essential, "pass"), **kw}
 def test_skips_intentional_vs_unintentional():
    # Two rungs are skipped ("na") but only backup_restore is declared: the declared one must be
    # reported as intentional (with its reason), the other as unintentional.
    declared = {"backup_restore": "stateless static server"}
    split = R.skips(_rungs(backup_restore="na", functional="na"), declared)
    assert split["intentional"] == {"backup_restore": "stateless static server"}
    assert split["unintentional"] == ["functional"]
 def test_skips_none_declared_all_unintentional():
    # No declaration at all (None): every skipped rung is reported as unintentional.
    split = R.skips(_rungs(backup_restore="na"), None)
    assert split["intentional"] == {}
    assert split["unintentional"] == ["backup_restore"]
 def test_skips_declaration_only_counts_when_actually_skipped():
    # backup_restore ran and passed, so it is not a skip: the declaration for it is inert and
    # the rung must show up in neither bucket.
    split = R.skips(_rungs(backup_restore="pass"), {"backup_restore": "reason"})
    for bucket in ("intentional", "unintentional"):
        assert "backup_restore" not in split[bucket]
 def test_build_results_threads_expected_na(tmp_path):
    # Models custom-html-tiny after this change: the install tier and one custom (functional)
    # test both pass, there is no backup surface, and backup_restore is declared as an
    # intentional skip via expected_na.
    install_record = {
        "tier": "install",
        "source": "generic",
        "file": "g/test_install.py",
        "rc": 0,
        "junit": _write(tmp_path, "i.xml", JUNIT_PASS),
    }
    custom_record = {
        "tier": "custom",
        "source": "cc-ci",
        "file": "c/test_serves_content.py",
        "rc": 0,
        "junit": _write(tmp_path, "c.xml", JUNIT_PASS),
    }
    # custom keeps its default (pass) → the functional rung passes; backup/restore are skipped.
    stage_results = _results(backup="skip", restore="skip")
    data = R.build_results(
        recipe="custom-html-tiny",
        version="1.1.0",
        pr="0",
        ref=None,
        records=[install_record, custom_record],
        results=stage_results,
        backup_capable=False,  # no backupbot label → backup_restore skipped (N/A)
        clean_teardown=True,
        no_secret_leak=True,
        finished_ts=0.0,
        expected_na={"backup_restore": "stateless static file server"},
    )
    # A skip never inflates the level: the climb stops at L2 even though functional passes above
    # the skipped rung, and the rung that capped it is the declared (intentional) one.
    assert data["level"] == 2
    assert "L3" in data["level_cap_reason"]
    assert data["level_cap_rung"] == "backup_restore"
    assert data["rungs"]["functional"] == "pass"
    # backup_restore is declared and functional passed → nothing is left as unintentional.
    intentional = data["skips"]["intentional"]
    assert intentional["backup_restore"] == "stateless static file server"
    assert data["skips"]["unintentional"] == []
 def test_write_results_roundtrip(tmp_path):
    data = {"run_id": "42", "level": 3, "stages": []}
    path = R.write_results(data, runs_dir_override=str(tmp_path))