feat(3 U0.2+U0.3): per-test results + results.json with computed level

harness/results.py: JUnit-XML parsing (stdlib) → per-stage/per-test rows; derive_rungs (documented mapping from tier + deps/SSO to rung); build_results assembles results.json {recipe, version, pr, ref, run_id, stages[], level, level_cap_reason, rungs, flags{clean_teardown, no_secret_leak}, screenshot, summary_card}; write_results (atomic). run_recipe_ci.py: tiers emit --junitxml and append {tier, source, file, rc, junit} records; main() assembles and writes results.json, wrapped so that a failure there NEVER changes the verdict (R7), including a narrow leak-scan of the serialised artifact. 17 new unit tests (test_results.py). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 05:55:52 +00:00
parent df54693449
commit 52e5d210d8
5 changed files with 819 additions and 63 deletions
--- a/tests/unit/test_level.py
+++ b/tests/unit/test_level.py
@@ -14,8 +14,14 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")
 from harness import level as L  # noqa: E402


-def _rungs(install="pass", upgrade="pass", backup_restore="pass", functional="pass",
-           integration="pass", recipe_local="pass"):
+def _rungs(
+    install="pass",
+    upgrade="pass",
+    backup_restore="pass",
+    functional="pass",
+    integration="pass",
+    recipe_local="pass",
+):
    return {
        "install": install,
        "upgrade": upgrade,
@@ -28,6 +34,7 @@ def _rungs(install="pass", upgrade="pass", backup_restore="pass", functional="pa

 # ---- the U0 gate: L4-pass and L2-cap ----

+
 def test_full_clean_climb_to_L6():
    lvl, reason = L.compute_level(_rungs())
    assert lvl == 6
@@ -50,6 +57,7 @@ def test_fails_at_L2_capped_at_L1():

 # ---- L0 / install ----

+
 def test_install_fail_is_L0():
    lvl, reason = L.compute_level(_rungs(install="fail"))
    assert lvl == 0
@@ -58,6 +66,7 @@ def test_install_fail_is_L0():

 # ---- gap-caps semantics: a higher pass can't rescue a lower gap ----

+
 def test_higher_pass_does_not_rescue_lower_na():
    # backup/restore N/A (stateless app) caps at L2 even though functional would pass.
    lvl, reason = L.compute_level(_rungs(backup_restore="na", functional="pass", integration="na"))
@@ -94,6 +103,7 @@ def test_functional_fail_caps_at_L3():

 # ---- input validation ----

+
 def test_invalid_status_raises():
    bad = _rungs()
    bad["functional"] = "passed"  # not in the vocabulary
@@ -106,6 +116,7 @@ def test_invalid_status_raises():

 # ---- helpers: backup_restore_status ----

+
 def test_backup_restore_status_pass():
    assert L.backup_restore_status("pass", "pass", True) == "pass"

@@ -126,6 +137,7 @@ def test_backup_restore_partial_is_na():

 # ---- helpers: tier_to_rung ----

+
 def test_tier_to_rung_mapping():
    assert L.tier_to_rung("pass") == "pass"
    assert L.tier_to_rung("fail") == "fail"