harness/results.py: JUnit-XML parsing (stdlib) → per-stage/per-test rows; derive_rungs (documented
tier+deps/SSO → rung mapping); build_results assembles results.json {recipe,version,pr,ref,run_id,
stages[],level,level_cap_reason,rungs,flags{clean_teardown,no_secret_leak},screenshot,summary_card};
write_results (atomic). run_recipe_ci.py: tiers emit --junitxml + append {tier,source,file,rc,junit}
records; main() assembles+writes results.json wrapped so a failure NEVER changes the verdict (R7),
incl. a narrow leak-scan of the serialised artifact. 17 new unit tests (test_results.py).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
121 lines
5.6 KiB
Python
121 lines
5.6 KiB
Python
"""Phase 3 — the level ladder (plan-phase3-results-ux.md §4.1, R1).

A single integer **level** summarising how far up the quality ladder a recipe run climbed, with
YunoHost semantics: **a gap caps the level** — you only earn level L if every rung 1..L was a clean
PASS. The first rung that is not a clean PASS (a real FAIL *or* genuinely N/A for this recipe) stops
the climb; `cap_reason` records why. This is deliberately conservative: presentation must NEVER make
a run look greener than its tests (plan §6 cardinal guardrail), so an N/A rung caps just like a fail
(the L5 example in §4.1 — "recipes with no integration surface cap at L4 by definition" — is exactly
this: N/A caps, with a recorded reason so the level is *fair*, not inflated).

The ladder (§4.1):
  L0 — install failed / app never became healthy.
  L1 — Installs: deploys + passes health/readiness.
  L2 — Upgrades: previous published version → PR version, stays healthy, data intact.
  L3 — Backup/restore: seeded data survives backup → wipe → restore.
  L4 — Functional: recipe-specific functional tests pass.
  L5 — Integration: SSO/OIDC + cross-app integration tests pass.
  L6 — Recipe-local: the recipe repo's own tests/ (D4) pass and are merged.

This module is PURE (no I/O) so it is cheaply unit-testable and the Adversary can re-run the unit
test cold (`cc-ci-run -m pytest tests/unit/test_level.py -q`). The orchestrator
(`run_recipe_ci.py`) is responsible for translating its raw per-tier results + deps/SSO signals into
the rung-status dict this function consumes; that mapping is documented in DECISIONS.md (Phase 3).

Rung status vocabulary (each rung ∈ these three):
  "pass" — the rung was exercised and passed.
  "fail" — the rung was exercised and failed.
  "na"   — the rung does not apply to this recipe (e.g. only one published version → no upgrade;
           not backup-capable; no SSO/integration surface; no recipe-local tests). N/A is NOT a
           failure, but it DOES cap the climb (with a distinct cap_reason) so the level never
           overstates what was actually verified.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
# Ordered rung names, lowest first. The 1-based position of a name in this tuple is its ladder
# level: install is L1 and underpins everything, so L0 means install itself did not pass. A rung
# only counts when every rung before it was a clean PASS.
RUNGS = (
    "install",
    "upgrade",
    "backup_restore",
    "functional",
    "integration",
    "recipe_local",
)

# Display label for each ladder level (keys 1..6), used in cap_reason and on the summary card.
RUNG_LABEL = dict(
    enumerate(
        (
            "install (deploy + health)",
            "upgrade (prev published → PR)",
            "backup/restore (data integrity)",
            "functional (recipe-specific tests)",
            "integration (SSO/OIDC + cross-app)",
            "recipe-local (recipe repo tests/)",
        ),
        start=1,
    )
)

# The only statuses a rung may carry.
VALID = {"na", "fail", "pass"}
|
|
|
|
|
|
def compute_level(rungs: dict[str, str]) -> tuple[int, str]:
    """Map a rung-status dict → (level 0..6, cap_reason).

    `rungs` must contain a status in {"pass","fail","na"} for every name in RUNGS. The level is the
    highest L such that rungs[1..L] are all "pass"; the first non-"pass" rung caps the climb. L0 is
    returned when the install rung itself is not "pass" (install failed / never healthy).

    cap_reason explains where the climb stopped:
      - "" (empty) when the recipe earned the top rung (L6, full clean climb).
      - "L<k> <label> FAILED" when a rung was exercised and failed.
      - "L<k> <label> N/A" when a rung does not apply to this recipe.
    Returns the reason for the FIRST rung that stopped the climb (the binding constraint).

    Raises:
        ValueError: if any rung named in RUNGS is missing from `rungs` or carries a status that is
            not one of VALID. Every rung is validated, including those above the cap, so a
            malformed dict is rejected even when an early rung would already stop the climb.
    """
    # Validate everything up front, collecting statuses in ladder order for the climb below.
    statuses = []
    for name in RUNGS:
        st = rungs.get(name)
        # The isinstance guard keeps unhashable junk (e.g. a list) from blowing up the set
        # membership test with TypeError; callers are promised ValueError for any bad status.
        if not isinstance(st, str) or st not in VALID:
            raise ValueError(
                f"rung {name!r} has invalid status {st!r} (expect one of {sorted(VALID)})"
            )
        statuses.append(st)

    # Climb: the first rung that is not a clean pass caps the level at the rung below it. This
    # also covers L0 — a non-pass install (idx 1) yields level 0 with an "L1 ..." reason.
    for idx, st in enumerate(statuses, start=1):
        if st != "pass":
            kind = "FAILED" if st == "fail" else "N/A"
            return idx - 1, f"L{idx} {RUNG_LABEL[idx]} {kind}"

    # Full clean climb to the top rung.
    return len(statuses), ""
|
|
|
|
|
|
def backup_restore_status(backup: str | None, restore: str | None, backup_capable: bool) -> str:
    """Fold the backup and restore tier results into the one L3 rung status.

    The rung claims "seeded data survives backup→wipe→restore", so it is only a pass when BOTH
    tiers passed. Outcomes:
      - recipe not backup-capable        → "na" (tier results are not consulted).
      - either tier is "fail"            → "fail".
      - both tiers are "pass"            → "pass".
      - anything else (a skip or None while backup-capable) → "na": the rung was not verified,
        so L3 cannot be claimed.
    """
    if backup_capable:
        outcomes = {backup, restore}
        if "fail" in outcomes:
            return "fail"
        # The set collapses to exactly {"pass"} only when both tiers reported "pass".
        if outcomes == {"pass"}:
            return "pass"
    # Not backup-capable, or capable but not fully exercised.
    return "na"
|
|
|
|
|
|
def tier_to_rung(status: str | None) -> str:
    """Translate one tier result into a rung status.

    "pass" and "fail" map to themselves. Anything else — 'skip', None, or any other value — maps
    to "na" (the tier did not apply / did not run), so it caps the climb without being counted as
    a failure.
    """
    for recognised in ("pass", "fail"):
        if status == recognised:
            return recognised
    return "na"
|