feat(3 U0.2+U0.3): per-test results + results.json with computed level

harness/results.py: JUnit-XML parsing (stdlib) → per-stage/per-test rows; derive_rungs (documented tier+deps/SSO → rung mapping); build_results assembles results.json {recipe,version,pr,ref,run_id, stages[],level,level_cap_reason,rungs,flags{clean_teardown,no_secret_leak},screenshot,summary_card}; write_results (atomic). run_recipe_ci.py: tiers emit --junitxml + append {tier,source,file,rc,junit} records; main() assembles+writes results.json wrapped so a failure NEVER changes the verdict (R7), incl. a narrow leak-scan of the serialised artifact. 17 new unit tests (test_results.py). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 05:55:52 +00:00
parent df54693449
commit 52e5d210d8
5 changed files with 819 additions and 63 deletions
--- a/runner/harness/level.py
+++ b/runner/harness/level.py
@ -66,7 +66,9 @@ def compute_level(rungs: dict[str, str]) -> tuple[int, str]:
    for name in RUNGS:
        st = rungs.get(name)
        if st not in VALID:
-            raise ValueError(f"rung {name!r} has invalid status {st!r} (expect one of {sorted(VALID)})")
+            raise ValueError(
+                f"rung {name!r} has invalid status {st!r} (expect one of {sorted(VALID)})"
+            )

    # L0: install did not pass.
    if rungs["install"] != "pass":
--- a/runner/harness/results.py
+++ b/runner/harness/results.py
@ -0,0 +1,268 @@
+"""Phase 3 — structured run results + results.json (plan-phase3-results-ux.md §4.2, R1/R3).
+
+Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
+  { recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
+    level, level_cap_reason, rungs, flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
+
+The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
+parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
+rung-status dict derived here (`derive_rungs`) from the tier results + deps/SSO signals the
+orchestrator holds; that mapping is documented in DECISIONS.md (Phase 3).
+
+This module is import-pure (no side effects at import). `write_results` is the only writer; the
+orchestrator calls the build/write path inside a try/except so a results failure NEVER changes the
+run's exit code (R7 — cosmetics never block the pipeline).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import xml.etree.ElementTree as ET
+
+from . import level as level_mod
+
+# Default root directory on the runner host for per-run artifacts (results.json, screenshot,
+# summary card); write_results() puts each run's files in a <run_id>/ subdirectory beneath it.
+# The dashboard serves these read-only at /runs/<run_id>/... (U0.4). Overridable (e.g. for tests)
+# through the CCCI_RUNS_DIR environment variable, which runs_dir() consults first.
+RUNS_DIR_DEFAULT = "/var/lib/cc-ci-runs"
+
+
+def runs_dir() -> str:
+    """Return the root directory under which per-run artifact dirs are created.
+
+    The CCCI_RUNS_DIR environment variable wins whenever it is set (even to an empty string);
+    otherwise RUNS_DIR_DEFAULT is returned.
+    """
+    configured = os.environ.get("CCCI_RUNS_DIR")
+    return RUNS_DIR_DEFAULT if configured is None else configured
+
+
+def run_id() -> str:
+    """Return a stable identifier for the current run.
+
+    Resolution order: the whitespace-stripped DRONE_BUILD_NUMBER (the id the PR comment and the
+    dashboard link to), then CCCI_APP_DOMAIN, then CCCI_RUN_ID, and finally the literal "manual"
+    so a hand-run still resolves to an artifact dir. Unset, empty, or (for the build number)
+    whitespace-only values are skipped.
+    """
+    env = os.environ
+    build_number = (env.get("DRONE_BUILD_NUMBER") or "").strip()
+    if build_number:
+        return build_number
+    for key in ("CCCI_APP_DOMAIN", "CCCI_RUN_ID"):
+        candidate = env.get(key)
+        if candidate:
+            return candidate
+    return "manual"
+
+
+def junit_file(junit_dir: str, tier: str, source: str, path: str) -> str:
+    """Build the JUnit XML path for one (tier, source, test file) triple.
+
+    The file name is "<tier>__<source>__<test file stem>.xml" with every "/" and os.sep replaced
+    by "_" so it always stays a single path component directly under junit_dir.
+    """
+    stem, _ext = os.path.splitext(os.path.basename(path))
+    flat = "__".join((tier, source, stem))
+    for separator in ("/", os.sep):
+        flat = flat.replace(separator, "_")
+    return os.path.join(junit_dir, f"{flat}.xml")
+
+
+def _case_status(case: ET.Element) -> tuple[str, str]:
+    """Classify one <testcase> element as (status, message).
+
+    The first direct child found among <error>, <failure>, <skipped> (checked in that order)
+    decides the status ("error" / "fail" / "skip"); its stripped `message` attribute is the
+    message. A testcase with none of those children is ("pass", "").
+    """
+    status_by_tag = {"error": "error", "failure": "fail", "skipped": "skip"}
+    for tag, status in status_by_tag.items():
+        child = case.find(tag)
+        if child is None:
+            continue
+        text = child.get("message") or ""
+        return status, text.strip()
+    return "pass", ""
+
+
+def parse_junit(xml_path: str) -> list[dict]:
+    """Parse one JUnit XML file → list of per-test rows {name, classname, status, ms, message}.
+
+    Tolerant by design:
+      * a missing, unreadable or malformed file yields [];
+      * a <testcase> whose `time` attribute is not a finite number (non-numeric, NaN, or
+        infinite) gets ms=0 instead of aborting the whole parse.
+
+    `ms` is the `time` attribute (seconds) converted to whole milliseconds. `name` defaults to
+    "?" and `classname` to "" when the attribute is absent.
+    """
+    try:
+        tree = ET.parse(xml_path)
+    except (OSError, ET.ParseError):
+        return []
+    rows: list[dict] = []
+    for case in tree.iter("testcase"):
+        status, message = _case_status(case)
+        try:
+            ms = int(round(float(case.get("time", "0")) * 1000))
+        except (TypeError, ValueError, OverflowError):
+            # ValueError: non-numeric text or NaN; OverflowError: round() of +/-infinity.
+            ms = 0
+        rows.append(
+            {
+                "name": case.get("name", "?"),
+                "classname": case.get("classname", ""),
+                "status": status,
+                "ms": ms,
+                "message": message,
+            }
+        )
+    return rows
+
+
+def _stage_status(tests: list[dict]) -> str:
+    """Collapse a stage's per-test rows into one stage status.
+
+    "fail" if any row is "fail" or "error"; otherwise "pass" if at least one row passed;
+    otherwise "skip" (every row skipped, or no rows at all).
+    """
+    seen = {row["status"] for row in tests}
+    if seen & {"fail", "error"}:
+        return "fail"
+    return "pass" if "pass" in seen else "skip"
+
+
+def collect_stages(records: list[dict]) -> list[dict]:
+    """Fold per-file run records into ordered stage dicts carrying their per-test rows.
+
+    Each record is {tier, source, file, rc, junit}. Rows come from the record's JUnit XML; when
+    that yields nothing (no path recorded, or pytest crashed before writing it) a single
+    synthetic row named after the test file stands in, passing only if rc == 0, so a crash is
+    never rendered as "no tests". Every row is tagged with the record's `source`.
+
+    Returns one {name, status, tests} dict per tier that has rows, in the fixed order
+    install, upgrade, backup, restore, custom; records of any other tier are not emitted.
+    """
+    grouped: dict[str, list[dict]] = {}
+    for record in records:
+        origin = record.get("source", "")
+        junit_path = record.get("junit")
+        rows = parse_junit(junit_path) if junit_path else []
+        if not rows:
+            # Nothing parsed — derive one row from the exit code instead.
+            succeeded = record.get("rc", 1) == 0
+            rows = [
+                {
+                    "name": os.path.basename(record.get("file", "?")),
+                    "classname": origin,
+                    "status": "pass" if succeeded else "fail",
+                    "ms": 0,
+                    "message": "" if succeeded else "tier produced no JUnit; exit!=0",
+                }
+            ]
+        for row in rows:
+            row["source"] = origin
+        grouped.setdefault(record["tier"], []).extend(rows)
+    return [
+        {"name": tier, "status": _stage_status(grouped[tier]), "tests": grouped[tier]}
+        for tier in ("install", "upgrade", "backup", "restore", "custom")
+        if tier in grouped
+    ]
+
+
+def _has_repo_local(records: list[dict]) -> bool:
+    """True when at least one run record has source "repo-local"."""
+    for record in records:
+        if record.get("source") == "repo-local":
+            return True
+    return False
+
+
+def _repo_local_passed(records: list[dict]) -> bool:
+    """True when there is at least one "repo-local" record and every one of them exited 0.
+
+    A repo-local record without an `rc` counts as a failure.
+    """
+    found_any = False
+    for record in records:
+        if record.get("source") != "repo-local":
+            continue
+        found_any = True
+        if record.get("rc", 1) != 0:
+            return False
+    return found_any
+
+
+def derive_rungs(
+    results: dict[str, str],
+    *,
+    backup_capable: bool,
+    declared: list[str] | None,
+    deps_ready: bool,
+    sso_unverified: bool,
+    has_custom: bool,
+    has_repo_local: bool,
+    repo_local_passed: bool,
+) -> dict[str, str]:
+    """Translate the orchestrator's tier results + deps/SSO signals into the rung-status dict
+    harness.level consumes. Documented in DECISIONS.md (Phase 3). Conservative by design — never
+    reports a rung 'pass' it can't substantiate (cardinal guardrail: presentation never inflates).
+
+      L1 install    : install tier pass.
+      L2 upgrade    : upgrade tier (skip → N/A: only one published version).
+      L3 backup/res : backup AND restore tiers pass (N/A if not backup-capable).
+      L4 functional : the recipe-specific functional (non-deps) tests pass — the custom tier, minus
+                      its SSO/integration tests. N/A if the recipe has no custom tests at all.
+                      Only an explicit custom == "pass" yields 'pass'; "fail" or any unrecognised
+                      status yields 'fail'.
+      L5 integration: SSO/OIDC + cross-app. Applies ONLY if the recipe declares deps (else N/A — the
+                      "no integration surface caps at L4" rule, §4.1). pass iff deps wired
+                      (deps_ready) and not sso_unverified and the custom tier passed.
+      L6 recipe-loc : the recipe repo's own tests/ (repo-local source) ran and passed (N/A if none).
+
+    Args:
+        results: tier name → tier status as held by the orchestrator; missing tiers read as None.
+        backup_capable: forwarded to level.backup_restore_status for the L3 rung.
+        declared: the recipe's declared deps; None or empty means no integration surface.
+        deps_ready: whether the declared deps were wired for this run.
+        sso_unverified: whether SSO could not be verified.
+        has_custom: whether any custom-tier test file ran.
+        has_repo_local: whether any repo-local test file ran.
+        repo_local_passed: whether every repo-local test file exited 0.
+
+    Returns:
+        Dict with keys install, upgrade, backup_restore, functional, integration, recipe_local.
+        The last three are always one of "pass" / "fail" / "na"; the first three are whatever
+        harness.level's helpers return.
+    """
+    declared = declared or []
+    rungs: dict[str, str] = {}
+    rungs["install"] = level_mod.tier_to_rung(results.get("install"))
+    rungs["upgrade"] = level_mod.tier_to_rung(results.get("upgrade"))
+    rungs["backup_restore"] = level_mod.backup_restore_status(
+        results.get("backup"), results.get("restore"), backup_capable
+    )
+
+    custom = results.get("custom")
+    # Functional rung (L4): the non-deps custom tests.
+    if not has_custom or custom == "skip" or custom is None:
+        rungs["functional"] = "na"
+    elif custom == "pass":
+        rungs["functional"] = "pass"
+    else:
+        # custom == "fail", or a status we don't recognise. With declared deps we cannot cheaply
+        # tell functional-vs-SSO apart, and an unknown status substantiates nothing, so
+        # conservatively fail the functional rung (caps at L3) — never inflate.
+        rungs["functional"] = "fail"
+
+    # Integration rung (L5): only recipes with an SSO/integration surface (declared deps) can climb.
+    if not declared:
+        rungs["integration"] = "na"
+    elif sso_unverified or not deps_ready or custom == "fail":
+        # SSO not wired/verified, or a custom test failed → integration not verified.
+        rungs["integration"] = "fail"
+    elif custom == "pass":
+        rungs["integration"] = "pass"
+    else:
+        # declared deps but no custom tests ran — can't claim integration verified
+        rungs["integration"] = "na"
+
+    # Recipe-local rung (L6).
+    if not has_repo_local:
+        rungs["recipe_local"] = "na"
+    else:
+        rungs["recipe_local"] = "pass" if repo_local_passed else "fail"
+    return rungs
+
+
+def build_results(
+    *,
+    recipe: str,
+    version: str | None,
+    pr: str,
+    ref: str | None,
+    records: list[dict],
+    results: dict[str, str],
+    backup_capable: bool,
+    declared: list[str] | None,
+    deps_ready: bool,
+    sso_unverified: bool,
+    clean_teardown: bool,
+    no_secret_leak: bool,
+    finished_ts: float | None,
+    screenshot: str | None = None,
+    summary_card: str | None = None,
+) -> dict:
+    """Build the complete results.json payload as a plain dict, without touching the disk.
+
+    Stages come from collect_stages(records), rungs from derive_rungs(), and the integer level
+    plus its cap reason from harness.level.compute_level. The caller supplies `finished_ts`
+    rather than this function reading a clock. `pr` is stringified, `ref` is truncated to its
+    first 12 characters ("" when None/empty), and both flags are coerced to bool.
+    """
+    stage_rows = collect_stages(records)
+    custom_ran = False
+    for record in records:
+        if record["tier"] == "custom":
+            custom_ran = True
+            break
+    rung_map = derive_rungs(
+        results,
+        backup_capable=backup_capable,
+        declared=declared,
+        deps_ready=deps_ready,
+        sso_unverified=sso_unverified,
+        has_custom=custom_ran,
+        has_repo_local=_has_repo_local(records),
+        repo_local_passed=_repo_local_passed(records),
+    )
+    computed_level, cap_reason = level_mod.compute_level(rung_map)
+    payload = {
+        "schema": 1,
+        "run_id": run_id(),
+        "recipe": recipe,
+        "version": version,
+        "pr": str(pr),
+        "ref": ref[:12] if ref else "",
+        "finished": finished_ts,
+        "level": computed_level,
+        "level_cap_reason": cap_reason,
+        "rungs": rung_map,
+        "stages": stage_rows,
+        "results": results,
+        "flags": {
+            "clean_teardown": bool(clean_teardown),
+            "no_secret_leak": bool(no_secret_leak),
+        },
+        "screenshot": screenshot,
+        "summary_card": summary_card,
+    }
+    return payload
+
+
+def write_results(data: dict, runs_dir_override: str | None = None) -> str:
+    """Atomically write results.json into the run's artifact dir and return its path.
+
+    The target is <runs dir>/<data["run_id"]>/results.json, where the runs dir is
+    `runs_dir_override` when given (truthy) and runs_dir() otherwise; the directory is created if
+    missing. The JSON is first written to a sibling "results.json.tmp" and then moved into place
+    with os.replace, so a reader never observes a half-written results.json.
+
+    Raises:
+        KeyError: `data` has no "run_id".
+        TypeError / ValueError: `data` is not JSON-serialisable.
+        OSError: the directory or file could not be created, written or renamed.
+    On any failure after the temp file was opened, the temp file is removed before re-raising
+    so a stale ".tmp" is not left behind in the artifact dir.
+    """
+    rd = runs_dir_override or runs_dir()
+    out_dir = os.path.join(rd, data["run_id"])
+    os.makedirs(out_dir, exist_ok=True)
+    path = os.path.join(out_dir, "results.json")
+    tmp = path + ".tmp"
+    try:
+        with open(tmp, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2, sort_keys=True)
+        os.replace(tmp, path)
+    except BaseException:
+        # Best-effort cleanup of the partial temp file; the original error is what matters.
+        try:
+            os.unlink(tmp)
+        except OSError:
+            pass
+        raise
+    return path