# cc-ci/runner/harness/results.py

"""Phase 3 — structured run results + results.json (plan-phase3-results-ux.md §4.2, R1/R3).

Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
  { recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
    level, level_cap_reason, rungs, flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
plus the extras `build_results` adds on top of the plan's shape: `schema`, `level_cap_intent`, `na`
and the raw per-tier `results`.

The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
rung-status dict derived here (`derive_rungs`) from the tier results + deps/SSO signals the
orchestrator holds; that mapping is documented in DECISIONS.md (Phase 3).

This module is import-pure (no side effects at import). `write_results` is the only writer; the
orchestrator calls the build/write path inside a try/except so a results failure NEVER changes the
run's exit code (R7 — cosmetics never block the pipeline).
"""

from __future__ import annotations

import json
import os
import xml.etree.ElementTree as ET

from . import level as level_mod

# Where per-run artifacts (results.json, screenshot, summary card) are written on the runner host.
# The dashboard serves these read-only at /runs/<run_id>/... (U0.4). This is only the default:
# `runs_dir()` lets the CCCI_RUNS_DIR environment variable override it (e.g. for tests).
RUNS_DIR_DEFAULT = "/var/lib/cc-ci-runs"


def runs_dir() -> str:
    """Root directory for per-run artifacts: $CCCI_RUNS_DIR when that variable is set (even to an
    empty string), otherwise RUNS_DIR_DEFAULT."""
    override = os.environ.get("CCCI_RUNS_DIR")
    if override is None:
        return RUNS_DIR_DEFAULT
    return override


def run_id() -> str:
    """Identifier for the current run, used to name its artifact directory.

    Resolution order: the whitespace-stripped DRONE_BUILD_NUMBER (the id the PR comment and
    dashboard link to), then CCCI_APP_DOMAIN, then CCCI_RUN_ID, and finally the literal "manual".
    Unset and empty values are skipped at every step, so a hand-run still gets a usable id.
    """
    build_number = (os.environ.get("DRONE_BUILD_NUMBER") or "").strip()
    if build_number:
        return build_number
    # Fallbacks are returned verbatim (not stripped).
    for variable in ("CCCI_APP_DOMAIN", "CCCI_RUN_ID"):
        candidate = os.environ.get(variable)
        if candidate:
            return candidate
    return "manual"


def junit_file(junit_dir: str, tier: str, source: str, path: str) -> str:
    """Build the JUnit XML path for one (tier, source, test file) triple.

    The file name is `<tier>__<source>__<stem>.xml`, where `<stem>` is the basename of `path`
    without its final extension. Any "/" or platform path separator in that name is flattened to
    "_" so the result always sits directly inside `junit_dir`.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    flat_name = "{}__{}__{}".format(tier, source, stem)
    for separator in ("/", os.sep):
        flat_name = flat_name.replace(separator, "_")
    return os.path.join(junit_dir, flat_name + ".xml")


def _case_status(case: ET.Element) -> tuple[str, str]:
    """(status, message) for one <testcase>. JUnit: child <failure>/<error>/<skipped>, else passed."""
    for tag, st in (("error", "error"), ("failure", "fail"), ("skipped", "skip")):
        el = case.find(tag)
        if el is not None:
            return st, (el.get("message") or "").strip()
    return "pass", ""


def parse_junit(xml_path: str) -> list[dict]:
    """Parse one JUnit XML file → list of per-test rows {name, classname, status, ms, message}.

    Tolerant by design: a missing, unreadable or malformed file yields [], and a <testcase> whose
    `time` attribute cannot be turned into a finite number of milliseconds (empty, non-numeric,
    "nan", "inf") is reported with ms=0 instead of aborting the whole parse.

    Every <testcase> anywhere in the document is included, in document order. `name` defaults to
    "?" and `classname` to "" when the attribute is absent; `status`/`message` come from
    `_case_status`.
    """
    try:
        tree = ET.parse(xml_path)
    except (OSError, ET.ParseError):
        return []
    rows: list[dict] = []
    for case in tree.iter("testcase"):
        status, message = _case_status(case)
        try:
            # JUnit reports seconds as a decimal string; store whole milliseconds.
            ms = int(round(float(case.get("time", "0")) * 1000))
        except (TypeError, ValueError, OverflowError):
            # ValueError: unparsable text or NaN; OverflowError: round() of an infinite duration.
            ms = 0
        rows.append(
            {
                "name": case.get("name", "?"),
                "classname": case.get("classname", ""),
                "status": status,
                "ms": ms,
                "message": message,
            }
        )
    return rows


def _stage_status(tests: list[dict]) -> str:
    """Roll per-test rows up to a stage status. Any error/fail → fail; else if any pass → pass;
    else (all skipped / empty) → skip."""
    sts = {t["status"] for t in tests}
    if "fail" in sts or "error" in sts:
        return "fail"
    if "pass" in sts:
        return "pass"
    return "skip"


def collect_stages(records: list[dict]) -> list[dict]:
    """Fold per-file run records into one stage dict per tier, each with its per-test rows.

    `records` items: {tier, source, file, rc, junit}. A record's rows come from its JUnit XML when
    `junit` is set and the file yields at least one test; otherwise (no path, unreadable file, or
    pytest died before writing it) a single synthetic row named after the test file is derived
    from the exit code, so a crash shows up as a failure rather than as "no tests".

    Every row is tagged with its record's `source`. Stages are emitted in the fixed order
    install → upgrade → backup → restore → custom; only tiers that have records appear, and tiers
    outside that list are not emitted.
    """
    tier_order = ("install", "upgrade", "backup", "restore", "custom")
    rows_by_tier: dict[str, list[dict]] = {}
    for record in records:
        junit_path = record.get("junit")
        rows = parse_junit(junit_path) if junit_path else []
        if not rows:
            # Nothing parsed: stand in one row whose status mirrors the process exit code.
            succeeded = record.get("rc", 1) == 0
            rows = [
                {
                    "name": os.path.basename(record.get("file", "?")),
                    "classname": record.get("source", ""),
                    "status": "pass" if succeeded else "fail",
                    "ms": 0,
                    "message": "" if succeeded else "tier produced no JUnit; exit!=0",
                }
            ]
        for row in rows:
            row["source"] = record.get("source", "")
        rows_by_tier.setdefault(record["tier"], []).extend(rows)
    return [
        {"name": tier, "status": _stage_status(rows_by_tier[tier]), "tests": rows_by_tier[tier]}
        for tier in tier_order
        if tier in rows_by_tier
    ]


def _has_repo_local(records: list[dict]) -> bool:
    return any(r.get("source") == "repo-local" for r in records)


def _repo_local_passed(records: list[dict]) -> bool:
    repo = [r for r in records if r.get("source") == "repo-local"]
    return bool(repo) and all(r.get("rc", 1) == 0 for r in repo)


def derive_rungs(
    results: dict[str, str],
    *,
    backup_capable: bool,
    declared: list[str] | None,
    deps_ready: bool,
    sso_unverified: bool,
    has_custom: bool,
    has_repo_local: bool,
    repo_local_passed: bool,
) -> dict[str, str]:
    """Translate the orchestrator's tier results + deps/SSO signals into the rung-status dict
    harness.level consumes. Documented in DECISIONS.md (Phase 3). Conservative by design — never
    reports a rung 'pass' it can't substantiate (cardinal guardrail: presentation never inflates).

      L1 install    : install tier pass.
      L2 upgrade    : upgrade tier (skip → N/A: only one published version).
      L3 backup/res : backup AND restore tiers pass (N/A if not backup-capable).
      L4 functional : the recipe-specific functional (non-deps) tests pass — the custom tier, minus
                      its SSO/integration tests. N/A if the recipe has no custom tests at all or
                      the custom tier was skipped / has no result. Only an explicit "pass" result
                      yields pass; "fail" and any unrecognised status yield fail.
      L5 integration: SSO/OIDC + cross-app. Applies ONLY if the recipe declares deps (else N/A — the
                      "no integration surface caps at L4" rule, §4.1). pass iff deps wired
                      (deps_ready) and not sso_unverified and the custom tier passed; N/A when deps
                      are declared and healthy but the custom tier did not produce pass/fail.
      L6 recipe-loc : the recipe repo's own tests/ (repo-local source) ran and passed (N/A if none).

    Args:
        results: tier name → status as reported by the orchestrator; missing tiers read as None.
        backup_capable: forwarded to level.backup_restore_status for the L3 rung.
        declared: the recipe's declared deps; None or empty means no integration surface.
        deps_ready: whether the declared deps were wired up.
        sso_unverified: whether SSO could not be verified.
        has_custom: whether any custom-tier record exists for this run.
        has_repo_local: whether any repo-local test ran.
        repo_local_passed: whether all repo-local tests passed.

    Returns:
        Dict with keys install, upgrade, backup_restore, functional, integration, recipe_local.
        The last three are always one of "pass" / "fail" / "na"; the first three are whatever
        harness.level's helpers return.
    """
    declared = declared or []
    custom = results.get("custom")
    rungs: dict[str, str] = {
        "install": level_mod.tier_to_rung(results.get("install")),
        "upgrade": level_mod.tier_to_rung(results.get("upgrade")),
        "backup_restore": level_mod.backup_restore_status(
            results.get("backup"), results.get("restore"), backup_capable
        ),
    }

    # Functional rung (L4): the non-deps custom tests.
    if not has_custom or custom is None or custom == "skip":
        rungs["functional"] = "na"
    elif custom == "pass":
        rungs["functional"] = "pass"
    else:
        # custom == "fail", or a status this module does not recognise. With declared deps we
        # cannot cheaply tell functional-vs-SSO apart, and an unknown status substantiates
        # nothing, so conservatively fail the functional rung (caps at L3) — never inflate.
        rungs["functional"] = "fail"

    # Integration rung (L5): only recipes with an SSO/integration surface (declared deps) can climb.
    if not declared:
        rungs["integration"] = "na"
    elif sso_unverified or not deps_ready or custom == "fail":
        # SSO not wired/verified, or a custom test failed → integration not verified.
        rungs["integration"] = "fail"
    elif custom == "pass":
        rungs["integration"] = "pass"
    else:
        # declared deps but no custom tests ran — can't claim integration verified
        rungs["integration"] = "na"

    # Recipe-local rung (L6).
    if not has_repo_local:
        rungs["recipe_local"] = "na"
    else:
        rungs["recipe_local"] = "pass" if repo_local_passed else "fail"
    return rungs


# Rungs where an *undeclared* N/A is suspicious — it usually means a recipe SHOULD have this coverage
# but nobody added it (a backup label, a functional test), i.e. an accidental gap rather than a real
# property of the recipe. For these, an undeclared N/A is surfaced as a "possible coverage gap" unless
# the recipe declares it intentional via recipe_meta.EXPECTED_NA. The other rungs (upgrade — only one
# published version; integration — no SSO surface; recipe_local — no repo-local tests) are
# *structurally* optional: an N/A there is the normal case and is not flagged.
# Read by classify_na (to build its "gaps" list) and by cap_intent (to word the cap explanation).
GAP_SENSITIVE_RUNGS = ("backup_restore", "functional")


def classify_na(rungs: dict[str, str], expected_na: dict | None) -> dict:
    """Label every N/A rung as deliberately or accidentally N/A (operator request).

    Recipes opt a rung out via `recipe_meta.EXPECTED_NA = {rung: reason}`, passed here as
    `expected_na` (keys and reasons are stringified; None means nothing declared). This changes
    nothing about the rung statuses themselves — it only EXPLAINS them, so a reviewer can tell
    "this recipe legitimately has no backup surface" from "someone forgot to add the backup test".

    Returns:
      { "rungs": {rung: {"intent": "declared"|"undeclared", "reason": str}},  # one per N/A rung
        "gaps": [rung, ...],            # gap-sensitive rungs that are N/A and NOT declared
        "stale_declared": [rung, ...] } # sorted; declared N/A but the rung has a non-N/A status
    """
    declared_reasons = {str(rung): str(reason) for rung, reason in (expected_na or {}).items()}

    na_rungs: dict[str, dict] = {
        rung: (
            {"intent": "declared", "reason": declared_reasons[rung]}
            if rung in declared_reasons
            else {"intent": "undeclared", "reason": ""}
        )
        for rung, status in rungs.items()
        if status == "na"
    }

    # Keep GAP_SENSITIVE_RUNGS order so the gap list is stable across runs.
    coverage_gaps = [
        rung
        for rung in GAP_SENSITIVE_RUNGS
        if rung in na_rungs and na_rungs[rung]["intent"] == "undeclared"
    ]

    # A declared opt-out whose rung has a status other than N/A is stale; rungs that are absent
    # from `rungs` altogether are not reported.
    stale_opt_outs = sorted(
        rung for rung in declared_reasons if rungs.get(rung) not in (None, "na")
    )

    return {"rungs": na_rungs, "gaps": coverage_gaps, "stale_declared": stale_opt_outs}


def cap_intent(rungs: dict[str, str], level: int, cap_reason: str, na_info: dict) -> str:
    """Short clause explaining a level cap that is caused by an N/A rung.

    The capping rung is `level_mod.RUNGS[level]`. Returns "" when there is no cap reason, when
    `level` is outside RUNGS, or when that rung is not N/A. Otherwise returns the declared reason
    (prefixed "intentional · ") if `na_info` marks the rung as declared, a "possible coverage gap"
    note if the rung is gap-sensitive, and "" for any other undeclared N/A.

    `na_info` is expected to have the shape classify_na returns (its "rungs" mapping is read).
    """
    if not cap_reason:
        return ""
    if not (0 <= level < len(level_mod.RUNGS)):
        return ""
    capping_rung = level_mod.RUNGS[level]
    if not capping_rung or rungs.get(capping_rung) != "na":
        return ""

    details = na_info["rungs"].get(capping_rung, {})
    if details.get("intent") == "declared":
        explanation = f"intentional · {details['reason']}"
    elif capping_rung in GAP_SENSITIVE_RUNGS:
        explanation = "undeclared N/A — possible coverage gap (add a test or declare EXPECTED_NA)"
    else:
        explanation = ""
    return explanation


def build_results(
    *,
    recipe: str,
    version: str | None,
    pr: str,
    ref: str | None,
    records: list[dict],
    results: dict[str, str],
    backup_capable: bool,
    declared: list[str] | None,
    deps_ready: bool,
    sso_unverified: bool,
    clean_teardown: bool,
    no_secret_leak: bool,
    finished_ts: float | None,
    screenshot: str | None = None,
    summary_card: str | None = None,
    expected_na: dict | None = None,
) -> dict:
    """Build the complete results.json payload as a plain dict; performs no file I/O.

    The completion timestamp is supplied by the caller (`finished_ts`, stamped by the
    orchestrator) rather than read from the clock, which keeps the output reproducible in unit
    tests. `expected_na` is the recipe's declared intentional-N/A map (recipe_meta.EXPECTED_NA),
    used to tell a deliberate skip from accidentally-missing coverage.

    Normalisations applied here: `pr` is stringified, `ref` is truncated to its first 12
    characters (None → ""), the two flags are coerced to bool, and `run_id` is taken from
    `run_id()`. `results` is embedded as passed in, not copied.
    """
    stage_list = collect_stages(records)
    custom_present = any(record["tier"] == "custom" for record in records)
    rung_status = derive_rungs(
        results,
        backup_capable=backup_capable,
        declared=declared,
        deps_ready=deps_ready,
        sso_unverified=sso_unverified,
        has_custom=custom_present,
        has_repo_local=_has_repo_local(records),
        repo_local_passed=_repo_local_passed(records),
    )
    level_value, level_cap = level_mod.compute_level(rung_status)
    na_breakdown = classify_na(rung_status, expected_na)
    cap_explanation = cap_intent(rung_status, level_value, level_cap, na_breakdown)

    short_ref = (ref or "")[:12]
    run_flags = {
        "clean_teardown": bool(clean_teardown),
        "no_secret_leak": bool(no_secret_leak),
    }
    payload = {
        "schema": 1,
        "run_id": run_id(),
        "recipe": recipe,
        "version": version,
        "pr": str(pr),
        "ref": short_ref,
        "finished": finished_ts,
        "level": level_value,
        "level_cap_reason": level_cap,
        "level_cap_intent": cap_explanation,
        "rungs": rung_status,
        "na": na_breakdown,
        "stages": stage_list,
        "results": results,
        "flags": run_flags,
        "screenshot": screenshot,
        "summary_card": summary_card,
    }
    return payload


def write_results(data: dict, runs_dir_override: str | None = None) -> str:
    """Write results.json into the run's artifact dir; return its path. Creates the dir."""
    rd = runs_dir_override or runs_dir()
    out_dir = os.path.join(rd, data["run_id"])
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "results.json")
    tmp = path + ".tmp"
    with open(tmp, "w") as f:
        json.dump(data, f, indent=2, sort_keys=True)
    os.replace(tmp, path)
    return path