Revert "feat(lvl5): P1 — 5-rung ladder (L5=abra recipe lint) + de-capped level semantics"
All checks were successful
continuous-integration/drone/push Build is passing

This reverts commit e219a7891d.
This commit is contained in:
autonomic-bot
2026-06-11 07:46:57 +00:00
parent 589943f46e
commit cd62743055
12 changed files with 336 additions and 1065 deletions

View File

@ -21,24 +21,23 @@ from __future__ import annotations
import html
import os
# Level → colour ramp (YunoHost-ish): red at the floor, climbing to green at the top (L5 = full
# clean climb incl. lint — phase lvl5).
# Level → colour ramp (YunoHost-ish): red at the floor, climbing to green at the top.
LEVEL_COLOR = {
0: "#e5534b", # red — install failed
1: "#e0823d", # orange
2: "#e0823d",
3: "#d9b343", # amber
4: "#a0b93f", # yellow-green — above functional, lint not earned
5: "#3fb950", # bright green — full climb (lint passed)
4: "#a0b93f", # yellow-green
5: "#57ab5a", # green
6: "#3fb950", # bright green — full climb
}
STATUS_MARK = {"pass": "", "fail": "", "skip": "", "error": "", "na": "", "unver": ""}
STATUS_MARK = {"pass": "", "fail": "", "skip": "", "error": "", "na": ""}
STATUS_COLOR = {
"pass": "#3fb950",
"fail": "#f85149",
"error": "#f85149",
"skip": "#8b949e",
"na": "#8b949e",
"unver": "#d29922", # amber — exercised? no: should have run and wasn't verified
}
@ -80,15 +79,44 @@ def render_badge_svg(label: str, message: str, color: str) -> str:
)
# Amber for UNVERIFIED rung rows in the table (a rung that should have run and wasn't checked).
# Third-segment colours for the level badge: amber = an UNINTENTIONAL skip (a rung skipped but not
# in the recipe's intentional list — likely missing coverage) capped the climb; muted = an
# INTENTIONAL skip (declared in recipe_meta.EXPECTED_NA — nothing to fix). Font-safe text labels
# (no emoji) so the SVG renders anywhere.
GAP_COLOR = "#d29922"
EXPECT_COLOR = "#6e7681"
def level_badge_svg(level: int) -> str:
    """Render the per-recipe/per-run LEVEL badge: 'cc-ci | level N'.

    The badge carries the level number and its ramp colour only; the reason a
    level is not higher is shown in the card's per-rung table, not here.
    """
    message = f"level {int(level)}"
    fill = level_color(level)
    return render_badge_svg("cc-ci", message, fill)
def level_badge_svg(level: int, cap_reason: str = "", cap_skip: str = "") -> str:
    """Render the LEVEL badge 'cc-ci | level N', optionally with a third segment.

    The first two segments are the label and the level (coloured by level). A third
    segment is appended only when a SKIP capped the climb, selected by `cap_skip`:
      - "unintentional": amber 'gap?' (rung skipped but not declared by the recipe).
      - "intentional":   muted 'expected' (skip declared in recipe_meta.EXPECTED_NA).
      - anything else:   no third segment; the plain two-segment badge is returned.
    `cap_reason` is accepted for interface compatibility and is not rendered here.
    The badge only annotates the cap; it never changes the level shown.
    """
    label = "cc-ci"
    msg = f"level {int(level)}"
    label_w = _text_width(label)
    msg_w = _text_width(msg)

    # (cap_skip value, segment text, segment colour) for the optional third segment.
    annotations = (
        ("unintentional", "gap?", GAP_COLOR),
        ("intentional", "expected", EXPECT_COLOR),
    )
    extra = next(
        ((text, colour) for key, text, colour in annotations if cap_skip == key),
        None,
    )
    if extra is None:
        return render_badge_svg(label, msg, level_color(level))

    txt, tcolor = extra
    extra_w = _text_width(txt)
    total_w = label_w + msg_w + extra_w
    pieces = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{total_w}" height="20" role="img" ',
        f'aria-label="{html.escape(label)}: {html.escape(msg)} ({html.escape(txt)})">',
        f'<rect width="{label_w}" height="20" fill="#555"/>',
        f'<rect x="{label_w}" width="{msg_w}" height="20" fill="{level_color(level)}"/>',
        f'<rect x="{label_w + msg_w}" width="{extra_w}" height="20" fill="{tcolor}"/>',
        '<g fill="#fff" font-family="Verdana,Geneva,sans-serif" font-size="11">',
        f'<text x="6" y="14">{html.escape(label)}</text>',
        f'<text x="{label_w + 6}" y="14">{html.escape(msg)}</text>',
        f'<text x="{label_w + msg_w + 6}" y="14">{html.escape(txt)}</text></g></svg>',
    ]
    return "".join(pieces)
def _stage_rows(stages: list[dict]) -> str:
@ -113,13 +141,12 @@ def _stage_rows(stages: list[dict]) -> str:
return "\n".join(rows) or '<tr><td colspan="3">no stages</td></tr>'
# Friendly rung labels for the skip/unverified rows (the five essential rungs).
# Friendly rung labels for the skip rows (the four essential rungs).
RUNG_LABEL = {
"install": "install",
"upgrade": "upgrade",
"backup_restore": "backup/restore",
"functional": "functional",
"lint": "lint",
}
SKIP_GREEN = (
"#57ab5a" # muted green — an intentional skip reads like a pass (but labelled, never inflating)
@ -127,10 +154,9 @@ SKIP_GREEN = (
def _skip_rows(skips: dict) -> str:
"""Render the non-run rungs as stage-like rows (phase lvl5 semantics). An INTENTIONAL skip
(declared/structural — the rung does not apply, the climb continues past it) is muted green
with its reason on the line below; an UNVERIFIED rung (should have run, wasn't checked — the
level cannot rise above it) is amber 'unverified'."""
"""Render SKIPPED rungs as stage-like rows. An intentional (declared) skip looks like a pass row
but its status says 'INTENTIONAL SKIP' (muted green) with the declared reason on the line below;
an unintentional skip is amber 'UNINTENTIONAL SKIP' with a prompt to add a test or declare it."""
rows = []
for rung, reason in (skips.get("intentional") or {}).items():
rows.append(
@ -145,11 +171,11 @@ def _skip_rows(skips: dict) -> str:
rows.append(
f'<tr class="stage"><td colspan="2"><span class="mark" style="color:{GAP_COLOR}">⊘</span>'
f"<b>{html.escape(RUNG_LABEL.get(rung, rung))}</b></td>"
f'<td class="st" style="color:{GAP_COLOR}">unverified</td></tr>'
f'<td class="st" style="color:{GAP_COLOR}">unintentional skip</td></tr>'
)
rows.append(
'<tr class="skipreason"><td></td><td colspan="2">rung did not run / could not be '
"checked — the level cannot rise above an unverified rung</td></tr>"
'<tr class="skipreason"><td></td><td colspan="2">not declared in EXPECTED_NA — add the '
"missing test/label, or declare the skip with a reason</td></tr>"
)
return "\n".join(rows)
@ -158,15 +184,13 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
"""Build the summary-card HTML from a results.json dict. `screenshot_rel` is the relative path to
the screenshot PNG (same dir as the card) — omitted from the card if None / absent.
The card shows exactly what the data says: recipe + version, the level, the per-stage/per-test
✔/✘ table (+ skip/unverified rung rows — the SOLE carrier of "why isn't the level higher"),
the invariant flags, and the app screenshot. No computation here. Tolerates old (schema-1)
artifacts: the ladder height is read off the rungs the artifact actually has."""
The card shows exactly what the data says: recipe + version, the level badge + cap reason, the
per-stage/per-test ✔/✘ table, the invariant flags, and the app screenshot. No computation here."""
recipe = html.escape(str(data.get("recipe", "?")))
version = html.escape(str(data.get("version") or data.get("ref") or ""))
level = int(data.get("level", 0))
# Old (pre-lvl5) artifacts have a 4-rung ladder — render their "of N" honestly.
ladder_top = 5 if "lint" in (data.get("rungs") or {}) else 4
cap_reason = str(data.get("level_cap_reason") or "")
cap = html.escape(cap_reason)
sk = data.get("skips", {}) or {}
color = level_color(level)
flags = data.get("flags", {}) or {}
@ -197,7 +221,7 @@ body{{margin:0;font-family:system-ui,-apple-system,Segoe UI,sans-serif;backgroun
.lvl .num{{display:inline-block;min-width:64px;padding:.3rem .7rem;border-radius:10px;
font-size:1.6rem;font-weight:700;color:#0d1117;background:{color}}}
.lvl .lbl{{display:block;color:#8b949e;font-size:.72rem;text-transform:uppercase;margin-top:.2rem}}
.ladder{{padding:.4rem 1.3rem;color:#8b949e;font-size:.82rem;border-bottom:1px solid #21262d}}
.cap{{padding:.4rem 1.3rem;color:#8b949e;font-size:.82rem;border-bottom:1px solid #21262d}}
.body{{display:flex;gap:1rem;padding:1rem 1.3rem}}
.tbl{{flex:1}}
table{{border-collapse:collapse;width:100%;font-size:.85rem}}
@ -214,12 +238,12 @@ tr.skipreason td{{color:#8b949e;font-size:.78rem;font-style:italic;padding-top:0
.shot.noshot{{display:flex;align-items:center;justify-content:center;height:225px;color:#8b949e;font-size:.85rem}}
.flags{{display:flex;gap:.6rem;padding:.6rem 1.3rem 1rem}}
.flag{{border:1px solid;border-radius:6px;padding:.15rem .5rem;font-size:.78rem;color:#c9d1d9}}
.ladder b{{color:#c9d1d9}}
.cap b{{color:#c9d1d9}}
</style></head><body><div class="card">
<div class="hd">{FLOWER_SVG}
<div class="title"><h1>{recipe}</h1><span class="ver">{version}</span></div>
<div class="lvl"><span class="num">{level}</span><span class="lbl">level</span></div></div>
<div class="ladder"><b>level {level} of {ladder_top}</b></div>
<div class="cap">{("<b>capped:</b> " + cap) if cap else "<b>full clean climb</b> — top level (4)"}</div>
<div class="body"><div class="tbl"><table>{rows}</table></div>{shot_html}</div>
<div class="flags">{"".join(flag_bits)}</div>
</div></body></html>"""

View File

@ -1,67 +1,67 @@
"""The level ladder — five rungs, no capping (phase lvl5, plan-phase-lvl5-lint-rung.md).
"""Phase 3 — the level ladder (plan-phase3-results-ux.md §4.1, R1).
A single integer **level** summarising how far up the quality ladder a recipe run climbed:
A single integer **level** summarising how far up the quality ladder a recipe run climbed, with
YunoHost semantics: **a gap caps the level** — you only earn level L if every rung 1..L was a clean
PASS. The first rung that is not a clean PASS (a real FAIL *or* genuinely N/A for this recipe) stops
the climb; `cap_reason` records why. This is deliberately conservative: presentation must NEVER make
a run look greener than its tests (plan §6 cardinal guardrail), so an N/A rung caps just like a fail
— with a recorded reason so the level is *fair*, not inflated.
The ladder is the FOUR essential rungs every recipe is held to:
L0 — install failed / app never became healthy.
L1 — Installs: deploys + passes health/readiness.
L2 — Upgrades: previous published version → PR version, stays healthy, data intact.
L3 — Backup/restore: seeded data survives backup → wipe → restore.
L4 — Functional: recipe-specific functional tests pass.
L5 — Lint: `abra recipe lint` passes against the exact ref under test.
Semantics (operator-decided 2026-06-11, recorded in DECISIONS.md — replaces the Phase-3
"N/A caps" rule):
Integration (SSO/OIDC + cross-app) and recipe-local (the recipe repo's own tests/) are **OPTIONAL**
capabilities — they are NOT part of the level ladder and never cap it. They still run when present
(and SSO is still enforced for the run VERDICT via the deps/SSO checks in run_recipe_ci.py), but a
recipe without an SSO surface or without repo-local tests is simply not penalised on the level.
level = max i such that rung_i == "pass" and every rung j < i is "pass" or "skip"; 0 if none.
This module is PURE (no I/O) so it is cheaply unit-testable and the Adversary can re-run the unit
test cold (`cc-ci-run -m pytest tests/unit/test_level.py -q`). The orchestrator
(`run_recipe_ci.py`) is responsible for translating its raw per-tier results into the rung-status
dict this function consumes; that mapping is documented in DECISIONS.md (Phase 3).
A rung has one of FOUR statuses:
"pass" — exercised and passed.
"fail" — exercised and failed. Blocks: no rung above it can count.
"skip" — INTENTIONAL skip: the rung genuinely does not apply to this recipe, from a
declared or structural fact (not backup-capable; only one published version;
declared in recipe_meta.EXPECTED_NA). Does NOT stop the climb.
"unver" — UNINTENTIONAL not-verified: the rung SHOULD have run but didn't (infra error,
missing tool, harness exception, prior-stage abort, timeout). Blocks exactly
like a fail — the level never rises above a rung that wasn't actually checked.
The per-rung table (results.json `rungs`, card, dashboard) is the SOLE carrier of "why isn't
this level higher" — there is no cap_reason. The classification of every N/A source into
skip-vs-unver lives in derive_rungs (results.py) and is tabulated in DECISIONS.md; anything
unclassifiable defaults to "unver" (conservative: never claim what wasn't checked).
Integration (SSO/OIDC + cross-app) and recipe-local (the recipe repo's own tests/) remain
OPTIONAL capabilities — not rungs, never counted (SSO is still enforced for the run VERDICT
via the deps/SSO checks in run_recipe_ci.py).
This module is PURE (no I/O) so it is cheaply unit-testable and the Adversary can re-run the
unit test cold (`cc-ci-run -m pytest tests/unit/test_level.py -q`).
Rung status vocabulary (each rung ∈ these three):
"pass" the rung was exercised and passed.
"fail" the rung was exercised and failed.
"na" — the rung does not apply to this recipe (e.g. only one published version → no upgrade;
not backup-capable). N/A is NOT a failure, but it DOES cap the climb (with a distinct
cap_reason) so the level never overstates what was actually verified.
"""
from __future__ import annotations
# The climbable rungs in ascending order. install (L1) is the foundation; L0 means install
# itself did not pass. These five are the ESSENTIAL rungs — integration/recipe-local are
# optional and deliberately NOT in this tuple.
RUNGS = ("install", "upgrade", "backup_restore", "functional", "lint")
# The climbable rungs in ascending order. install (L1) is the foundation; L0 means install itself
# did not pass. Each later rung requires every earlier rung to be a clean PASS. These four are the
# ESSENTIAL rungs — integration/recipe-local are optional and deliberately NOT in this tuple.
RUNGS = ("install", "upgrade", "backup_restore", "functional")
# Human-readable label per rung level, for the summary card / docs.
# Human-readable label per rung level, for cap_reason + the summary card.
RUNG_LABEL = {
1: "install (deploy + health)",
2: "upgrade (prev published → PR)",
3: "backup/restore (data integrity)",
4: "functional (recipe-specific tests)",
5: "lint (abra recipe lint)",
}
VALID = {"pass", "fail", "skip", "unver"}
VALID = {"pass", "fail", "na"}
def compute_level(rungs: dict[str, str]) -> int:
"""Map a rung-status dict → level 0..5.
def compute_level(rungs: dict[str, str]) -> tuple[int, str]:
"""Map a rung-status dict → (level 0..4, cap_reason).
`rungs` must contain a status in VALID for every name in RUNGS. The level is the highest
i such that rungs[i] == "pass" and every rung below i is "pass" or "skip" (an intentional
skip does not stop the climb). A "fail" or "unver" rung blocks: rungs above it cannot
count, however green. 0 when no rung qualifies.
`rungs` must contain a status in {"pass","fail","na"} for every name in RUNGS. The level is the
highest L such that rungs[1..L] are all "pass"; the first non-"pass" rung caps the climb. L0 is
returned when the install rung itself is not "pass" (install failed / never healthy).
cap_reason explains where the climb stopped:
- "" (empty) when the recipe earned the top rung (L4, full clean climb).
- "L<k> <label> FAILED" when a rung was exercised and failed.
- "L<k> <label> N/A" when a rung does not apply to this recipe.
Returns the reason for the FIRST rung that stopped the climb (the binding constraint).
"""
for name in RUNGS:
st = rungs.get(name)
@ -69,44 +69,52 @@ def compute_level(rungs: dict[str, str]) -> int:
raise ValueError(
f"rung {name!r} has invalid status {st!r} (expect one of {sorted(VALID)})"
)
# L0: install did not pass.
if rungs["install"] != "pass":
if rungs["install"] == "fail":
return 0, "L1 " + RUNG_LABEL[1] + " FAILED"
# install N/A is not a real-world state for a deploy run, but handle it for totality.
return 0, "L1 " + RUNG_LABEL[1] + " N/A"
# Climb: stop at the first rung that is not a clean pass.
level = 0
for idx, name in enumerate(RUNGS, start=1):
st = rungs[name]
if st == "pass":
if rungs[name] == "pass":
level = idx
elif st == "skip":
continue
else: # fail / unver — nothing above this rung can count
break
return level
# first non-pass rung — caps the climb
kind = "FAILED" if rungs[name] == "fail" else "N/A"
return level, f"L{idx} {RUNG_LABEL[idx]} {kind}"
# Full clean climb to the top rung.
return level, ""
def backup_restore_status(backup: str | None, restore: str | None, backup_capable: bool) -> str:
"""Collapse the backup + restore tier results into the single L3 rung status.
Not backup-capable (a declared/structural fact: no backupbot labels, or
recipe_meta.BACKUP_CAPABLE=False) → "skip" — the rung genuinely does not apply.
Otherwise both tiers must pass for the rung to pass; a fail in either tier fails it; any
other shape (tier skipped or never ran while backup-capable — e.g. a prior-stage abort)
is "unver": the rung should have been verified and wasn't.
Both tiers must pass for the rung to pass (the rung is "seeded data survives backup→wipe→restore",
which is only verified if BOTH the backup and the restore tier are green). If the recipe is not
backup-capable, both tiers skip → the rung is N/A (caps at L2, recorded). A fail in either tier
fails the rung.
"""
if not backup_capable:
return "skip"
return "na"
vals = {backup, restore}
if "fail" in vals:
return "fail"
if backup == "pass" and restore == "pass":
return "pass"
return "unver"
# any skip/None while backup-capable → not verified → treat as N/A (cannot claim L3)
return "na"
def tier_to_rung(status: str | None) -> str:
"""Map a single tier result ('pass'|'fail'|'skip'|None) to a rung status, with NO
intentionality information: a tier that did not produce a pass/fail is "unver" (it should
have run and wasn't verified). The caller (derive_rungs) upgrades "unver" to "skip" where
a declared/structural fact makes the skip intentional — never the other way around."""
"""Map a single tier result ('pass'|'fail'|'skip'|None) to a rung status. 'skip'/None → 'na'
(the tier did not apply / did not run), so it caps the climb without being counted as a failure."""
if status == "pass":
return "pass"
if status == "fail":
return "fail"
return "unver"
return "na"

View File

@ -1,171 +0,0 @@
"""L5 lint rung — run `abra recipe lint` against the exact ref under test (phase lvl5).
Executor + classifier for the fifth ladder rung. Design constraints (plan-phase-lvl5 §2):
- **Lints the recipe's CONTENT, not the harness plumbing.** abra lint reads every
`compose*.yml` in the tree (including the CI's untracked install_steps overlays) and
force-fetches tags from `origin` (which on PR runs is the private mirror, unauthenticated
here → FATA). Both are harness artifacts, so the executor lints a PRISTINE scratch clone of
the per-run tree, checked out at the exact tested ref: `origin` becomes a local path (tag
fetch works offline, no auth) and the run's true tag set rides along (fetch_recipe pulls the
upstream version tags into the per-run tree). No lint rule is filtered or ignored.
- **rc is not the verdict.** `abra recipe lint` exits non-zero only when it cannot lint
(FATA); rule outcomes live in its table — error-severity ❌ rows print a trailing
"WARN critical errors present …" sentinel but still exit 0. So the classifier parses the
table: FAIL iff an error-severity rule is unsatisfied (or the FATA is content-attributable:
"unable to validate recipe" — the recipe config itself is invalid). PASS iff the table
rendered and no error rule failed. ANYTHING else — timeout, abra/script missing, tag-fetch
FATA, unparseable output — is "unver": loud, never a silent pass, never an intentional skip.
- **Best-effort + time-bounded.** Hard ~60s timeout (observed runtime ≈0.7s); the caller
wraps run_lint in try/except besides — a wedged lint can never hang or fail a run, and the
run VERDICT is untouched by any lint outcome (lint is a level rung, not a gate).
- Full command output (+ cmd, rc, ref header) is captured to `lint.txt` in the run artifact
dir; results.json carries status + short excerpt (failing rule ids).
abra needs a PTY even with -n ("inappropriate ioctl on device") → run via util-linux
`script -qec`, same trick as harness.abra._run_pty.
"""
from __future__ import annotations
import os
import re
import shlex
import shutil
import subprocess
import tempfile
from . import abra
LINT_TIMEOUT = 60 # hard budget, seconds; observed ~0.7s per recipe
# Strip ANSI escape sequences from PTY output before parsing.
_ANSI = re.compile(r"\x1b\[[0-9;?]*[A-Za-z]")
# A table row: │ R014 │ description │ error │ ✅/❌ │ skipped │ how-to-fix │
_ROW = re.compile(r"^\s*│\s*(R\d+)\s*│(.*?)│\s*(warn|error)\s*│\s*(✅|❌)\s*│\s*([^│]*)│")
# abra's trailing sentinel when any error-severity rule is unsatisfied (cross-check only).
_SENTINEL = "critical errors present"
# FATA classes that are the RECIPE's fault (its config cannot even be validated) — a lint
# FAIL, not an unverified rung. Everything else non-zero is environmental → unver.
_CONTENT_FATA = "unable to validate recipe"


def parse_table(output: str) -> list[dict]:
    """Parse the lint table into one dict per rule row.

    Each row is {"rule", "desc", "severity", "satisfied", "skipped"}:
      - rule:      the rule id as printed (e.g. "R014").
      - desc:      the description cell, whitespace-trimmed.
      - severity:  "warn" or "error".
      - satisfied: True iff the row's mark cell is the check mark (✅); a cross (❌)
                   means the rule is unsatisfied.
      - skipped:   True iff the skipped cell holds anything other than "" or "-".

    Tolerant by design: ANSI escapes are stripped, carriage returns are treated as
    line breaks, and lines that do not look like a table row are ignored. Returns
    [] when no table rendered.
    """
    rows = []
    for line in _ANSI.sub("", output).replace("\r", "\n").splitlines():
        m = _ROW.match(line)
        if not m:
            continue
        rule, desc, severity, mark, skipped = m.groups()
        rows.append(
            {
                "rule": rule,
                "desc": desc.strip(),
                "severity": severity,
                # _ROW only ever captures ✅ or ❌ here; comparing against an empty
                # string would mark every rule unsatisfied.
                "satisfied": mark == "✅",
                "skipped": skipped.strip() not in ("", "-"),
            }
        )
    return rows
def classify(rc: int | None, output: str) -> tuple[str, str, list[str]]:
    """Classify a finished lint invocation as (status, detail, failed_rule_ids).

    status is one of "pass", "fail", "unver":
      - rc is None  -> "unver" (the run itself blew up; the caller supplies the detail).
      - rc != 0     -> "fail" if the output carries the content-attributable FATA
                       ("unable to validate recipe"), otherwise "unver". The detail is
                       the first FATA line when one is present.
      - rc == 0     -> "unver" if no table rows parsed; "fail" if any non-skipped
                       error-severity rule is unsatisfied, or if abra's "critical
                       errors present" sentinel appears although the parse found
                       none; otherwise "pass".

    Never a silent pass: "pass" requires a parsed table with zero unsatisfied
    error-severity rules AND no sentinel.
    """
    if rc is None:
        return "unver", "lint did not run", []

    # Match every marker on ANSI-stripped text: PTY colour codes landing inside a
    # phrase would otherwise hide it from a raw substring check (and a hidden
    # sentinel would turn a fail into a pass).
    clean = _ANSI.sub("", output)

    if rc != 0:
        first = next((ln for ln in clean.splitlines() if "FATA" in ln), "").strip()
        if _CONTENT_FATA in clean:
            # The recipe config itself failed validation — attributable to recipe content.
            return "fail", first or "recipe config failed validation", []
        return "unver", first or f"abra recipe lint exited {rc} with no table", []

    rows = parse_table(output)
    if not rows:
        return "unver", "no lint table in output (rc=0)", []

    failed = [
        r["rule"]
        for r in rows
        if r["severity"] == "error" and not r["satisfied"] and not r["skipped"]
    ]
    if failed:
        return "fail", f"error rule(s) unsatisfied: {', '.join(failed)}", failed
    if _SENTINEL in clean:
        # abra says critical errors but our parse found none — distrust the parse, never inflate.
        return "fail", "abra reported critical errors (table parse found none)", []
    return "pass", "", []
def run_lint(recipe: str, ref: str | None, out_dir: str | None) -> dict:
    """Run the lint rung for `recipe` at exactly `ref` and report the outcome.

    `ref` is the revision to check out in the scratch clone (None keeps the clone's
    HEAD). The recipe tree is cloned into a throwaway ABRA_DIR, the live `catalogue`
    and `servers` directories are symlinked in when they exist, and
    `abra recipe lint -n <recipe>` is run under `script` (for a PTY) with a hard
    timeout. The combined output goes through `classify`.

    Returns {"status", "detail", "rules_failed"}. When `out_dir` is given, the
    command header, rc/status line and full output are written to `lint.txt` there;
    a failure to write is reported and ignored. Never raises for ordinary errors:
    a timeout or any other exception becomes status "unver".
    """
    workdir = None
    rc: int | None = None
    output = ""
    try:
        src_tree = abra.recipe_dir(recipe)
        workdir = tempfile.mkdtemp(prefix="ccci-lint-")
        lint_abra = os.path.join(workdir, "abra")
        os.makedirs(os.path.join(lint_abra, "recipes"))
        clone = os.path.join(lint_abra, "recipes", recipe)

        # Shared options for the two git invocations (fail loudly, bounded in time).
        git_opts = {
            "check": True,
            "capture_output": True,
            "text": True,
            "timeout": LINT_TIMEOUT,
        }
        subprocess.run(["git", "clone", "--quiet", src_tree, clone], **git_opts)
        if ref:
            subprocess.run(
                ["git", "-C", clone, "checkout", "-f", "--quiet", ref], **git_opts
            )

        # catalogue: R006 (published catalogue version) reads it; servers: harmless, some
        # abra paths stat it. Symlink the live ones (read-only use).
        for shared in ("catalogue", "servers"):
            live = os.path.join(abra.abra_dir(), shared)
            if os.path.exists(live):
                os.symlink(os.path.realpath(live), os.path.join(lint_abra, shared))

        lint_env = dict(os.environ, ABRA_DIR=lint_abra)
        lint_cmd = ["script", "-qec", f"abra recipe lint -n {shlex.quote(recipe)}", "/dev/null"]
        proc = subprocess.run(
            lint_cmd,
            capture_output=True,
            text=True,
            timeout=LINT_TIMEOUT,
            env=lint_env,
        )
        rc, output = proc.returncode, proc.stdout + proc.stderr
        status, detail, failed = classify(rc, output)
    except subprocess.TimeoutExpired:
        status, detail, failed = "unver", f"lint timed out after {LINT_TIMEOUT}s", []
    except Exception as exc:  # noqa: BLE001 — rung must never break the run; unver is the honest floor
        status, detail, failed = (
            "unver",
            f"lint executor error: {exc.__class__.__name__}: {exc}",
            [],
        )
    finally:
        # Always drop the scratch tree, whatever happened above.
        if workdir:
            shutil.rmtree(workdir, ignore_errors=True)

    if status == "unver":
        print(f"!! lint rung UNVERIFIED for {recipe}: {detail}", flush=True)

    if out_dir:
        try:
            os.makedirs(out_dir, exist_ok=True)
            report = (
                f"$ abra recipe lint -n {recipe} (ref={ref or 'HEAD'})\n"
                f"rc={rc} status={status} {detail}\n\n{output}"
            )
            with open(os.path.join(out_dir, "lint.txt"), "w", encoding="utf-8") as fh:
                fh.write(report)
        except OSError as err:
            print(f" lint: could not write lint.txt (non-fatal): {err}", flush=True)

    return {"status": status, "detail": detail, "rules_failed": failed}

View File

@ -1,22 +1,20 @@
"""Structured run results + results.json (Phase 3 §4.2 R1/R3; level semantics: phase lvl5).
"""Phase 3 — structured run results + results.json (plan-phase3-results-ux.md §4.2, R1/R3).
Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying:
Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
{ recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
level, rungs, lint:{status,detail,rules_failed},
level, level_cap_reason, level_cap_rung, rungs,
skips:{intentional:{rung:reason}, unintentional:[rung]},
flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
Rung statuses (phase lvl5, operator-decided — see harness.level + DECISIONS.md): every rung is
"pass" | "fail" | "skip" (INTENTIONAL — a declared/structural fact says the rung does not apply)
| "unver" (UNINTENTIONAL — the rung should have run and wasn't verified; blocks the level like a
fail). `derive_rungs` is the single place every N/A source is classified; anything it cannot
attribute to a declared/structural fact defaults to "unver" (conservative). `skips` mirrors that
split into results.json: intentional {rung: reason} / unintentional [rung] (= the unver rungs).
`skips` splits the N/A (skipped) rungs by a simple rule: a skip is INTENTIONAL iff the recipe lists
it (with a reason) in `recipe_meta.EXPECTED_NA = {rung: reason}`; any rung skipped but not listed is
UNINTENTIONAL (a coverage gap to fill or declare). Skips still cap the level either way — the harness
never claims a rung it did not verify; this only labels *why* a skip happened.
The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
rung-status dict derived here (`derive_rungs`) from the tier results + structural signals the
orchestrator holds; the classification table is in DECISIONS.md (phase lvl5).
rung-status dict derived here (`derive_rungs`) from the tier results + deps/SSO signals the
orchestrator holds; that mapping is documented in DECISIONS.md (Phase 3).
This module is import-pure (no side effects at import). `write_results` is the only writer; the
orchestrator calls the build/write path inside a try/except so a results failure NEVER changes the
@ -140,90 +138,53 @@ def derive_rungs(
results: dict[str, str],
*,
backup_capable: bool,
has_upgrade_target: bool,
expected_na: dict | None = None,
lint_status: str | None = None,
has_custom: bool,
) -> dict[str, str]:
"""Translate the orchestrator's tier results + structural signals into the rung-status dict
harness.level consumes — the FIVE essential rungs. This is the SINGLE place every N/A source
is classified intentional ("skip") vs unintentional ("unver"); the table lives in DECISIONS.md
(phase lvl5). Conservative by design: never reports "pass" it can't substantiate, and any
rung that did not produce a pass/fail and has NO declared/structural reason is "unver".
"""Translate the orchestrator's tier results into the rung-status dict harness.level consumes —
the FOUR essential rungs only. Conservative by design — never reports a rung 'pass' it can't
substantiate (cardinal guardrail: presentation never inflates).
L1 install : install tier pass. Always applies — never "skip" (non-run → unver).
L2 upgrade : upgrade tier. Tier skipped + no upgrade target (only one published
version, structural) → "skip"; declared in EXPECTED_NA → "skip";
anything else non-pass/fail (prior-stage abort, tier excluded) → "unver".
L3 backup/res : backup AND restore tiers pass. Not backup-capable (declared/structural)
"skip"; EXPECTED_NA → "skip"; unverified-while-capable → "unver".
L4 functional : the custom tier. No custom tests / tier skipped → EXPECTED_NA-declared
"skip", else "unver" (absent functional coverage is a gap, not an
intentional property of the recipe).
L5 lint : from the lint executor (harness.lint). pass/fail only — every recipe can
be linted, so there is NO intentional-skip escape hatch: a lint that
could not run (timeout, abra missing, executor error) is "unver".
L1 install : install tier pass.
L2 upgrade : upgrade tier (skip → N/A: only one published version).
L3 backup/res : backup AND restore tiers pass (N/A if not backup-capable).
L4 functional : recipe-specific functional tests pass — the custom tier. N/A if none ran.
Integration (SSO/OIDC) and recipe-local are OPTIONAL and intentionally NOT rungs here — they
never affect the level (SSO is still enforced for the run VERDICT in run_recipe_ci.py).
never cap the level (SSO is still enforced for the run VERDICT in run_recipe_ci.py).
"""
expected = set((expected_na or {}).keys())
rungs: dict[str, str] = {}
rungs["install"] = level_mod.tier_to_rung(results.get("install"))
up = results.get("upgrade")
if up in ("pass", "fail"):
rungs["upgrade"] = up
elif up == "skip" and not has_upgrade_target:
# The orchestrator skipped the tier for the structural reason: nothing to upgrade from.
rungs["upgrade"] = "skip"
elif "upgrade" in expected:
rungs["upgrade"] = "skip"
else:
rungs["upgrade"] = "unver"
br = level_mod.backup_restore_status(
rungs["upgrade"] = level_mod.tier_to_rung(results.get("upgrade"))
rungs["backup_restore"] = level_mod.backup_restore_status(
results.get("backup"), results.get("restore"), backup_capable
)
if br == "unver" and "backup_restore" in expected:
br = "skip"
rungs["backup_restore"] = br
custom = results.get("custom")
if custom in ("pass", "fail"):
rungs["functional"] = custom
elif "functional" in expected:
rungs["functional"] = "skip"
else:
rungs["functional"] = "unver"
rungs["lint"] = lint_status if lint_status in ("pass", "fail") else "unver"
if not has_custom or custom == "skip" or custom is None:
rungs["functional"] = "na"
elif custom == "fail":
rungs["functional"] = "fail"
else: # custom == "pass"
rungs["functional"] = "pass"
return rungs
# Reasons attached to STRUCTURAL intentional skips (no EXPECTED_NA declaration needed — the
# fact is read off the recipe itself).
_STRUCTURAL_REASON = {
"upgrade": "only one published version — no upgrade target",
"backup_restore": "not backup-capable (no backupbot labels / declared)",
}
def skips(rungs: dict[str, str], expected_na: dict | None) -> dict:
"""Split the SKIPPED (N/A) rungs into intentional vs unintentional (operator model).
def skips(
rungs: dict[str, str],
expected_na: dict | None,
) -> dict:
"""Mirror the rung classification into results.json's `skips` block:
{ "intentional": {rung: reason, ...}, # status "skip" — declared/structural, with why
"unintentional": [rung, ...] } # status "unver" — should have run, wasn't verified
The reason is the recipe's EXPECTED_NA declaration when present, else the structural fact
derive_rungs skipped on. Purely descriptive — the level math lives in harness.level."""
A recipe lists the rungs it intentionally skips, each with a reason, in
`recipe_meta.EXPECTED_NA = {rung: reason}`. The rule is dead simple: a skipped rung is
**intentional** iff it is in that list; any rung that is skipped and NOT in the list is
**unintentional** (a coverage gap someone should either fill or declare). N/A still caps the
level either way — the harness never claims a rung it did not verify — this only labels *why* a
skip happened. Returns:
{ "intentional": {rung: reason, ...}, # skipped AND declared in EXPECTED_NA
"unintentional": [rung, ...] } # skipped but NOT declared
"""
expected = {str(k): str(v) for k, v in (expected_na or {}).items()}
intentional = {
r: expected.get(r) or _STRUCTURAL_REASON.get(r, "declared intentional")
for r, st in rungs.items()
if st == "skip"
}
unintentional = sorted(r for r, st in rungs.items() if st == "unver")
na = [r for r, st in rungs.items() if st == "na"]
intentional = {r: expected[r] for r in na if r in expected}
unintentional = sorted(r for r in na if r not in expected)
return {"intentional": intentional, "unintentional": unintentional}
@ -239,8 +200,6 @@ def build_results(
clean_teardown: bool,
no_secret_leak: bool,
finished_ts: float | None,
has_upgrade_target: bool = True,
lint: dict | None = None,
screenshot: str | None = None,
summary_card: str | None = None,
expected_na: dict | None = None,
@ -248,41 +207,17 @@ def build_results(
) -> dict:
"""Assemble the full results.json dict (no I/O). `finished_ts` is passed in (the orchestrator
stamps it) so this stays pure and deterministic for unit tests. `expected_na` is the recipe's
declared intentional-skip map (recipe_meta.EXPECTED_NA); `has_upgrade_target` is the structural
"a previous published version exists" fact; `lint` is harness.lint.run_lint's result dict
(None — e.g. an old caller — derives the lint rung as "unver": never a silent pass)."""
declared intentional-skip map (recipe_meta.EXPECTED_NA) used to distinguish a deliberate skip from
accidentally-missing coverage."""
stages = collect_stages(records)
lint = lint or {}
lint_status = lint.get("status")
rungs = derive_rungs(
results,
backup_capable=backup_capable,
has_upgrade_target=has_upgrade_target,
expected_na=expected_na,
lint_status=lint_status,
)
# Surface lint in the per-stage table too (it has no pytest/JUnit tier), so the card's
# stage breakdown carries all five rungs.
if rungs["lint"] != "skip": # lint is never "skip", but stay defensive
stages.append(
{
"name": "lint",
"status": rungs["lint"],
"tests": [
{
"name": "abra recipe lint",
"classname": "lint",
"source": "harness",
"status": rungs["lint"],
"ms": 0,
"message": str(lint.get("detail") or ""),
}
],
}
)
lvl = level_mod.compute_level(rungs)
has_custom = any(r["tier"] == "custom" for r in records)
rungs = derive_rungs(results, backup_capable=backup_capable, has_custom=has_custom)
lvl, cap_reason = level_mod.compute_level(rungs)
# The rung that capped the climb (lowest non-pass), or None on a full climb — lets a consumer
# (card/badge) tell whether the cap was an intentional skip, an unintentional one, or a failure.
capped = level_mod.RUNGS[lvl] if cap_reason else None
return {
"schema": 2,
"schema": 1,
"run_id": run_id(),
"recipe": recipe,
"version": version,
@ -290,12 +225,9 @@ def build_results(
"ref": (ref or "")[:12],
"finished": finished_ts,
"level": lvl,
"level_cap_reason": cap_reason,
"level_cap_rung": capped,
"rungs": rungs,
"lint": {
"status": rungs["lint"],
"detail": str(lint.get("detail") or ""),
"rules_failed": list(lint.get("rules_failed") or []),
},
"skips": skips(rungs, expected_na),
"stages": stages,
"results": results,

View File

@ -58,9 +58,6 @@ from harness import ( # noqa: E402
from harness import ( # noqa: E402
deps as deps_mod,
)
from harness import ( # noqa: E402
lint as lint_mod,
)
from harness import ( # noqa: E402
manifest as manifest_mod,
)
@ -931,24 +928,6 @@ def main() -> int:
run_artifact_dir = os.path.join(results_mod.runs_dir(), results_mod.run_id())
junit_dir = os.path.join(run_artifact_dir, "junit")
records: list[dict] = []
# L5 lint rung (phase lvl5): `abra recipe lint` against the EXACT tested ref, in a pristine
# scratch clone (harness.lint — the per-run tree is still at head_ref here, before any
# version-pinning checkout). Level rung only — NEVER the verdict: run_lint catches every
# failure mode into status "unver" (60s hard budget) and this belt-and-braces wrap makes a
# crashed executor identical to "could not verify".
lint_result = {"status": "unver", "detail": "lint executor crashed", "rules_failed": []}
try:
lint_result = lint_mod.run_lint(recipe, head_ref, run_artifact_dir)
except Exception as e: # noqa: BLE001 — lint is a rung, not a gate; never touches the verdict
print(
f"!! lint rung executor crashed (non-fatal, rung=unver): {_scrub(str(e))}", flush=True
)
print(
f"lint rung: {lint_result['status']}"
f"{'' + lint_result['detail'] if lint_result.get('detail') else ''}",
flush=True,
)
with contextlib.suppress(OSError):
os.makedirs(junit_dir, exist_ok=True)
@ -1274,8 +1253,6 @@ def main() -> int:
records=records,
results=results,
backup_capable=backup_cap,
has_upgrade_target=prev is not None, # structural: a previous published version exists
lint=lint_result, # L5 rung (phase lvl5)
clean_teardown=clean_teardown,
no_secret_leak=True, # narrowed below by an actual scan of the serialised artifact
screenshot=screenshot_rel, # Phase 3 U1 (R4): relative PNG name iff capture succeeded
@ -1293,15 +1270,17 @@ def main() -> int:
file=sys.stderr,
)
path = results_mod.write_results(data)
print(f"results.json written: {path} (level={data['level']} of 5)", flush=True)
# Surface UNVERIFIED rungs in the CI log (non-blocking, R7): a rung that should have run
# and wasn't verified blocks the level above it — fill the coverage, or (where a
# declared/structural reason genuinely applies) declare it in EXPECTED_NA.
print(
f"results.json written: {path} (level={data['level']}"
f"{'' + data['level_cap_reason'] if data['level_cap_reason'] else ''})",
flush=True,
)
# Surface UNINTENTIONAL skips in the CI log (non-blocking, R7): a rung that was skipped (N/A)
# but is not in the recipe's intentional list — either add the missing coverage or declare it.
for rung in data.get("skips", {}).get("unintentional", []):
print(
f"⚠ coverage: rung '{rung}' is UNVERIFIED (did not run / could not be checked) — "
f"the level cannot rise above it. Add the missing test/coverage, or declare a "
f"genuine inapplicability in tests/{recipe}/recipe_meta.py "
f"⚠ coverage: rung '{rung}' was skipped (N/A) but is not declared intentional — add "
f"the missing test/label, or list it in tests/{recipe}/recipe_meta.py "
f"EXPECTED_NA = {{'{rung}': '<why>'}}.",
flush=True,
)
@ -1323,10 +1302,21 @@ def main() -> int:
with open(html_path, "w", encoding="utf-8") as f:
f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot")))
png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png"))
# Badge = level only (number + colour) — the per-rung table on the card is the sole
# carrier of "why isn't this higher" (operator-specified, phase lvl5).
capped = data.get("level_cap_rung")
sk = data.get("skips", {})
cap_skip = (
"intentional"
if capped in (sk.get("intentional") or {})
else "unintentional"
if capped in (sk.get("unintentional") or [])
else ""
)
with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f:
f.write(card_mod.level_badge_svg(data["level"]))
f.write(
card_mod.level_badge_svg(
data["level"], data.get("level_cap_reason", ""), cap_skip
)
)
print(
f"summary card {'rendered ' + png if png else '(PNG render unavailable)'} + "
f"badge.svg written into {run_artifact_dir}",