feat(lvl5): merge phase-lvl5 → main after M1 PASS (review cfc87fd) — implementation content taken verbatim from the Adversary-verified branch tip 3d8d286
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
autonomic-bot
2026-06-11 07:56:34 +00:00
14 changed files with 1125 additions and 379 deletions

View File

@ -38,6 +38,7 @@ _RUN_FILES = {
"screenshot.png": "image/png",
"badge.svg": "image/svg+xml",
"summary.html": "text/html; charset=utf-8",
"lint.txt": "text/plain; charset=utf-8",
}
_RUN_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*$")
@ -71,8 +72,7 @@ _LEVEL_COLOR = {
2: "#e0823d",
3: "#d9b343",
4: "#a0b93f",
5: "#57ab5a",
6: "#3fb950",
5: "#3fb950", # bright green — full 5-rung climb incl. lint (phase lvl5)
}
@ -152,7 +152,6 @@ def _build_row(b):
"ref": ref[:8],
"version": res.get("version") or ref[:12] or "",
"level": res.get("level"),
"level_cap_reason": res.get("level_cap_reason") or "",
"has_screenshot": bool(res.get("screenshot")),
"flags": res.get("flags") or {},
"finished": b.get("finished") or 0,
@ -220,7 +219,6 @@ a{color:#58a6ff;text-decoration:none} a:hover{text-decoration:underline}
.name{font-weight:700;font-size:1.05rem;color:#e6edf3}
.row{display:flex;align-items:center;gap:.5rem;flex-wrap:wrap;font-size:.82rem}
.pill{color:#fff;padding:.08rem .5rem;border-radius:.5rem;font-size:.75rem;font-weight:600}
.cap{color:#8b949e;font-size:.75rem}
code{background:#0d1117;border:1px solid #21262d;border-radius:.3rem;padding:0 .3rem;font-size:.78rem;color:#c9d1d9}
.flags{display:flex;gap:.4rem;font-size:.72rem;color:#8b949e}
.foot{margin-top:auto;display:flex;justify-content:space-between;font-size:.8rem;padding-top:.3rem;border-top:1px solid #21262d}
@ -274,17 +272,12 @@ def _card(r):
f'<a class="shot" href="{run_url}" title="open run">'
f'<span class="ph">no screenshot</span>{_level_pill(r["level"])}</a>'
)
cap = (
f'<div class="cap">{html.escape(r["level_cap_reason"])}</div>'
if r["level_cap_reason"]
else ""
)
return (
f'<div class="card">{shot}<div class="body">'
f'<div class="name">{html.escape(r["recipe"])}</div>'
f'<div class="row"><span class="pill" style="background:{color}">{html.escape(r["status"])}</span>'
f'<code>{html.escape(r["version"])}</code></div>'
f"{cap}{_flags_html(r['flags'])}"
f"{_flags_html(r['flags'])}"
f'<div class="foot"><a href="{run_url}">run #{num} · {_ago(r["finished"])}</a>'
f'<a href="/recipe/{html.escape(r["recipe"])}">history →</a></div>'
f"</div></div>"

View File

@ -115,8 +115,8 @@ _This table is GENERATED from the `runner/harness/meta.py` KEYS registry by `scr
| `HEALTH_OK` | `tuple[int]` | `(200, 301, 302)` | Acceptable HTTP status codes for health. |
| `DEPLOY_TIMEOUT` | `int` | `600` | Max seconds to wait for swarm convergence per deploy. |
| `HTTP_TIMEOUT` | `int` | `300` | Max seconds to wait for HTTP health after convergence. |
| `BACKUP_CAPABLE` | `bool` | `None` | Override the backup-tier capability auto-detect (compose `backupbot.backup` labels). `False` forces N/A; `True` forces the tier on; unset = auto-detect. |
| `EXPECTED_NA` | `dict` | `None` | Declare an N/A rung intentional: `{rung: reason}`. The cap stands either way; only the report wording changes. |
| `BACKUP_CAPABLE` | `bool` | `None` | Override the backup-tier capability auto-detect (compose `backupbot.backup` labels). `False` forces an intentional skip of the backup/restore rung; `True` forces the tier on; unset = auto-detect. |
| `EXPECTED_NA` | `dict` | `None` | Declare a non-run rung an INTENTIONAL skip: `{rung: reason}` — the level climbs past it; an undeclared non-run rung is *unverified* and blocks the level above it (classification table: machine-docs/DECISIONS.md phase lvl5). Never overrides an exercised pass/fail; the `lint` rung has no escape hatch. |
| `READY_PROBE` | `hook` | `None` | Callable `(ctx) -> [probe, ...]` returning extra readiness probes, run after install AND after upgrade: HTTP `{host, path, ok}` or TCP `{tcp_host, tcp_port, stable}`. |
| `UPGRADE_BASE_VERSION` | `str` | `None` | Exact published tag overriding the upgrade tier's base (default: `recipe_versions[-2]`). |
| `BACKUP_VERIFY` | `hook` | `None` | Callable `(ctx) -> bool` post-backup data-capture check; `False` re-runs the backup (truncated-dump race guard), retried up to 3 attempts. |

View File

@ -10,12 +10,9 @@ It is the R8 reference for Phase 3 (`plan-phase3-results-ux.md`).
---
## 1. The level ladder (R1)
## 1. The level ladder (phase lvl5 semantics, operator-decided 2026-06-11)
Every run earns a single integer **level 06**. The ladder is cumulative with **YunoHost
gap-caps-the-level** semantics: you earn level `L` only if **every rung 1..L was a clean PASS**. The
first rung that is not a clean PASS — a real **FAIL** *or* genuinely **N/A** for this recipe — stops
the climb, and `level_cap_reason` records which rung and why.
Every run earns a single integer **level 05** over the FIVE essential rungs:
| Level | Rung | Earned when |
|------:|------|-------------|
@ -24,42 +21,52 @@ the climb, and `level_cap_reason` records which rung and why.
| **L2** | upgrade | previous published version → PR/latest, stays healthy, data intact. |
| **L3** | backup/restore | seeded data survives backup → wipe → restore. |
| **L4** | functional | the recipe-specific functional tests pass. |
| **L5** | integration | SSO/OIDC + cross-app integration tests pass. |
| **L6** | recipe-local | the recipe repo's own `tests/` (D4) pass and are merged. |
| **L5** | lint | `abra recipe lint` passes against the exact ref under test. |
**N/A caps, fairly.** A rung that does not apply to a recipe (only one published version → no
upgrade; not backup-capable; no SSO/integration surface; no recipe-local tests) is **N/A**, which
caps the climb at the rung below it with a recorded reason — it is *not* counted as a failure. This is
the only fair reading of "a missing lower rung caps the level": e.g. a recipe with **no integration
surface caps at L4 by definition**, shown as `level_cap_reason = "L5 integration … N/A"`. A stateless
app whose functional tests pass but which cannot be backed up is honestly capped at **L2** (`"L3
backup/restore … N/A"`) rather than shown as L4 — understating is safe; overstating is forbidden.
Each rung has one of FOUR statuses, and the level is:
Worked examples (real runs):
- `uptime-kuma` — install+upgrade+backup+restore+functional all pass, no SSO surface → **L4**
(`cap = "L5 integration (SSO/OIDC + cross-app) N/A"`).
- `custom-html-tiny` — stateless, not backup-capable: install+upgrade pass, backup/restore N/A →
**L2** (`cap = "L3 backup/restore (data integrity) N/A"`).
level = the highest rung that PASSED, where every rung below it is "pass" or an intentional skip
- **pass / fail** — the rung was exercised. A FAIL blocks: no rung above it counts, however green.
- **skip (intentional)** — the rung *genuinely does not apply*, from a declared or structural fact:
not backup-capable (declared), only one published version (no upgrade target), or a declared
`EXPECTED_NA`. Intentional skips are **climbed past** — a stateless recipe with passing
functional tests and a clean lint reaches **L5**, not the old "capped at 2".
- **unver (unverified)** — the rung *should* have run but didn't: infra error, missing tool,
harness exception, prior-stage abort, timeout. **The level cannot rise above an unverified
rung** — it blocks exactly like a fail (we never claim what we didn't check). Anything
unclassifiable defaults to unver (conservative).
There is **no capping concept** (no `cap_reason`, no `capped`): the per-rung table
(✔ / ✘ / intentional-skip / unverified) on the card and in `results.json.rungs` is the sole
carrier of "why isn't this level higher". Worked examples:
- install ✔, upgrade ✘, backup ✔, functional ✔, lint ✔ → **level 1** (fail blocks).
- install ✔, upgrade ✔, backup skip (not capable), functional ✔, lint ✔ → **level 5**.
- install ✔, upgrade ✔, backup unver (harness error), functional ✔, lint ✔ → **level 2**.
- all four ✔, lint unver (abra missing) → **level 4** (an unverified top rung isn't earned).
Integration (SSO/OIDC + cross-app) and recipe-local tests are **optional capabilities**, not
rungs — they never affect the level (SSO remains enforced for the run VERDICT).
### How tiers map to rungs (the translation layer)
`run_recipe_ci.py` holds the run's per-tier results (`install/upgrade/backup/restore/custom`) +
deps/SSO signals; `runner/harness/results.py::derive_rungs` maps them to the rung-status dict that
`runner/harness/level.py::compute_level` scores. The mapping (also in `DECISIONS.md`, Phase 3):
structural signals; `runner/harness/results.py::derive_rungs` maps them to the rung-status dict
that `runner/harness/level.py::compute_level` scores. The full intentional-vs-unintentional
classification table for every N/A source is in `machine-docs/DECISIONS.md` (phase lvl5). Summary:
- **install** ← install tier (pass/fail).
- **upgrade** ← upgrade tier; `skip`**na** (only one published version).
- **install** ← install tier (pass/fail; a non-run is unver — install always applies).
- **upgrade** ← upgrade tier; tier skipped with no upgrade target (single published version,
structural) → skip; declared `EXPECTED_NA` → skip; otherwise unver.
- **backup_restore** ← backup AND restore tiers both pass → pass; either fail → fail; not
backup-capable **na**.
- **functional** ← the custom tier minus its SSO tests; a custom failure conservatively fails this
rung (we don't split functional-vs-SSO failure → never inflate); no custom tests → **na**.
- **integration** ← applies only if the recipe declares deps; pass iff deps wired and SSO verified and
custom didn't fail; recipes with no declared deps → **na** (the "caps at L4" rule).
- **recipe_local** ← the recipe repo's own `tests/` (discovery source `repo-local`) ran and passed;
none present → **na**.
The pure scorer is exhaustively unit-tested + fuzz-verified (all 729 rung combinations: level ==
count of leading consecutive passes, zero inflation).
backup-capable (structural/declared) → skip; unverified-while-capable → unver.
- **functional** ← the custom tier; a custom failure conservatively fails this rung; no custom
tests is a coverage GAP → unver, unless declared `EXPECTED_NA["functional"]` → skip.
- **lint** ← the lint executor (`runner/harness/lint.py`): `abra recipe lint` on a pristine
scratch clone of the run's recipe tree at the exact tested sha, 60s hard budget, full output in
the run artifact `lint.txt`. pass/fail only — when lint can't run the rung is **unver** (never
a silent pass, never an intentional skip). Lint never changes the run verdict.
### Invariant flags (shown, not climbed)
@ -77,19 +84,29 @@ build number, or the run's unique app domain for a hand-run). Schema:
```json
{
"schema": 1, "run_id": "...", "recipe": "...", "version": "...", "pr": "...", "ref": "...",
"schema": 2, "run_id": "...", "recipe": "...", "version": "...", "pr": "...", "ref": "...",
"finished": 0.0,
"level": 4, "level_cap_reason": "L5 integration (SSO/OIDC + cross-app) N/A",
"rungs": {"install":"pass","upgrade":"pass","backup_restore":"pass","functional":"pass",
"integration":"na","recipe_local":"na"},
"level": 5,
"rungs": {"install":"pass","upgrade":"pass","backup_restore":"skip","functional":"pass",
"lint":"pass"},
"lint": {"status":"pass","detail":"","rules_failed":[]},
"skips": {"intentional": {"backup_restore": "not backup-capable (no backupbot labels / declared)"},
"unintentional": []},
"stages": [{"name":"install","status":"pass",
"tests":[{"name":"test_serving","status":"pass","ms":168,"source":"generic"}]}],
"results": {"install":"pass","upgrade":"pass","backup":"pass","restore":"pass","custom":"pass"},
"results": {"install":"pass","upgrade":"pass","backup":"skip","restore":"skip","custom":"pass"},
"flags": {"clean_teardown": true, "no_secret_leak": true},
"screenshot": "screenshot.png", "summary_card": "summary.png"
}
```
`rungs` carries the four-status vocabulary above; `skips.intentional` maps each intentionally
skipped rung to its (declared or structural) reason and `skips.unintentional` lists the
unverified rungs. `lint` carries the L5 rung outcome + failing rule ids; the full
`abra recipe lint` output is served at `/runs/<run_id>/lint.txt`. Pre-lvl5 artifacts
(`"schema": 1`, 4-rung ladder, `level_cap_reason`/`level_cap_rung` present, `"na"` statuses)
are still rendered as-is by the dashboard/card — their stored level is never recomputed.
Assembly is **best-effort**: a failure to build/write `results.json` is logged but never changes the
run's exit code (cosmetics never block the pipeline, R7).

View File

@ -21,23 +21,24 @@ from __future__ import annotations
import html
import os
# Level → colour ramp (YunoHost-ish): red at the floor, climbing to green at the top.
# Level → colour ramp (YunoHost-ish): red at the floor, climbing to green at the top (L5 = full
# clean climb incl. lint — phase lvl5).
LEVEL_COLOR = {
0: "#e5534b", # red — install failed
1: "#e0823d", # orange
2: "#e0823d",
3: "#d9b343", # amber
4: "#a0b93f", # yellow-green
5: "#57ab5a", # green
6: "#3fb950", # bright green — full climb
4: "#a0b93f", # yellow-green — above functional, lint not earned
5: "#3fb950", # bright green — full climb (lint passed)
}
STATUS_MARK = {"pass": "", "fail": "", "skip": "", "error": "", "na": ""}
STATUS_MARK = {"pass": "", "fail": "", "skip": "", "error": "", "na": "", "unver": ""}
STATUS_COLOR = {
"pass": "#3fb950",
"fail": "#f85149",
"error": "#f85149",
"skip": "#8b949e",
"na": "#8b949e",
"unver": "#d29922", # amber — exercised? no: should have run and wasn't verified
}
@ -79,44 +80,15 @@ def render_badge_svg(label: str, message: str, color: str) -> str:
)
# Third-segment colours for the level badge: amber = an UNINTENTIONAL skip (a rung skipped but not
# in the recipe's intentional list — likely missing coverage) capped the climb; muted = an
# INTENTIONAL skip (declared in recipe_meta.EXPECTED_NA — nothing to fix). Font-safe text labels
# (no emoji) so the SVG renders anywhere.
# Amber for UNVERIFIED rung rows in the table (a rung that should have run and wasn't checked).
GAP_COLOR = "#d29922"
EXPECT_COLOR = "#6e7681"
def level_badge_svg(level: int, cap_reason: str = "", cap_skip: str = "") -> str:
"""Per-recipe/-run LEVEL badge: 'cc-ci | level N' coloured by level (R6), with a THIRD segment
that differentiates *why* the climb stopped when a SKIP capped it (`cap_skip`):
- "unintentional" (a rung skipped but not in the recipe's intentional list): amber 'gap?'.
- "intentional" (a skip declared in recipe_meta.EXPECTED_NA): muted 'expected'.
- "" (clean cap / full climb / a real failure): no third segment (the level + card carry it).
The badge never inflates — it only annotates the cap the level already reflects."""
label, msg = "cc-ci", f"level {int(level)}"
lw, mw = _text_width(label), _text_width(msg)
third: tuple[str, str] | None = None
if cap_skip == "unintentional":
third = ("gap?", GAP_COLOR)
elif cap_skip == "intentional":
third = ("expected", EXPECT_COLOR)
if third is None:
return render_badge_svg(label, msg, level_color(level))
txt, tcolor = third
tw = _text_width(txt)
w = lw + mw + tw
return (
f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="20" role="img" '
f'aria-label="{html.escape(label)}: {html.escape(msg)} ({html.escape(txt)})">'
f'<rect width="{lw}" height="20" fill="#555"/>'
f'<rect x="{lw}" width="{mw}" height="20" fill="{level_color(level)}"/>'
f'<rect x="{lw + mw}" width="{tw}" height="20" fill="{tcolor}"/>'
f'<g fill="#fff" font-family="Verdana,Geneva,sans-serif" font-size="11">'
f'<text x="6" y="14">{html.escape(label)}</text>'
f'<text x="{lw + 6}" y="14">{html.escape(msg)}</text>'
f'<text x="{lw + mw + 6}" y="14">{html.escape(txt)}</text></g></svg>'
)
def level_badge_svg(level: int) -> str:
"""Per-recipe/-run LEVEL badge: 'cc-ci | level N' coloured by level — NUMBER + COLOUR ONLY
(operator-specified, phase lvl5). 'Why isn't it higher' lives in the card's per-rung table,
never on the badge."""
return render_badge_svg("cc-ci", f"level {int(level)}", level_color(level))
def _stage_rows(stages: list[dict]) -> str:
@ -141,12 +113,13 @@ def _stage_rows(stages: list[dict]) -> str:
return "\n".join(rows) or '<tr><td colspan="3">no stages</td></tr>'
# Friendly rung labels for the skip rows (the four essential rungs).
# Friendly rung labels for the skip/unverified rows (the five essential rungs).
RUNG_LABEL = {
"install": "install",
"upgrade": "upgrade",
"backup_restore": "backup/restore",
"functional": "functional",
"lint": "lint",
}
SKIP_GREEN = (
"#57ab5a" # muted green — an intentional skip reads like a pass (but labelled, never inflating)
@ -154,9 +127,10 @@ SKIP_GREEN = (
def _skip_rows(skips: dict) -> str:
"""Render SKIPPED rungs as stage-like rows. An intentional (declared) skip looks like a pass row
but its status says 'INTENTIONAL SKIP' (muted green) with the declared reason on the line below;
an unintentional skip is amber 'UNINTENTIONAL SKIP' with a prompt to add a test or declare it."""
"""Render the non-run rungs as stage-like rows (phase lvl5 semantics). An INTENTIONAL skip
(declared/structural — the rung does not apply, the climb continues past it) is muted green
with its reason on the line below; an UNVERIFIED rung (should have run, wasn't checked — the
level cannot rise above it) is amber 'unverified'."""
rows = []
for rung, reason in (skips.get("intentional") or {}).items():
rows.append(
@ -171,11 +145,11 @@ def _skip_rows(skips: dict) -> str:
rows.append(
f'<tr class="stage"><td colspan="2"><span class="mark" style="color:{GAP_COLOR}">⊘</span>'
f"<b>{html.escape(RUNG_LABEL.get(rung, rung))}</b></td>"
f'<td class="st" style="color:{GAP_COLOR}">unintentional skip</td></tr>'
f'<td class="st" style="color:{GAP_COLOR}">unverified</td></tr>'
)
rows.append(
'<tr class="skipreason"><td></td><td colspan="2">not declared in EXPECTED_NA — add the '
"missing test/label, or declare the skip with a reason</td></tr>"
'<tr class="skipreason"><td></td><td colspan="2">rung did not run / could not be '
"checked — the level cannot rise above an unverified rung</td></tr>"
)
return "\n".join(rows)
@ -184,13 +158,15 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
"""Build the summary-card HTML from a results.json dict. `screenshot_rel` is the relative path to
the screenshot PNG (same dir as the card) — omitted from the card if None / absent.
The card shows exactly what the data says: recipe + version, the level badge + cap reason, the
per-stage/per-test ✔/✘ table, the invariant flags, and the app screenshot. No computation here."""
The card shows exactly what the data says: recipe + version, the level, the per-stage/per-test
✔/✘ table (+ skip/unverified rung rows — the SOLE carrier of "why isn't the level higher"),
the invariant flags, and the app screenshot. No computation here. Tolerates old (schema-1)
artifacts: the ladder height is read off the rungs the artifact actually has."""
recipe = html.escape(str(data.get("recipe", "?")))
version = html.escape(str(data.get("version") or data.get("ref") or ""))
level = int(data.get("level", 0))
cap_reason = str(data.get("level_cap_reason") or "")
cap = html.escape(cap_reason)
# Old (pre-lvl5) artifacts have a 4-rung ladder — render their "of N" honestly.
ladder_top = 5 if "lint" in (data.get("rungs") or {}) else 4
sk = data.get("skips", {}) or {}
color = level_color(level)
flags = data.get("flags", {}) or {}
@ -221,7 +197,7 @@ body{{margin:0;font-family:system-ui,-apple-system,Segoe UI,sans-serif;backgroun
.lvl .num{{display:inline-block;min-width:64px;padding:.3rem .7rem;border-radius:10px;
font-size:1.6rem;font-weight:700;color:#0d1117;background:{color}}}
.lvl .lbl{{display:block;color:#8b949e;font-size:.72rem;text-transform:uppercase;margin-top:.2rem}}
.cap{{padding:.4rem 1.3rem;color:#8b949e;font-size:.82rem;border-bottom:1px solid #21262d}}
.ladder{{padding:.4rem 1.3rem;color:#8b949e;font-size:.82rem;border-bottom:1px solid #21262d}}
.body{{display:flex;gap:1rem;padding:1rem 1.3rem}}
.tbl{{flex:1}}
table{{border-collapse:collapse;width:100%;font-size:.85rem}}
@ -238,12 +214,12 @@ tr.skipreason td{{color:#8b949e;font-size:.78rem;font-style:italic;padding-top:0
.shot.noshot{{display:flex;align-items:center;justify-content:center;height:225px;color:#8b949e;font-size:.85rem}}
.flags{{display:flex;gap:.6rem;padding:.6rem 1.3rem 1rem}}
.flag{{border:1px solid;border-radius:6px;padding:.15rem .5rem;font-size:.78rem;color:#c9d1d9}}
.cap b{{color:#c9d1d9}}
.ladder b{{color:#c9d1d9}}
</style></head><body><div class="card">
<div class="hd">{FLOWER_SVG}
<div class="title"><h1>{recipe}</h1><span class="ver">{version}</span></div>
<div class="lvl"><span class="num">{level}</span><span class="lbl">level</span></div></div>
<div class="cap">{("<b>capped:</b> " + cap) if cap else "<b>full clean climb</b> — top level (4)"}</div>
<div class="ladder"><b>level {level} of {ladder_top}</b></div>
<div class="body"><div class="tbl"><table>{rows}</table></div>{shot_html}</div>
<div class="flags">{"".join(flag_bits)}</div>
</div></body></html>"""

View File

@ -1,67 +1,67 @@
"""Phase 3 — the level ladder (plan-phase3-results-ux.md §4.1, R1).
"""The level ladder — five rungs, no capping (phase lvl5, plan-phase-lvl5-lint-rung.md).
A single integer **level** summarising how far up the quality ladder a recipe run climbed, with
YunoHost semantics: **a gap caps the level** — you only earn level L if every rung 1..L was a clean
PASS. The first rung that is not a clean PASS (a real FAIL *or* genuinely N/A for this recipe) stops
the climb; `cap_reason` records why. This is deliberately conservative: presentation must NEVER make
a run look greener than its tests (plan §6 cardinal guardrail), so an N/A rung caps just like a fail
— with a recorded reason so the level is *fair*, not inflated.
The ladder is the FOUR essential rungs every recipe is held to:
A single integer **level** summarising how far up the quality ladder a recipe run climbed:
L0 — install failed / app never became healthy.
L1 — Installs: deploys + passes health/readiness.
L2 — Upgrades: previous published version → PR version, stays healthy, data intact.
L3 — Backup/restore: seeded data survives backup → wipe → restore.
L4 — Functional: recipe-specific functional tests pass.
L5 — Lint: `abra recipe lint` passes against the exact ref under test.
Integration (SSO/OIDC + cross-app) and recipe-local (the recipe repo's own tests/) are **OPTIONAL**
capabilities — they are NOT part of the level ladder and never cap it. They still run when present
(and SSO is still enforced for the run VERDICT via the deps/SSO checks in run_recipe_ci.py), but a
recipe without an SSO surface or without repo-local tests is simply not penalised on the level.
Semantics (operator-decided 2026-06-11, recorded in DECISIONS.md — replaces the Phase-3
"N/A caps" rule):
This module is PURE (no I/O) so it is cheaply unit-testable and the Adversary can re-run the unit
test cold (`cc-ci-run -m pytest tests/unit/test_level.py -q`). The orchestrator
(`run_recipe_ci.py`) is responsible for translating its raw per-tier results into the rung-status
dict this function consumes; that mapping is documented in DECISIONS.md (Phase 3).
level = max i such that rung_i == "pass" and every rung j < i is "pass" or "skip"; 0 if none.
Rung status vocabulary (each rung ∈ these three):
"pass" the rung was exercised and passed.
"fail" the rung was exercised and failed.
"na" the rung does not apply to this recipe (e.g. only one published version → no upgrade;
not backup-capable). N/A is NOT a failure, but it DOES cap the climb (with a distinct
cap_reason) so the level never overstates what was actually verified.
A rung has one of FOUR statuses:
"pass" — exercised and passed.
"fail" — exercised and failed. Blocks: no rung above it can count.
"skip"INTENTIONAL skip: the rung genuinely does not apply to this recipe, from a
declared or structural fact (not backup-capable; only one published version;
declared in recipe_meta.EXPECTED_NA). Does NOT stop the climb.
"unver" — UNINTENTIONAL not-verified: the rung SHOULD have run but didn't (infra error,
missing tool, harness exception, prior-stage abort, timeout). Blocks exactly
like a fail — the level never rises above a rung that wasn't actually checked.
The per-rung table (results.json `rungs`, card, dashboard) is the SOLE carrier of "why isn't
this level higher" — there is no cap_reason. The classification of every N/A source into
skip-vs-unver lives in derive_rungs (results.py) and is tabulated in DECISIONS.md; anything
unclassifiable defaults to "unver" (conservative: never claim what wasn't checked).
Integration (SSO/OIDC + cross-app) and recipe-local (the recipe repo's own tests/) remain
OPTIONAL capabilities — not rungs, never counted (SSO is still enforced for the run VERDICT
via the deps/SSO checks in run_recipe_ci.py).
This module is PURE (no I/O) so it is cheaply unit-testable and the Adversary can re-run the
unit test cold (`cc-ci-run -m pytest tests/unit/test_level.py -q`).
"""
from __future__ import annotations
# The climbable rungs in ascending order. install (L1) is the foundation; L0 means install itself
# did not pass. Each later rung requires every earlier rung to be a clean PASS. These four are the
# ESSENTIAL rungs — integration/recipe-local are optional and deliberately NOT in this tuple.
RUNGS = ("install", "upgrade", "backup_restore", "functional")
# The climbable rungs in ascending order. install (L1) is the foundation; L0 means install
# itself did not pass. These five are the ESSENTIAL rungs — integration/recipe-local are
# optional and deliberately NOT in this tuple.
RUNGS = ("install", "upgrade", "backup_restore", "functional", "lint")
# Human-readable label per rung level, for cap_reason + the summary card.
# Human-readable label per rung level, for the summary card / docs.
RUNG_LABEL = {
1: "install (deploy + health)",
2: "upgrade (prev published → PR)",
3: "backup/restore (data integrity)",
4: "functional (recipe-specific tests)",
5: "lint (abra recipe lint)",
}
VALID = {"pass", "fail", "na"}
VALID = {"pass", "fail", "skip", "unver"}
def compute_level(rungs: dict[str, str]) -> tuple[int, str]:
"""Map a rung-status dict → (level 0..4, cap_reason).
def compute_level(rungs: dict[str, str]) -> int:
"""Map a rung-status dict → level 0..5.
`rungs` must contain a status in {"pass","fail","na"} for every name in RUNGS. The level is the
highest L such that rungs[1..L] are all "pass"; the first non-"pass" rung caps the climb. L0 is
returned when the install rung itself is not "pass" (install failed / never healthy).
cap_reason explains where the climb stopped:
- "" (empty) when the recipe earned the top rung (L4, full clean climb).
- "L<k> <label> FAILED" when a rung was exercised and failed.
- "L<k> <label> N/A" when a rung does not apply to this recipe.
Returns the reason for the FIRST rung that stopped the climb (the binding constraint).
`rungs` must contain a status in VALID for every name in RUNGS. The level is the highest
i such that rungs[i] == "pass" and every rung below i is "pass" or "skip" (an intentional
skip does not stop the climb). A "fail" or "unver" rung blocks: rungs above it cannot
count, however green. 0 when no rung qualifies.
"""
for name in RUNGS:
st = rungs.get(name)
@ -69,52 +69,44 @@ def compute_level(rungs: dict[str, str]) -> tuple[int, str]:
raise ValueError(
f"rung {name!r} has invalid status {st!r} (expect one of {sorted(VALID)})"
)
# L0: install did not pass.
if rungs["install"] != "pass":
if rungs["install"] == "fail":
return 0, "L1 " + RUNG_LABEL[1] + " FAILED"
# install N/A is not a real-world state for a deploy run, but handle it for totality.
return 0, "L1 " + RUNG_LABEL[1] + " N/A"
# Climb: stop at the first rung that is not a clean pass.
level = 0
for idx, name in enumerate(RUNGS, start=1):
if rungs[name] == "pass":
st = rungs[name]
if st == "pass":
level = idx
elif st == "skip":
continue
# first non-pass rung — caps the climb
kind = "FAILED" if rungs[name] == "fail" else "N/A"
return level, f"L{idx} {RUNG_LABEL[idx]} {kind}"
# Full clean climb to the top rung.
return level, ""
else: # fail / unver — nothing above this rung can count
break
return level
def backup_restore_status(backup: str | None, restore: str | None, backup_capable: bool) -> str:
"""Collapse the backup + restore tier results into the single L3 rung status.
Both tiers must pass for the rung to pass (the rung is "seeded data survives backup→wipe→restore",
which is only verified if BOTH the backup and the restore tier are green). If the recipe is not
backup-capable, both tiers skip → the rung is N/A (caps at L2, recorded). A fail in either tier
fails the rung.
Not backup-capable (a declared/structural fact: no backupbot labels, or
recipe_meta.BACKUP_CAPABLE=False) → "skip" — the rung genuinely does not apply.
Otherwise both tiers must pass for the rung to pass; a fail in either tier fails it; any
other shape (tier skipped or never ran while backup-capable — e.g. a prior-stage abort)
is "unver": the rung should have been verified and wasn't.
"""
if not backup_capable:
return "na"
return "skip"
vals = {backup, restore}
if "fail" in vals:
return "fail"
if backup == "pass" and restore == "pass":
return "pass"
# any skip/None while backup-capable → not verified → treat as N/A (cannot claim L3)
return "na"
return "unver"
def tier_to_rung(status: str | None) -> str:
"""Map a single tier result ('pass'|'fail'|'skip'|None) to a rung status. 'skip'/None → 'na'
(the tier did not apply / did not run), so it caps the climb without being counted as a failure."""
"""Map a single tier result ('pass'|'fail'|'skip'|None) to a rung status, with NO
intentionality information: a tier that did not produce a pass/fail is "unver" (it should
have run and wasn't verified). The caller (derive_rungs) upgrades "unver" to "skip" where
a declared/structural fact makes the skip intentional — never the other way around."""
if status == "pass":
return "pass"
if status == "fail":
return "fail"
return "na"
return "unver"

174
runner/harness/lint.py Normal file
View File

@ -0,0 +1,174 @@
"""L5 lint rung — run `abra recipe lint` against the exact ref under test (phase lvl5).
Executor + classifier for the fifth ladder rung. Design constraints (plan-phase-lvl5 §2):
- **Lints the recipe's CONTENT, not the harness plumbing.** abra lint reads every
`compose*.yml` in the tree (including the CI's untracked install_steps overlays) and
force-fetches tags from `origin` (which on PR runs is the private mirror, unauthenticated
here → FATA). Both are harness artifacts, so the executor lints a PRISTINE scratch clone of
the per-run tree, checked out at the exact tested ref: `origin` becomes a local path (tag
fetch works offline, no auth) and the run's true tag set rides along (fetch_recipe pulls the
upstream version tags into the per-run tree). No lint rule is filtered or ignored.
- **rc is not the verdict.** `abra recipe lint` exits non-zero only when it cannot lint
(FATA); rule outcomes live in its table — error-severity ❌ rows print a trailing
"WARN critical errors present …" sentinel but still exit 0. So the classifier parses the
table: FAIL iff an error-severity rule is unsatisfied (or the FATA is content-attributable:
"unable to validate recipe" — the recipe config itself is invalid). PASS iff the table
rendered and no error rule failed. ANYTHING else — timeout, abra/script missing, tag-fetch
FATA, unparseable output — is "unver": loud, never a silent pass, never an intentional skip.
- **Best-effort + time-bounded.** Hard ~60s timeout (observed runtime ≈0.7s); the caller
wraps run_lint in try/except besides — a wedged lint can never hang or fail a run, and the
run VERDICT is untouched by any lint outcome (lint is a level rung, not a gate).
- Full command output (+ cmd, rc, ref header) is captured to `lint.txt` in the run artifact
dir; results.json carries status + short excerpt (failing rule ids).
abra needs a PTY even with -n ("inappropriate ioctl on device") → run via util-linux
`script -qec`, same trick as harness.abra._run_pty.
"""
from __future__ import annotations
import os
import re
import shlex
import shutil
import subprocess
import tempfile
from . import abra
LINT_TIMEOUT = 60 # hard budget, seconds; observed ~0.7s per recipe
# Strip ANSI escape sequences from PTY output before parsing.
_ANSI = re.compile(r"\x1b\[[0-9;?]*[A-Za-z]")
# A table row: ┃ R014 ┃ description ┃ error ┃ ✅/❌ ┃ skipped ┃ how-to-fix ┃ — abra renders the
# grid with HEAVY box-drawing verticals (┃ U+2503); accept the light variant (│ U+2502) too.
_ROW = re.compile(
r"^\s*[│┃]\s*(R\d+)\s*[│┃](.*?)[│┃]\s*(warn|error)\s*[│┃]\s*(✅|❌)\s*[│┃]\s*([^│┃]*)[│┃]"
)
# abra's trailing sentinel when any error-severity rule is unsatisfied (cross-check only).
_SENTINEL = "critical errors present"
# FATA classes that are the RECIPE's fault (its config cannot even be validated) — a lint
# FAIL, not an unverified rung. Everything else non-zero is environmental → unver.
_CONTENT_FATA = "unable to validate recipe"
def parse_table(output: str) -> list[dict]:
"""Parse the lint table → rows {rule, desc, severity, satisfied(bool), skipped(bool)}.
Tolerant: lines that don't match are ignored; returns [] when no table rendered."""
rows = []
for line in _ANSI.sub("", output).replace("\r", "\n").splitlines():
m = _ROW.match(line)
if not m:
continue
rule, desc, severity, mark, skipped = m.groups()
rows.append(
{
"rule": rule,
"desc": desc.strip(),
"severity": severity,
"satisfied": mark == "",
"skipped": skipped.strip() not in ("", "-"),
}
)
return rows
def classify(rc: int | None, output: str) -> tuple[str, str, list[str]]:
"""(status, detail, failed_rule_ids) from a finished lint invocation.
status ∈ {"pass","fail","unver"}; never a silent pass: pass requires a parsed table with
zero unsatisfied error-severity rules AND no sentinel. `rc=None` means the run itself blew
up (timeout/missing binary) — always unver; the caller supplies the detail.
"""
if rc is None:
return "unver", "lint did not run", []
if rc != 0:
first = next((ln for ln in _ANSI.sub("", output).splitlines() if "FATA" in ln), "").strip()
if _CONTENT_FATA in output:
# The recipe config itself failed validation — attributable to recipe content.
return "fail", first or "recipe config failed validation", []
return "unver", first or f"abra recipe lint exited {rc} with no table", []
rows = parse_table(output)
if not rows:
return "unver", "no lint table in output (rc=0)", []
failed = [
r["rule"]
for r in rows
if r["severity"] == "error" and not r["satisfied"] and not r["skipped"]
]
if failed:
return "fail", f"error rule(s) unsatisfied: {', '.join(failed)}", failed
if _SENTINEL in output:
# abra says critical errors but our parse found none — distrust the parse, never inflate.
return "fail", "abra reported critical errors (table parse found none)", []
return "pass", "", []
def run_lint(recipe: str, ref: str | None, out_dir: str | None) -> dict:
"""Execute the lint rung for `recipe` at exactly `ref` (a sha; None → the per-run tree's
current HEAD). Returns {"status","detail","rules_failed"} and writes lint.txt into
`out_dir` (when given). Never raises: every failure mode is caught into status "unver"."""
scratch = None
rc: int | None = None
output = ""
try:
src_tree = abra.recipe_dir(recipe)
scratch = tempfile.mkdtemp(prefix="ccci-lint-")
lint_abra = os.path.join(scratch, "abra")
os.makedirs(os.path.join(lint_abra, "recipes"))
clone = os.path.join(lint_abra, "recipes", recipe)
subprocess.run(
["git", "clone", "--quiet", src_tree, clone],
check=True,
capture_output=True,
text=True,
timeout=LINT_TIMEOUT,
)
if ref:
subprocess.run(
["git", "-C", clone, "checkout", "-f", "--quiet", ref],
check=True,
capture_output=True,
text=True,
timeout=LINT_TIMEOUT,
)
# catalogue: R006 (published catalogue version) reads it; servers: harmless, some abra
# paths stat it. Symlink the live ones (read-only use).
for shared in ("catalogue", "servers"):
src = os.path.join(abra.abra_dir(), shared)
if os.path.exists(src):
os.symlink(os.path.realpath(src), os.path.join(lint_abra, shared))
env = dict(os.environ, ABRA_DIR=lint_abra)
proc = subprocess.run(
["script", "-qec", f"abra recipe lint -n {shlex.quote(recipe)}", "/dev/null"],
capture_output=True,
text=True,
timeout=LINT_TIMEOUT,
env=env,
)
rc, output = proc.returncode, proc.stdout + proc.stderr
status, detail, failed = classify(rc, output)
except subprocess.TimeoutExpired:
status, detail, failed = "unver", f"lint timed out after {LINT_TIMEOUT}s", []
except Exception as e: # noqa: BLE001 — rung must never break the run; unver is the honest floor
status, detail, failed = "unver", f"lint executor error: {e.__class__.__name__}: {e}", []
finally:
if scratch:
shutil.rmtree(scratch, ignore_errors=True)
if status == "unver":
print(f"!! lint rung UNVERIFIED for {recipe}: {detail}", flush=True)
if out_dir:
try:
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "lint.txt"), "w", encoding="utf-8") as f:
f.write(
f"$ abra recipe lint -n {recipe} (ref={ref or 'HEAD'})\n"
f"rc={rc} status={status} {detail}\n\n{output}"
)
except OSError as e:
print(f" lint: could not write lint.txt (non-fatal): {e}", flush=True)
return {"status": status, "detail": detail, "rules_failed": failed}

View File

@ -70,13 +70,13 @@ KEYS: tuple[Key, ...] = (
"BACKUP_CAPABLE",
"bool",
None,
"Override the backup-tier capability auto-detect (compose `backupbot.backup` labels). `False` forces N/A; `True` forces the tier on; unset = auto-detect.",
"Override the backup-tier capability auto-detect (compose `backupbot.backup` labels). `False` forces an intentional skip of the backup/restore rung; `True` forces the tier on; unset = auto-detect.",
),
Key(
"EXPECTED_NA",
"dict",
None,
"Declare an N/A rung intentional: `{rung: reason}`. The cap stands either way; only the report wording changes.",
"Declare a non-run rung an INTENTIONAL skip: `{rung: reason}` — the level climbs past it; an undeclared non-run rung is *unverified* and blocks the level above it (classification table: machine-docs/DECISIONS.md phase lvl5). Never overrides an exercised pass/fail; the `lint` rung has no escape hatch.",
),
Key(
"READY_PROBE",

View File

@ -1,20 +1,22 @@
"""Phase 3 — structured run results + results.json (plan-phase3-results-ux.md §4.2, R1/R3).
"""Structured run results + results.json (Phase 3 §4.2 R1/R3; level semantics: phase lvl5).
Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying:
{ recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
level, level_cap_reason, level_cap_rung, rungs,
level, rungs, lint:{status,detail,rules_failed},
skips:{intentional:{rung:reason}, unintentional:[rung]},
flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
`skips` splits the N/A (skipped) rungs by a simple rule: a skip is INTENTIONAL iff the recipe lists
it (with a reason) in `recipe_meta.EXPECTED_NA = {rung: reason}`; any rung skipped but not listed is
UNINTENTIONAL (a coverage gap to fill or declare). Skips still cap the level either way — the harness
never claims a rung it did not verify; this only labels *why* a skip happened.
Rung statuses (phase lvl5, operator-decided — see harness.level + DECISIONS.md): every rung is
"pass" | "fail" | "skip" (INTENTIONAL — a declared/structural fact says the rung does not apply)
| "unver" (UNINTENTIONAL — the rung should have run and wasn't verified; blocks the level like a
fail). `derive_rungs` is the single place every N/A source is classified; anything it cannot
attribute to a declared/structural fact defaults to "unver" (conservative). `skips` mirrors that
split into results.json: intentional {rung: reason} / unintentional [rung] (= the unver rungs).
The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
rung-status dict derived here (`derive_rungs`) from the tier results + deps/SSO signals the
orchestrator holds; that mapping is documented in DECISIONS.md (Phase 3).
rung-status dict derived here (`derive_rungs`) from the tier results + structural signals the
orchestrator holds; the classification table is in DECISIONS.md (phase lvl5).
This module is import-pure (no side effects at import). `write_results` is the only writer; the
orchestrator calls the build/write path inside a try/except so a results failure NEVER changes the
@ -138,53 +140,90 @@ def derive_rungs(
results: dict[str, str],
*,
backup_capable: bool,
has_custom: bool,
has_upgrade_target: bool,
expected_na: dict | None = None,
lint_status: str | None = None,
) -> dict[str, str]:
"""Translate the orchestrator's tier results into the rung-status dict harness.level consumes —
the FOUR essential rungs only. Conservative by design — never reports a rung 'pass' it can't
substantiate (cardinal guardrail: presentation never inflates).
"""Translate the orchestrator's tier results + structural signals into the rung-status dict
harness.level consumes — the FIVE essential rungs. This is the SINGLE place every N/A source
is classified intentional ("skip") vs unintentional ("unver"); the table lives in DECISIONS.md
(phase lvl5). Conservative by design: never reports "pass" it can't substantiate, and any
rung that did not produce a pass/fail and has NO declared/structural reason is "unver".
L1 install : install tier pass.
L2 upgrade : upgrade tier (skip → N/A: only one published version).
L3 backup/res : backup AND restore tiers pass (N/A if not backup-capable).
L4 functional : recipe-specific functional tests pass — the custom tier. N/A if none ran.
L1 install : install tier pass. Always applies — never "skip" (non-run → unver).
L2 upgrade : upgrade tier. Tier skipped + no upgrade target (only one published
version, structural) → "skip"; declared in EXPECTED_NA → "skip";
anything else non-pass/fail (prior-stage abort, tier excluded) → "unver".
L3 backup/res : backup AND restore tiers pass. Not backup-capable (declared/structural)
"skip"; EXPECTED_NA → "skip"; unverified-while-capable → "unver".
L4 functional : the custom tier. No custom tests / tier skipped → EXPECTED_NA-declared
"skip", else "unver" (absent functional coverage is a gap, not an
intentional property of the recipe).
L5 lint : from the lint executor (harness.lint). pass/fail only — every recipe can
be linted, so there is NO intentional-skip escape hatch: a lint that
could not run (timeout, abra missing, executor error) is "unver".
Integration (SSO/OIDC) and recipe-local are OPTIONAL and intentionally NOT rungs here — they
never cap the level (SSO is still enforced for the run VERDICT in run_recipe_ci.py).
never affect the level (SSO is still enforced for the run VERDICT in run_recipe_ci.py).
"""
expected = set((expected_na or {}).keys())
rungs: dict[str, str] = {}
rungs["install"] = level_mod.tier_to_rung(results.get("install"))
rungs["upgrade"] = level_mod.tier_to_rung(results.get("upgrade"))
rungs["backup_restore"] = level_mod.backup_restore_status(
up = results.get("upgrade")
if up in ("pass", "fail"):
rungs["upgrade"] = up
elif up == "skip" and not has_upgrade_target:
# The orchestrator skipped the tier for the structural reason: nothing to upgrade from.
rungs["upgrade"] = "skip"
elif "upgrade" in expected:
rungs["upgrade"] = "skip"
else:
rungs["upgrade"] = "unver"
br = level_mod.backup_restore_status(
results.get("backup"), results.get("restore"), backup_capable
)
if br == "unver" and "backup_restore" in expected:
br = "skip"
rungs["backup_restore"] = br
custom = results.get("custom")
if not has_custom or custom == "skip" or custom is None:
rungs["functional"] = "na"
elif custom == "fail":
rungs["functional"] = "fail"
else: # custom == "pass"
rungs["functional"] = "pass"
if custom in ("pass", "fail"):
rungs["functional"] = custom
elif "functional" in expected:
rungs["functional"] = "skip"
else:
rungs["functional"] = "unver"
rungs["lint"] = lint_status if lint_status in ("pass", "fail") else "unver"
return rungs
def skips(rungs: dict[str, str], expected_na: dict | None) -> dict:
"""Split the SKIPPED (N/A) rungs into intentional vs unintentional (operator model).
# Reasons attached to STRUCTURAL intentional skips (no EXPECTED_NA declaration needed — the
# fact is read off the recipe itself).
_STRUCTURAL_REASON = {
"upgrade": "only one published version — no upgrade target",
"backup_restore": "not backup-capable (no backupbot labels / declared)",
}
A recipe lists the rungs it intentionally skips, each with a reason, in
`recipe_meta.EXPECTED_NA = {rung: reason}`. The rule is dead simple: a skipped rung is
**intentional** iff it is in that list; any rung that is skipped and NOT in the list is
**unintentional** (a coverage gap someone should either fill or declare). N/A still caps the
level either way — the harness never claims a rung it did not verify — this only labels *why* a
skip happened. Returns:
{ "intentional": {rung: reason, ...}, # skipped AND declared in EXPECTED_NA
"unintentional": [rung, ...] } # skipped but NOT declared
"""
def skips(
rungs: dict[str, str],
expected_na: dict | None,
) -> dict:
"""Mirror the rung classification into results.json's `skips` block:
{ "intentional": {rung: reason, ...}, # status "skip" — declared/structural, with why
"unintentional": [rung, ...] } # status "unver" — should have run, wasn't verified
The reason is the recipe's EXPECTED_NA declaration when present, else the structural fact
derive_rungs skipped on. Purely descriptive — the level math lives in harness.level."""
expected = {str(k): str(v) for k, v in (expected_na or {}).items()}
na = [r for r, st in rungs.items() if st == "na"]
intentional = {r: expected[r] for r in na if r in expected}
unintentional = sorted(r for r in na if r not in expected)
intentional = {
r: expected.get(r) or _STRUCTURAL_REASON.get(r, "declared intentional")
for r, st in rungs.items()
if st == "skip"
}
unintentional = sorted(r for r, st in rungs.items() if st == "unver")
return {"intentional": intentional, "unintentional": unintentional}
@ -200,6 +239,8 @@ def build_results(
clean_teardown: bool,
no_secret_leak: bool,
finished_ts: float | None,
has_upgrade_target: bool = True,
lint: dict | None = None,
screenshot: str | None = None,
summary_card: str | None = None,
expected_na: dict | None = None,
@ -207,17 +248,41 @@ def build_results(
) -> dict:
"""Assemble the full results.json dict (no I/O). `finished_ts` is passed in (the orchestrator
stamps it) so this stays pure and deterministic for unit tests. `expected_na` is the recipe's
declared intentional-skip map (recipe_meta.EXPECTED_NA) used to distinguish a deliberate skip from
accidentally-missing coverage."""
declared intentional-skip map (recipe_meta.EXPECTED_NA); `has_upgrade_target` is the structural
"a previous published version exists" fact; `lint` is harness.lint.run_lint's result dict
(None — e.g. an old caller — derives the lint rung as "unver": never a silent pass)."""
stages = collect_stages(records)
has_custom = any(r["tier"] == "custom" for r in records)
rungs = derive_rungs(results, backup_capable=backup_capable, has_custom=has_custom)
lvl, cap_reason = level_mod.compute_level(rungs)
# The rung that capped the climb (lowest non-pass), or None on a full climb — lets a consumer
# (card/badge) tell whether the cap was an intentional skip, an unintentional one, or a failure.
capped = level_mod.RUNGS[lvl] if cap_reason else None
lint = lint or {}
lint_status = lint.get("status")
rungs = derive_rungs(
results,
backup_capable=backup_capable,
has_upgrade_target=has_upgrade_target,
expected_na=expected_na,
lint_status=lint_status,
)
# Surface lint in the per-stage table too (it has no pytest/JUnit tier), so the card's
# stage breakdown carries all five rungs.
if rungs["lint"] != "skip": # lint is never "skip", but stay defensive
stages.append(
{
"name": "lint",
"status": rungs["lint"],
"tests": [
{
"name": "abra recipe lint",
"classname": "lint",
"source": "harness",
"status": rungs["lint"],
"ms": 0,
"message": str(lint.get("detail") or ""),
}
],
}
)
lvl = level_mod.compute_level(rungs)
return {
"schema": 1,
"schema": 2,
"run_id": run_id(),
"recipe": recipe,
"version": version,
@ -225,9 +290,12 @@ def build_results(
"ref": (ref or "")[:12],
"finished": finished_ts,
"level": lvl,
"level_cap_reason": cap_reason,
"level_cap_rung": capped,
"rungs": rungs,
"lint": {
"status": rungs["lint"],
"detail": str(lint.get("detail") or ""),
"rules_failed": list(lint.get("rules_failed") or []),
},
"skips": skips(rungs, expected_na),
"stages": stages,
"results": results,

View File

@ -58,6 +58,9 @@ from harness import ( # noqa: E402
from harness import ( # noqa: E402
deps as deps_mod,
)
from harness import ( # noqa: E402
lint as lint_mod,
)
from harness import ( # noqa: E402
manifest as manifest_mod,
)
@ -928,6 +931,24 @@ def main() -> int:
run_artifact_dir = os.path.join(results_mod.runs_dir(), results_mod.run_id())
junit_dir = os.path.join(run_artifact_dir, "junit")
records: list[dict] = []
# L5 lint rung (phase lvl5): `abra recipe lint` against the EXACT tested ref, in a pristine
# scratch clone (harness.lint — the per-run tree is still at head_ref here, before any
# version-pinning checkout). Level rung only — NEVER the verdict: run_lint catches every
# failure mode into status "unver" (60s hard budget) and this belt-and-braces wrap makes a
# crashed executor identical to "could not verify".
lint_result = {"status": "unver", "detail": "lint executor crashed", "rules_failed": []}
try:
lint_result = lint_mod.run_lint(recipe, head_ref, run_artifact_dir)
except Exception as e: # noqa: BLE001 — lint is a rung, not a gate; never touches the verdict
print(
f"!! lint rung executor crashed (non-fatal, rung=unver): {_scrub(str(e))}", flush=True
)
print(
f"lint rung: {lint_result['status']}"
f"{'' + lint_result['detail'] if lint_result.get('detail') else ''}",
flush=True,
)
with contextlib.suppress(OSError):
os.makedirs(junit_dir, exist_ok=True)
@ -1253,6 +1274,8 @@ def main() -> int:
records=records,
results=results,
backup_capable=backup_cap,
has_upgrade_target=prev is not None, # structural: a previous published version exists
lint=lint_result, # L5 rung (phase lvl5)
clean_teardown=clean_teardown,
no_secret_leak=True, # narrowed below by an actual scan of the serialised artifact
screenshot=screenshot_rel, # Phase 3 U1 (R4): relative PNG name iff capture succeeded
@ -1270,17 +1293,15 @@ def main() -> int:
file=sys.stderr,
)
path = results_mod.write_results(data)
print(
f"results.json written: {path} (level={data['level']}"
f"{'' + data['level_cap_reason'] if data['level_cap_reason'] else ''})",
flush=True,
)
# Surface UNINTENTIONAL skips in the CI log (non-blocking, R7): a rung that was skipped (N/A)
# but is not in the recipe's intentional list — either add the missing coverage or declare it.
print(f"results.json written: {path} (level={data['level']} of 5)", flush=True)
# Surface UNVERIFIED rungs in the CI log (non-blocking, R7): a rung that should have run
# and wasn't verified blocks the level above it — fill the coverage, or (where a
# declared/structural reason genuinely applies) declare it in EXPECTED_NA.
for rung in data.get("skips", {}).get("unintentional", []):
print(
f"⚠ coverage: rung '{rung}' was skipped (N/A) but is not declared intentional — add "
f"the missing test/label, or list it in tests/{recipe}/recipe_meta.py "
f"⚠ coverage: rung '{rung}' is UNVERIFIED (did not run / could not be checked) — "
f"the level cannot rise above it. Add the missing test/coverage, or declare a "
f"genuine inapplicability in tests/{recipe}/recipe_meta.py "
f"EXPECTED_NA = {{'{rung}': '<why>'}}.",
flush=True,
)
@ -1302,21 +1323,10 @@ def main() -> int:
with open(html_path, "w", encoding="utf-8") as f:
f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot")))
png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png"))
capped = data.get("level_cap_rung")
sk = data.get("skips", {})
cap_skip = (
"intentional"
if capped in (sk.get("intentional") or {})
else "unintentional"
if capped in (sk.get("unintentional") or [])
else ""
)
# Badge = level only (number + colour) — the per-rung table on the card is the sole
# carrier of "why isn't this higher" (operator-specified, phase lvl5).
with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f:
f.write(
card_mod.level_badge_svg(
data["level"], data.get("level_cap_reason", ""), cap_skip
)
)
f.write(card_mod.level_badge_svg(data["level"]))
print(
f"summary card {'rendered ' + png if png else '(PNG render unavailable)'} + "
f"badge.svg written into {run_artifact_dir}",

View File

@ -1,8 +1,11 @@
"""Unit tests for the pure card/badge renderers (harness.card), Phase 3 U2 (R3/R6).
"""Unit tests for the pure card/badge renderers (harness.card) — phase lvl5 semantics.
Covers the deterministic HTML + SVG string builders (the PNG step needs Playwright + is exercised in
the U2 live demo). The cardinal check: the card REPORTS the data verbatim — level/marks come straight
from the dict, never recomputed. Run cold: cc-ci-run -m pytest tests/unit/test_card.py -q
Covers the deterministic HTML + SVG string builders (the PNG step needs Playwright + is exercised
live). The cardinal check: the card REPORTS the data verbatim — level/marks come straight from the
dict, never recomputed — the badge is NUMBER + COLOUR ONLY, and the per-rung table rows (incl.
intentional-skip / unverified) are the sole carrier of "why isn't the level higher". Old schema-1
artifacts (4-rung ladder, cap fields present) must render without error and without relabeling.
Run cold: cc-ci-run -m pytest tests/unit/test_card.py -q
"""
from __future__ import annotations
@ -14,12 +17,19 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")
from harness import card as C # noqa: E402
def _data(level=3, cap="L4 functional (recipe-specific tests) N/A"):
return {
def _data(level=5, **kw):
d = {
"schema": 2,
"recipe": "uptime-kuma",
"version": "1.23.0",
"level": level,
"level_cap_reason": cap,
"rungs": {
"install": "pass",
"upgrade": "pass",
"backup_restore": "pass",
"functional": "pass",
"lint": "pass",
},
"flags": {"clean_teardown": True, "no_secret_leak": True},
"screenshot": "screenshot.png",
"stages": [
@ -36,46 +46,54 @@ def _data(level=3, cap="L4 functional (recipe-specific tests) N/A"):
{"name": "test_broken", "status": "fail", "ms": 5},
],
},
{
"name": "lint",
"status": "pass",
"tests": [{"name": "abra recipe lint", "status": "pass", "ms": 0}],
},
],
}
d.update(kw)
return d
def test_level_color_ramp():
assert C.level_color(0) != C.level_color(6)
assert C.level_color(6) == "#3fb950"
assert C.level_color(99) == "#8b949e" # unknown → grey
# 0 (red) … 5 (bright green — full 5-rung climb); unknown → grey.
assert C.level_color(0) != C.level_color(5)
assert C.level_color(5) == "#3fb950"
assert C.level_color(99) == "#8b949e"
def test_badge_svg_wellformed():
def test_badge_svg_is_number_and_color_only():
svg = C.level_badge_svg(4)
assert svg.startswith("<svg") and svg.endswith("</svg>")
assert "level 4" in svg
assert C.level_color(4) in svg
# plain cap (no intent) → two-box badge, no third segment
assert "expected" not in svg and "gap?" not in svg
# operator-specified (phase lvl5): NOTHING but the level on the badge no annotation
# segment of any kind, whatever the rung situation.
assert "expected" not in svg and "gap?" not in svg and "skip" not in svg
def test_badge_svg_differentiates_intentional_vs_unintentional_skip():
# an intentional (declared) skip capped the climb → muted "expected" third segment
exp = C.level_badge_svg(2, "L3 backup/restore N/A", "intentional")
assert "level 2" in exp and "expected" in exp and C.EXPECT_COLOR in exp
assert "gap?" not in exp
# an unintentional skip (not declared) → amber "gap?" third segment
gap = C.level_badge_svg(2, "L3 backup/restore N/A", "unintentional")
assert "level 2" in gap and "gap?" in gap and C.GAP_COLOR in gap
assert "expected" not in gap
def test_badge_svg_level5():
svg = C.level_badge_svg(5)
assert "level 5" in svg and "#3fb950" in svg
def test_skip_rows_intentional_and_unintentional():
def test_skip_rows_intentional_and_unverified():
html_out = C._skip_rows(
{"intentional": {"backup_restore": "no persistent data"}, "unintentional": ["functional"]}
)
# intentional skip: labelled row (muted green) + the reason on its own line
assert "intentional skip" in html_out and C.SKIP_GREEN in html_out
assert "backup/restore" in html_out and "no persistent data" in html_out
# unintentional skip: amber row + prompt to declare/add coverage
assert "unintentional skip" in html_out and C.GAP_COLOR in html_out
assert "functional" in html_out and "EXPECTED_NA" in html_out
# unverified rung: amber row + the blocks-the-level explanation
assert "unverified" in html_out and C.GAP_COLOR in html_out
assert "functional" in html_out and "cannot rise above" in html_out
def test_skip_rows_lint_label_known():
html_out = C._skip_rows({"intentional": {}, "unintentional": ["lint"]})
assert ">lint<" in html_out.replace("</b>", "<") # rung label renders, not a KeyError
def test_skip_rows_empty_when_no_skips():
@ -83,22 +101,68 @@ def test_skip_rows_empty_when_no_skips():
def test_card_html_reports_level_verbatim():
html = C.render_card_html(_data(level=2, cap="L3 backup/restore (data integrity) N/A"))
html = C.render_card_html(_data(level=2))
assert "uptime-kuma" in html
assert "1.23.0" in html
# the level shown is exactly what was passed (no recompute)
assert ">2<" in html
assert "L3 backup/restore" in html
assert "level 2 of 5" in html
assert C.level_color(2) in html
def test_card_html_shows_stage_and_test_marks():
def test_card_html_no_cap_language():
html = C.render_card_html(_data())
assert "capped" not in html and "cap_reason" not in html
assert "level 5 of 5" in html
def test_card_html_old_schema1_artifact_renders():
# history compatibility: a pre-lvl5 results.json (4-rung ladder, cap fields, "na" statuses)
# renders without KeyError and shows ITS OWN ladder height (no retroactive relabeling).
old = {
"schema": 1,
"recipe": "legacy",
"version": "0.9",
"level": 4,
"level_cap_reason": "",
"level_cap_rung": None,
"rungs": {
"install": "pass",
"upgrade": "pass",
"backup_restore": "pass",
"functional": "pass",
},
"skips": {"intentional": {}, "unintentional": []},
"flags": {"clean_teardown": True, "no_secret_leak": True},
"screenshot": None,
"stages": [],
}
html = C.render_card_html(old)
assert "legacy" in html
assert "level 4 of 4" in html # the old top, not 5
assert "capped" not in html
def test_card_html_shows_stage_and_test_marks_incl_lint():
html = C.render_card_html(_data())
assert "install" in html and "custom" in html
assert "abra recipe lint" in html
assert "test_serving" in html and "test_broken" in html
assert C.STATUS_MARK["pass"] in html and C.STATUS_MARK["fail"] in html
def test_card_html_unver_stage_mark_renders():
d = _data()
d["stages"][2] = {
"name": "lint",
"status": "unver",
"tests": [{"name": "abra recipe lint", "status": "unver", "ms": 0, "message": "timed out"}],
}
html = C.render_card_html(d)
assert C.STATUS_MARK["unver"] in html
assert C.STATUS_COLOR["unver"] in html
def test_card_html_flags_rendered():
html = C.render_card_html(_data())
assert "clean teardown" in html and "no secret leak" in html

View File

@ -28,7 +28,6 @@ def _row(**kw):
"ref": "db9a9502",
"version": "db9a95024e9d",
"level": 4,
"level_cap_reason": "",
"has_screenshot": True,
"flags": {"clean_teardown": True, "no_secret_leak": True},
"finished": 0,
@ -40,7 +39,7 @@ def _row(**kw):
def test_level_color_ramp_and_fallback():
assert dashboard.level_color(0) == "#e5534b"
assert dashboard.level_color(6) == "#3fb950"
assert dashboard.level_color(5) == "#3fb950" # full 5-rung climb (phase lvl5)
assert dashboard.level_color(4) == "#a0b93f"
assert dashboard.level_color(99) == "#8b949e"
assert dashboard.level_color(None) == "#8b949e"
@ -61,20 +60,12 @@ def test_overview_grid_mirrors_results():
def test_overview_never_greener_than_data():
# A failed run at level 0 must show level 0 + the failure pill — never a green/high level.
out = dashboard.render_overview(
[
_row(
status="failure",
level=0,
has_screenshot=False,
flags={},
level_cap_reason="L1 install FAILED",
)
]
[_row(status="failure", level=0, has_screenshot=False, flags={})]
)
assert "level 0" in out
assert dashboard.level_color(0) in out # red
assert dashboard._COLORS["failure"] in out
assert "level 4" not in out and "level 5" not in out and "level 6" not in out
assert "level 4" not in out and "level 5" not in out
assert "no screenshot" in out # placeholder, no broken image
@ -104,7 +95,6 @@ def test_build_row_projects_results(monkeypatch):
lambda n: {
"version": "1.2.3",
"level": 2,
"level_cap_reason": "cap",
"screenshot": "screenshot.png",
"flags": {"clean_teardown": True},
},
@ -123,6 +113,38 @@ def test_build_row_projects_results(monkeypatch):
assert r["url"].endswith("/cc-ci/7")
def test_build_row_old_schema1_artifact_renders(monkeypatch):
# History compatibility (phase lvl5): pre-lvl5 results.json still carries cap fields and a
# 4-rung ladder — it must project + render without KeyError, level shown VERBATIM (no
# retroactive relabeling), and the old cap text simply isn't resurfaced anywhere.
monkeypatch.setattr(
dashboard,
"_results_for",
lambda n: {
"schema": 1,
"version": "0.9.1",
"level": 2,
"level_cap_reason": "L3 backup/restore (data integrity) N/A",
"level_cap_rung": "backup_restore",
"screenshot": "screenshot.png",
"flags": {"clean_teardown": True, "no_secret_leak": True},
},
)
b = {
"number": 11,
"status": "success",
"event": "custom",
"params": {"RECIPE": "legacy", "REF": "abc123"},
"finished": 5,
}
r = dashboard._build_row(b)
out = dashboard.render_overview([r])
assert "level 2" in out and dashboard.level_color(2) in out
assert "N/A" not in out and "capped" not in out # cap language gone from the surface
hist = dashboard.render_history("legacy", [r])
assert "L2" in hist
def test_build_row_degrades_without_results(monkeypatch):
# No results.json (e.g. an old run): grid still renders from Drone fields, level absent.
monkeypatch.setattr(dashboard, "_results_for", lambda n: {})

View File

@ -1,8 +1,14 @@
"""Unit tests for the Phase-3 level ladder (harness.level), plan-phase3-results-ux.md §4.1 / R1.
"""Unit tests for the level ladder (harness.level) — phase lvl5 semantics.
Pure function — no I/O. Proves the YunoHost gap-caps-the-level semantics, including the U0 gate
acceptance: a recipe that climbs through L4 reports 4, and one that fails at L2 is capped at 1
(the level just below the failed rung). Run cold with: cc-ci-run -m pytest tests/unit/test_level.py -q
Pure function — no I/O. Proves the operator-decided rule (plan-phase-lvl5-lint-rung.md,
DECISIONS.md phase lvl5):
level = max i such that rung_i == "pass" and every rung j < i is "pass" or "skip"
— a real FAIL blocks, an UNVERIFIED rung blocks exactly like a fail, an INTENTIONAL skip is
climbed past. Includes the mission's four worked examples verbatim, and the old N/A cases
(single-published-version recipe, non-backup-capable recipe) now climbing past their former
caps. Run cold with: cc-ci-run -m pytest tests/unit/test_level.py -q
"""
from __future__ import annotations
@ -19,69 +25,115 @@ def _rungs(
upgrade="pass",
backup_restore="pass",
functional="pass",
lint="pass",
):
return {
"install": install,
"upgrade": upgrade,
"backup_restore": backup_restore,
"functional": functional,
"lint": lint,
}
# ---- the ladder: four essential rungs, top is L4 (functional) ----
# ---- the ladder: five essential rungs, top is L5 (lint) ----
def test_full_clean_climb_to_L4():
# All four essential rungs pass → L4 (the top; integration/recipe-local are optional, not leveled).
lvl, reason = L.compute_level(_rungs())
assert lvl == 4
assert reason == ""
def test_full_clean_climb_is_L5():
assert L.compute_level(_rungs()) == 5
def test_fails_at_L2_capped_at_L1():
# GATE: upgrade fails → capped at L1 even though higher rungs would pass.
lvl, reason = L.compute_level(_rungs(upgrade="fail", backup_restore="pass", functional="pass"))
assert lvl == 1
assert "L2" in reason and "FAILED" in reason
def test_ladder_is_five_rungs_lint_on_top():
assert L.RUNGS == ("install", "upgrade", "backup_restore", "functional", "lint")
assert "lint" in L.RUNG_LABEL[5]
# ---- L0 / install ----
# ---- mission worked examples (operator Q&A 2026-06-11, verbatim) ----
def test_mission_example_fail_blocks():
# install ✔, upgrade ✘, backup ✔, functional ✔, lint ✔ → level 1 (fail blocks).
assert L.compute_level(_rungs(upgrade="fail")) == 1
def test_mission_example_intentional_skip_climbs():
# install ✔, upgrade ✔, backup skip (not capable), functional ✔, lint ✔ → level 5
# (previously capped at 2 — the confusing part the operator removed).
assert L.compute_level(_rungs(backup_restore="skip")) == 5
def test_mission_example_unverified_blocks():
# install ✔, upgrade ✔, backup UNVER (harness error), functional ✔, lint ✔ → level 2
# (we cannot claim what we didn't check).
assert L.compute_level(_rungs(backup_restore="unver")) == 2
def test_mission_example_unverified_top_rung_not_earned():
# all four ✔, lint unver (abra missing) → level 4.
assert L.compute_level(_rungs(lint="unver")) == 4
# ---- blocking semantics ----
def test_install_fail_is_L0():
lvl, reason = L.compute_level(_rungs(install="fail"))
assert lvl == 0
assert "L1" in reason and "FAILED" in reason
assert L.compute_level(_rungs(install="fail")) == 0
# ---- gap-caps semantics: a higher pass can't rescue a lower gap ----
def test_install_unver_is_L0():
assert L.compute_level(_rungs(install="unver")) == 0
def test_higher_pass_does_not_rescue_lower_na():
# backup/restore N/A (stateless app) caps at L2 even though functional would pass.
lvl, reason = L.compute_level(_rungs(backup_restore="na", functional="pass"))
assert lvl == 2
assert "L3" in reason and "N/A" in reason
def test_higher_pass_never_rescues_a_fail():
# everything above a failed rung is dead, however green.
assert L.compute_level(_rungs(upgrade="fail", backup_restore="pass", functional="pass")) == 1
def test_upgrade_na_caps_at_L1():
# only one published version → no upgrade possible → N/A caps at L1 (upgrade is essential).
lvl, reason = L.compute_level(_rungs(upgrade="na"))
assert lvl == 1
assert "L2" in reason and "N/A" in reason
def test_lint_fail_blocks_at_4():
assert L.compute_level(_rungs(lint="fail")) == 4
def test_functional_na_caps_at_L3():
# no recipe-specific functional tests → functional N/A caps at L3.
lvl, reason = L.compute_level(_rungs(functional="na"))
assert lvl == 3
assert "L4" in reason and "N/A" in reason
def test_unver_blocks_even_after_a_skip():
# skip at L2 is climbed past, but the unver at L3 still blocks → level 1.
assert L.compute_level(_rungs(upgrade="skip", backup_restore="unver")) == 1
def test_functional_fail_caps_at_L3():
lvl, reason = L.compute_level(_rungs(functional="fail"))
assert lvl == 3
assert "L4" in reason and "FAILED" in reason
# ---- intentional-skip climbing (the de-cap) ----
def test_single_version_recipe_climbs_past_upgrade_skip():
# old rule: upgrade N/A capped at L1. New rule: skip is climbed past → full climb 5.
assert L.compute_level(_rungs(upgrade="skip")) == 5
def test_stateless_recipe_climbs_past_backup_skip_to_lint():
assert L.compute_level(_rungs(upgrade="skip", backup_restore="skip")) == 5
def test_skip_does_not_count_as_pass():
# ALL skips → nothing passed → level 0 (a skip climbs, but never earns).
assert (
L.compute_level(
_rungs(
install="skip",
upgrade="skip",
backup_restore="skip",
functional="skip",
lint="skip",
)
)
== 0
)
def test_skip_then_pass_earns_the_higher_rung():
# skip at L4, pass at L5 → level 5 (the skip below doesn't stop the climb).
assert L.compute_level(_rungs(functional="skip")) == 5
def test_trailing_skip_keeps_last_pass():
# passes up to L3, skips above → level stays 3 (skips never raise).
assert L.compute_level(_rungs(functional="skip", lint="skip")) == 3
# ---- input validation ----
@ -89,7 +141,7 @@ def test_functional_fail_caps_at_L3():
def test_invalid_status_raises():
bad = _rungs()
bad["functional"] = "passed" # not in the vocabulary
bad["functional"] = "na" # the OLD vocabulary is no longer valid — every N/A is classified
try:
L.compute_level(bad)
except ValueError:
@ -97,6 +149,16 @@ def test_invalid_status_raises():
raise AssertionError("expected ValueError on invalid rung status")
def test_missing_rung_raises():
bad = _rungs()
del bad["lint"]
try:
L.compute_level(bad)
except ValueError:
return
raise AssertionError("expected ValueError on missing rung")
# ---- helpers: backup_restore_status ----
@ -104,8 +166,8 @@ def test_backup_restore_status_pass():
assert L.backup_restore_status("pass", "pass", True) == "pass"
def test_backup_restore_status_not_capable_is_na():
assert L.backup_restore_status("skip", "skip", False) == "na"
def test_backup_restore_status_not_capable_is_intentional_skip():
assert L.backup_restore_status("skip", "skip", False) == "skip"
def test_backup_restore_status_fail_on_either():
@ -113,16 +175,20 @@ def test_backup_restore_status_fail_on_either():
assert L.backup_restore_status("fail", "pass", True) == "fail"
def test_backup_restore_partial_is_na():
# backup-capable but restore didn't run cleanly (not pass, not fail) → cannot claim L3
assert L.backup_restore_status("pass", "skip", True) == "na"
def test_backup_restore_partial_is_unverified():
# backup-capable but restore didn't run cleanly (not pass, not fail) → cannot claim L3,
# and the non-run is NOT intentional → unver (blocks the level above it).
assert L.backup_restore_status("pass", "skip", True) == "unver"
assert L.backup_restore_status(None, None, True) == "unver"
# ---- helpers: tier_to_rung ----
def test_tier_to_rung_mapping():
def test_tier_to_rung_mapping_defaults_unverified():
assert L.tier_to_rung("pass") == "pass"
assert L.tier_to_rung("fail") == "fail"
assert L.tier_to_rung("skip") == "na"
assert L.tier_to_rung(None) == "na"
# no intentionality information here — a non-run is unver; derive_rungs upgrades to "skip"
# only on a declared/structural fact, never the other way.
assert L.tier_to_rung("skip") == "unver"
assert L.tier_to_rung(None) == "unver"

196
tests/unit/test_lint.py Normal file
View File

@ -0,0 +1,196 @@
"""Unit tests for the L5 lint executor (harness.lint) — phase lvl5.
Covers the table parser + classifier against real abra-0.13 output shapes (probed on the CI
host 2026-06-11, JOURNAL-lvl5), and run_lint's never-raise / never-silent-pass guarantees via
a fake-PATH `script` shim (no real abra needed). Run cold:
cc-ci-run -m pytest tests/unit/test_lint.py -q
"""
from __future__ import annotations
import os
import stat
import subprocess
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lint as L # noqa: E402
# Realistic abra lint table rows, as captured on cc-ci: abra renders HEAVY box-drawing
# verticals (┃ U+2503) — the parser must match those, not just the light │.
TABLE_OK = (
"┏━━━━━━┳━━━━━━┓\r\n"
"┃ R001 ┃ compose config has expected version ┃ warn ┃ ✅ ┃ - ┃ ensure ┃\r\n"
"┃ R015 ┃ long secret names ┃ warn ┃ ❌ ┃ - ┃ reduce ┃\r\n"
"┃ R008 ┃ .env.sample provided ┃ error ┃ ✅ ┃ - ┃ create ┃\r\n"
"┃ R014 ┃ only annotated tags used for recipe version ┃ error ┃ ✅ ┃ - ┃ retag ┃\r\n"
"┗━━━━━━┻━━━━━━┛\r\n"
"WARN secret session_secret is longer than 12 characters\r\n"
)
# The light-vertical variant must parse identically (defensive: abra theme/version drift).
TABLE_OK_LIGHT = TABLE_OK.replace("", "")
TABLE_R014_FAIL = (
TABLE_OK.replace(
"┃ R014 ┃ only annotated tags used for recipe version ┃ error ┃ ✅",
"┃ R014 ┃ only annotated tags used for recipe version ┃ error ┃ ❌",
)
+ "WARN critical errors present in hedgedoc config\r\n"
)
TABLE_SKIPPED_ERROR = TABLE_OK.replace(
"┃ R014 ┃ only annotated tags used for recipe version ┃ error ┃ ✅ ┃ - ┃",
"┃ R014 ┃ only annotated tags used for recipe version ┃ error ┃ ❌ ┃ skipped ┃",
)
# ---- parse_table ----
def test_parse_table_rows_and_marks():
rows = L.parse_table(TABLE_OK)
by = {r["rule"]: r for r in rows}
assert set(by) == {"R001", "R015", "R008", "R014"}
assert by["R001"]["severity"] == "warn" and by["R001"]["satisfied"]
assert by["R015"]["severity"] == "warn" and not by["R015"]["satisfied"]
assert by["R014"]["severity"] == "error" and by["R014"]["satisfied"]
assert not any(r["skipped"] for r in rows)
def test_parse_table_strips_ansi():
rows = L.parse_table("\x1b[1m" + TABLE_OK + "\x1b[0m")
assert len(rows) == 4
def test_parse_table_light_verticals_too():
assert L.parse_table(TABLE_OK_LIGHT) == L.parse_table(TABLE_OK)
def test_parse_table_garbage_is_empty():
assert L.parse_table("FATA something exploded\r\n") == []
assert L.parse_table("") == []
# ---- classify ----
def test_classify_pass_with_warn_misses_only():
# warn-severity ❌ (R015) does NOT fail the rung — only error-severity rules do.
assert L.classify(0, TABLE_OK) == ("pass", "", [])
def test_classify_error_rule_fails():
status, detail, failed = L.classify(0, TABLE_R014_FAIL)
assert status == "fail"
assert failed == ["R014"]
assert "R014" in detail
def test_classify_skipped_error_rule_does_not_fail_but_sentinel_guards():
# a skipped error rule isn't counted as failed by the parser, but abra's own sentinel line
# (if present) still forces fail — the classifier never out-greens abra.
status, _, failed = L.classify(0, TABLE_SKIPPED_ERROR)
assert failed == []
assert status == "pass"
status2, detail2, _ = L.classify(
0, TABLE_SKIPPED_ERROR + "WARN critical errors present in x config\r\n"
)
assert status2 == "fail"
assert "critical errors" in detail2
def test_classify_rc0_without_table_is_unver():
# rc=0 but nothing parseable → cannot claim pass.
assert L.classify(0, "weird output")[0] == "unver"
def test_classify_content_fata_is_fail():
out = "FATA unable to validate recipe: .env.sample for x couldn't be read\r\n"
status, detail, _ = L.classify(1, out)
assert status == "fail"
assert "unable to validate recipe" in detail
def test_classify_environment_fata_is_unver():
out = "FATA unable to fetch tags in /x: repository not found: Not found.\r\n"
status, detail, _ = L.classify(1, out)
assert status == "unver"
assert "fetch tags" in detail
def test_classify_did_not_run_is_unver():
assert L.classify(None, "")[0] == "unver"
# ---- run_lint: never raises, never silently passes ----
def _mkrecipe(tmp_path):
repo = tmp_path / "abra" / "recipes" / "fakerec"
repo.mkdir(parents=True)
(repo / "compose.yml").write_text("version: '3.8'\n")
for cmd in (
["git", "init", "-q"],
["git", "add", "."],
["git", "-c", "user.email=t@t", "-c", "user.name=t", "commit", "-qm", "x"],
):
subprocess.run(cmd, cwd=repo, check=True)
return repo
def _shim(tmp_path, body):
"""Drop a fake `script` executable on PATH (run_lint invokes `script -qec "abra ..."`)."""
bindir = tmp_path / "bin"
bindir.mkdir(exist_ok=True)
sh = bindir / "script"
sh.write_text("#!/bin/sh\n" + body)
sh.chmod(sh.stat().st_mode | stat.S_IEXEC)
return str(bindir)
def test_run_lint_pass_via_shim(tmp_path, monkeypatch):
_mkrecipe(tmp_path)
monkeypatch.setenv("ABRA_DIR", str(tmp_path / "abra"))
out = TABLE_OK.replace("\r\n", "\\n")
monkeypatch.setenv(
"PATH", _shim(tmp_path, f'printf "{out}"\nexit 0\n') + os.pathsep + os.environ["PATH"]
)
res = L.run_lint("fakerec", None, str(tmp_path / "artifacts"))
assert res["status"] == "pass"
txt = (tmp_path / "artifacts" / "lint.txt").read_text()
assert "abra recipe lint -n fakerec" in txt and "R001" in txt
def test_run_lint_fail_via_shim(tmp_path, monkeypatch):
_mkrecipe(tmp_path)
monkeypatch.setenv("ABRA_DIR", str(tmp_path / "abra"))
out = TABLE_R014_FAIL.replace("\r\n", "\\n")
monkeypatch.setenv(
"PATH", _shim(tmp_path, f'printf "{out}"\nexit 0\n') + os.pathsep + os.environ["PATH"]
)
res = L.run_lint("fakerec", None, str(tmp_path / "artifacts"))
assert res["status"] == "fail"
assert res["rules_failed"] == ["R014"]
def test_run_lint_missing_recipe_is_unver_not_raise(tmp_path, monkeypatch):
monkeypatch.setenv("ABRA_DIR", str(tmp_path / "abra-none"))
res = L.run_lint("no-such-recipe", None, str(tmp_path / "artifacts"))
assert res["status"] == "unver"
assert res["detail"]
# lint.txt still written with the failure context (loud, never silent)
assert (tmp_path / "artifacts" / "lint.txt").exists()
def test_run_lint_abra_blowup_is_unver(tmp_path, monkeypatch):
_mkrecipe(tmp_path)
monkeypatch.setenv("ABRA_DIR", str(tmp_path / "abra"))
monkeypatch.setenv(
"PATH",
_shim(tmp_path, 'echo "FATA inappropriate ioctl for device"\nexit 1\n')
+ os.pathsep
+ os.environ["PATH"],
)
res = L.run_lint("fakerec", None, None)
assert res["status"] == "unver"

View File

@ -1,7 +1,8 @@
"""Unit tests for Phase-3 results assembly (harness.results), plan-phase3-results-ux.md §4.2 / R1/R3.
"""Unit tests for results assembly (harness.results) — phase lvl5 semantics.
Covers JUnit parsing, stage roll-up, the tier→rung derivation (the documented mapping the level
depends on), and full results.json assembly incl. the U0 gate cases. Pure / tmp-file only. Run cold:
Covers JUnit parsing, stage roll-up, the tier→rung derivation (the SINGLE place every N/A source
is classified intentional-skip vs unverified — the table in DECISIONS.md phase lvl5), the L5 lint
rung wiring, and full results.json assembly. Pure / tmp-file only. Run cold:
cc-ci-run -m pytest tests/unit/test_results.py -q
"""
@ -27,6 +28,8 @@ JUNIT_MIXED = """<?xml version="1.0"?>
<testcase classname="tests.y" name="test_skipped" time="0"><skipped message="no deps"/></testcase>
</testsuite></testsuites>"""
LINT_PASS = {"status": "pass", "detail": "", "rules_failed": []}
def _write(tmp_path, name, content):
p = tmp_path / name
@ -90,7 +93,7 @@ def test_collect_stages_synthesizes_when_no_junit():
assert len(stages[0]["tests"]) == 1
# ---- derive_rungs: the documented mapping ----
# ---- derive_rungs: the documented N/A-classification mapping (DECISIONS.md phase lvl5) ----
def _results(**kw):
@ -105,34 +108,113 @@ def _results(**kw):
return base
def test_derive_rungs_full_climb_four_essential():
rungs = R.derive_rungs(_results(), backup_capable=True, has_custom=True)
# only the four essential rungs — integration/recipe-local are optional, not produced here.
def test_derive_rungs_full_climb_five_rungs():
rungs = R.derive_rungs(
_results(), backup_capable=True, has_upgrade_target=True, lint_status="pass"
)
# the five essential rungs — integration/recipe-local are optional, not produced here.
assert rungs == {
"install": "pass",
"upgrade": "pass",
"backup_restore": "pass",
"functional": "pass",
"lint": "pass",
}
def test_derive_rungs_stateless_backup_and_functional_na():
def test_derive_rungs_structural_skips_are_intentional():
# single published version (tier skipped, no upgrade target) + not backup-capable →
# both rungs are INTENTIONAL skips, not unverified.
rungs = R.derive_rungs(
_results(backup="skip", restore="skip", custom="skip"),
_results(upgrade="skip", backup="skip", restore="skip"),
backup_capable=False,
has_custom=False,
has_upgrade_target=False,
lint_status="pass",
)
assert rungs["backup_restore"] == "na"
assert rungs["functional"] == "na"
assert rungs["upgrade"] == "skip"
assert rungs["backup_restore"] == "skip"
assert "integration" not in rungs and "recipe_local" not in rungs
def test_derive_rungs_functional_fail():
rungs = R.derive_rungs(_results(custom="fail"), backup_capable=True, has_custom=True)
def test_derive_rungs_upgrade_skip_with_target_is_unverified():
# the tier skipped although an upgrade target exists (e.g. install failed → downstream
# skipped): NOT structural → unver.
rungs = R.derive_rungs(
_results(install="fail", upgrade="skip", backup="skip", restore="skip", custom="skip"),
backup_capable=True,
has_upgrade_target=True,
lint_status="pass",
)
assert rungs["install"] == "fail"
assert rungs["upgrade"] == "unver"
assert rungs["backup_restore"] == "unver"
assert rungs["functional"] == "unver"
def test_derive_rungs_missing_tier_is_unverified():
# a tier excluded from the run entirely (dev CCCI_STAGES escape) → no result key → unver,
# never an intentional skip (the recipe didn't declare anything).
res = {"install": "pass"}
rungs = R.derive_rungs(res, backup_capable=True, has_upgrade_target=True, lint_status="pass")
assert rungs["upgrade"] == "unver"
assert rungs["backup_restore"] == "unver"
assert rungs["functional"] == "unver"
def test_derive_rungs_expected_na_declares_intentional():
# EXPECTED_NA turns a non-run rung into an intentional skip (declared source).
rungs = R.derive_rungs(
_results(custom="skip"),
backup_capable=True,
has_upgrade_target=True,
expected_na={"functional": "no functional surface"},
lint_status="pass",
)
assert rungs["functional"] == "skip"
def test_derive_rungs_no_custom_tests_defaults_unverified():
# absent functional coverage with NO declaration is a gap → unver (conservative default).
rungs = R.derive_rungs(
_results(custom="skip"), backup_capable=True, has_upgrade_target=True, lint_status="pass"
)
assert rungs["functional"] == "unver"
def test_derive_rungs_expected_na_never_overrides_a_real_result():
# a declaration cannot soften an exercised rung: fail stays fail.
rungs = R.derive_rungs(
_results(custom="fail"),
backup_capable=True,
has_upgrade_target=True,
expected_na={"functional": "declared"},
lint_status="pass",
)
assert rungs["functional"] == "fail"
# ---- build_results: end-to-end incl level + flags ----
def test_derive_rungs_lint_never_skips():
# lint has NO intentional-skip escape hatch: pass/fail from the executor, anything else
# (None, "unver", junk) → unver — even if a recipe tries to declare it away.
for status, want in (("pass", "pass"), ("fail", "fail"), ("unver", "unver"), (None, "unver")):
rungs = R.derive_rungs(
_results(),
backup_capable=True,
has_upgrade_target=True,
expected_na={"lint": "nope"},
lint_status=status,
)
assert rungs["lint"] == want, status
def test_derive_rungs_functional_fail():
rungs = R.derive_rungs(
_results(custom="fail"), backup_capable=True, has_upgrade_target=True, lint_status="pass"
)
assert rungs["functional"] == "fail"
# ---- build_results: end-to-end incl level + lint + flags ----
def test_build_results_level_and_flags(tmp_path):
@ -163,17 +245,75 @@ def test_build_results_level_and_flags(tmp_path):
clean_teardown=True,
no_secret_leak=True,
finished_ts=1234.0,
lint=LINT_PASS,
)
# all four essential rungs pass → full climb to L4 (the top), no cap
assert data["level"] == 4
assert data["level_cap_reason"] == ""
# all five essential rungs pass → full climb to L5; no cap concept anywhere.
assert data["schema"] == 2
assert data["level"] == 5
assert "level_cap_reason" not in data and "level_cap_rung" not in data
assert data["recipe"] == "hedgedoc"
assert data["ref"] == "deadbeefcafe"
assert data["flags"] == {"clean_teardown": True, "no_secret_leak": True}
assert [s["name"] for s in data["stages"]] == ["install", "custom"]
# lint appears as a synthetic stage so the card's table carries all five rungs.
assert [s["name"] for s in data["stages"]] == ["install", "custom", "lint"]
assert data["lint"] == {"status": "pass", "detail": "", "rules_failed": []}
def test_build_results_capped_at_L1_on_upgrade_fail(tmp_path):
def test_build_results_lint_fail_blocks_at_4(tmp_path):
recs = [
{
"tier": "install",
"source": "generic",
"file": "g/test_install.py",
"rc": 0,
"junit": _write(tmp_path, "i.xml", JUNIT_PASS),
}
]
data = R.build_results(
recipe="x",
version=None,
pr="0",
ref=None,
records=recs,
results=_results(),
backup_capable=True,
clean_teardown=True,
no_secret_leak=True,
finished_ts=0.0,
lint={
"status": "fail",
"detail": "error rule(s) unsatisfied: R014",
"rules_failed": ["R014"],
},
)
assert data["level"] == 4
assert data["rungs"]["lint"] == "fail"
assert data["lint"]["rules_failed"] == ["R014"]
lint_stage = [s for s in data["stages"] if s["name"] == "lint"][0]
assert lint_stage["status"] == "fail"
assert "R014" in lint_stage["tests"][0]["message"]
def test_build_results_no_lint_given_is_unverified_never_pass(tmp_path):
# an old/lint-less caller must NEVER get a free L5: the rung derives as unver → level 4 max.
data = R.build_results(
recipe="x",
version=None,
pr="0",
ref=None,
records=[],
results=_results(),
backup_capable=True,
clean_teardown=True,
no_secret_leak=True,
finished_ts=0.0,
)
assert data["rungs"]["lint"] == "unver"
assert data["level"] == 4
assert "lint" in data["skips"]["unintentional"]
def test_build_results_level1_on_upgrade_fail(tmp_path):
recs = [
{
"tier": "install",
@ -194,12 +334,13 @@ def test_build_results_capped_at_L1_on_upgrade_fail(tmp_path):
clean_teardown=True,
no_secret_leak=True,
finished_ts=0.0,
lint=LINT_PASS,
)
assert data["level"] == 1
assert "L2" in data["level_cap_reason"]
assert data["rungs"]["upgrade"] == "fail"
# ---- skips: intentional (declared) vs unintentional (everything else skipped) ----
# ---- skips: intentional (declared/structural, with reason) vs unintentional (= unver) ----
def _rungs(**kw):
@ -208,24 +349,26 @@ def _rungs(**kw):
"upgrade": "pass",
"backup_restore": "pass",
"functional": "pass",
"lint": "pass",
}
base.update(kw)
return base
def test_skips_intentional_vs_unintentional():
rungs = _rungs(backup_restore="na", functional="na")
def test_skips_declared_reason_and_unverified_split():
rungs = _rungs(backup_restore="skip", functional="unver")
sk = R.skips(rungs, {"backup_restore": "stateless static server"})
# backup_restore is declared (intentional, with reason); functional skipped but not declared.
assert sk["intentional"] == {"backup_restore": "stateless static server"}
assert sk["unintentional"] == ["functional"]
def test_skips_none_declared_all_unintentional():
rungs = _rungs(backup_restore="na")
def test_skips_structural_reason_when_undeclared():
# a structural skip (derive_rungs) carries its structural reason even without EXPECTED_NA.
rungs = _rungs(upgrade="skip", backup_restore="skip")
sk = R.skips(rungs, None)
assert sk["intentional"] == {}
assert sk["unintentional"] == ["backup_restore"]
assert "only one published version" in sk["intentional"]["upgrade"]
assert "not backup-capable" in sk["intentional"]["backup_restore"]
assert sk["unintentional"] == []
def test_skips_declaration_only_counts_when_actually_skipped():
@ -236,9 +379,9 @@ def test_skips_declaration_only_counts_when_actually_skipped():
assert "backup_restore" not in sk["unintentional"]
def test_build_results_threads_expected_na(tmp_path):
# Mirrors custom-html-tiny post-change: install + a passing functional (custom) test, but no
# backup surface (backup_restore declared intentionally skipped).
def test_build_results_stateless_recipe_climbs(tmp_path):
# custom-html-tiny shape: no backup surface (declared), single published version, passing
# functional — formerly capped at L2 by the N/A; now climbs to L5 (the de-cap, mission §2).
recs = [
{
"tier": "install",
@ -261,23 +404,47 @@ def test_build_results_threads_expected_na(tmp_path):
pr="0",
ref=None,
records=recs,
results=_results(backup="skip", restore="skip"), # custom=pass (default) → functional pass
backup_capable=False, # no backupbot label → backup_restore skipped (N/A)
results=_results(upgrade="skip", backup="skip", restore="skip"),
backup_capable=False, # no backupbot label → structural intentional skip
has_upgrade_target=False, # single published version → structural intentional skip
clean_teardown=True,
no_secret_leak=True,
finished_ts=0.0,
lint=LINT_PASS,
expected_na={"backup_restore": "stateless static file server"},
)
# backup_restore skip still caps at L2 (never inflates) — even though functional passes above it,
# the skip caps the climb — but it's the declared (intentional) rung that capped.
assert data["level"] == 2
assert "L3" in data["level_cap_reason"]
assert data["level_cap_rung"] == "backup_restore"
assert data["rungs"]["functional"] == "pass"
assert data["level"] == 5 # skips are climbed past; nothing was inflated to get here
assert data["rungs"] == {
"install": "pass",
"upgrade": "skip",
"backup_restore": "skip",
"functional": "pass",
"lint": "pass",
}
assert data["skips"]["intentional"]["backup_restore"] == "stateless static file server"
assert (
data["skips"]["unintentional"] == []
) # backup_restore declared; functional passed → clean
assert "only one published version" in data["skips"]["intentional"]["upgrade"]
assert data["skips"]["unintentional"] == []
def test_build_results_unverified_backup_blocks(tmp_path):
# synthesized tier abort: backup-capable but the tiers never produced a result → unver → the
# level stays below the unverified rung (mission worked example #3).
data = R.build_results(
recipe="x",
version=None,
pr="0",
ref=None,
records=[],
results=_results(backup="skip", restore="skip"),
backup_capable=True,
clean_teardown=True,
no_secret_leak=True,
finished_ts=0.0,
lint=LINT_PASS,
)
assert data["rungs"]["backup_restore"] == "unver"
assert data["level"] == 2
assert data["skips"]["unintentional"] == ["backup_restore"]
def test_build_results_threads_customization(tmp_path):
@ -310,6 +477,7 @@ def test_build_results_threads_customization(tmp_path):
"clean_teardown": True,
"no_secret_leak": True,
"finished_ts": 0.0,
"lint": LINT_PASS,
}
assert R.build_results(**kwargs, customization=cust)["customization"] == cust
assert R.build_results(**kwargs)["customization"] is None