From 819000417bfff13a6b5530d671970862a5c19f2d Mon Sep 17 00:00:00 2001 From: mfowler Date: Tue, 16 Jun 2026 00:07:43 +0000 Subject: [PATCH] feat: benchmark builder-adversary-deferred (4-phase incl review); limit-detect across all roots --- analyze.py | 13 ++++--------- engine | 2 +- plans/calc/review.md | 23 +++++++++++++++++++++++ run-harness-bench.sh | 4 ++++ 4 files changed, 32 insertions(+), 10 deletions(-) create mode 100644 plans/calc/review.md diff --git a/analyze.py b/analyze.py index 0005850..3e89532 100755 --- a/analyze.py +++ b/analyze.py @@ -30,16 +30,11 @@ for line in open(DATA): # flag runs whose watchdog log shows a usage-limit hit — their duration (and thus tok/sec) is # inflated by the idle pause, even though the token total is unaffected. Look in the newest campaign # run-root (repos are kept). -_camp = sorted(glob.glob("/tmp/ao-campaign-*")) -_camp_root = _camp[-1] if _camp else "" -_solo = sorted(glob.glob("/tmp/ao-solo-*")) -_solo_root = _solo[-1] if _solo else "" def _limit_hit(v, rep): - pats = [] - if _camp_root: - pats.append(f"{_camp_root}/{v}/r{rep}/.ao-state/*watchdog*.log") # campaign layout - if v == "builder-solo" and _solo_root: - pats.append(f"{_solo_root}/r{rep}/.ao-state/*watchdog*.log") # solo layout (no variant subdir) + # search ALL run roots (a variant/rep may live in any campaign root; solo has its own layout) + pats = [f"/tmp/ao-campaign-*/{v}/r{rep}/.ao-state/*watchdog*.log"] + if v == "builder-solo": + pats.append(f"/tmp/ao-solo-*/r{rep}/.ao-state/*watchdog*.log") for pat in pats: for wl in glob.glob(pat): try: diff --git a/engine b/engine index c6c7ce8..90375f0 160000 --- a/engine +++ b/engine @@ -1 +1 @@ -Subproject commit c6c7ce8640f36d9cae58d3f320b3922963c6809a +Subproject commit 90375f004edc3d08afebc2eeed9826707a6b07e4 diff --git a/plans/calc/review.md b/plans/calc/review.md new file mode 100644 index 0000000..e30758f --- /dev/null +++ b/plans/calc/review.md @@ -0,0 +1,23 @@ +# Phase `review` — comprehensive deferred verification + +No new features. The Builder has self-certified `lex`, `parse`, and `eval` and accumulated the whole +calculator. The Adversary now does its **one comprehensive cold-verification** of the entire build — +the single adversary gate in the run. + +## Definition of Done + +- **D1 — full cold re-verify.** From a FRESH clone, the Adversary re-runs **every DoD item from every + prior phase** (tokenizer, parser AST shape, evaluator + CLI) and confirms each passes. +- **D2 — full suite green.** `python -m unittest` passes, 0 failures. +- **D3 — cross-feature break-it.** Hunt interactions across lex→parse→eval that a per-gate view + misses: nested unary + parens (`-(-(1+2))`→3), precedence chains (`2+3*4-5/5`→13), error + propagation lexer→evaluator (`1 @ 2`, `1/0`, `(1+` all error cleanly), whitespace + floats + parens + together, CLI exit codes for valid vs invalid. File any defect. +- **D4 — findings cleared.** Every finding fixed by the Builder + re-verified PASS; no standing + `## VETO`. + +## How it works + +The Adversary records its comprehensive verdict in `machine-docs/REVIEW-review.md` +(`review(all): PASS`, or findings with repro). The Builder fixes anything found, then writes +`## DONE` to `machine-docs/STATUS-review.md` only after the Adversary's comprehensive PASS. diff --git a/run-harness-bench.sh b/run-harness-bench.sh index c19dd84..1ec5fb2 100755 --- a/run-harness-bench.sh +++ b/run-harness-bench.sh @@ -63,6 +63,9 @@ PY gen_config() { # local v="$1" run="$2" prefix="$3" + # builder-adversary-deferred adds a final comprehensive `review` phase (4 phases, not 3) + local review_phase="" + [ "$v" = "builder-adversary-deferred" ] && review_phase=" { id = \"review\", plan = \"$PLANS/review.md\", status = \"STATUS-review.md\" }," cat > "$run/agents.toml" <