From 819000417bfff13a6b5530d671970862a5c19f2d Mon Sep 17 00:00:00 2001
From: mfowler <mfowler.email@protonmail.com>
Date: Tue, 16 Jun 2026 00:07:43 +0000
Subject: [PATCH] feat: benchmark builder-adversary-deferred (4-phase incl
 review); limit-detect across all roots

---
 analyze.py           | 13 ++++---------
 engine               |  2 +-
 plans/calc/review.md | 23 +++++++++++++++++++++++
 run-harness-bench.sh |  4 ++++
 4 files changed, 32 insertions(+), 10 deletions(-)
 create mode 100644 plans/calc/review.md
diff --git a/analyze.py b/analyze.py
index 0005850..3e89532 100755
--- a/analyze.py
+++ b/analyze.py
@@ -30,16 +30,11 @@ for line in open(DATA):
 # flag runs whose watchdog log shows a usage-limit hit — their duration (and thus tok/sec) is
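-# inflated by the idle pause, even though the token total is unaffected. Look in the newest campaign
-# run-root (repos are kept).
+# inflated by the idle pause, even though the token total is unaffected. Look in every campaign/solo
+# run-root under /tmp (repos are kept), not just the newest one.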
-_camp = sorted(glob.glob("/tmp/ao-campaign-*"))
-_camp_root = _camp[-1] if _camp else ""
-_solo = sorted(glob.glob("/tmp/ao-solo-*"))
-_solo_root = _solo[-1] if _solo else ""
 def _limit_hit(v, rep):
-    pats = []
-    if _camp_root:
-        pats.append(f"{_camp_root}/{v}/r{rep}/.ao-state/*watchdog*.log")   # campaign layout
-    if v == "builder-solo" and _solo_root:
-        pats.append(f"{_solo_root}/r{rep}/.ao-state/*watchdog*.log")        # solo layout (no variant subdir)
+    # search ALL run roots (a variant/rep may live in any campaign root; solo has its own layout)
+    pats = [f"/tmp/ao-campaign-*/{v}/r{rep}/.ao-state/*watchdog*.log"]
+    if v == "builder-solo":
+        pats.append(f"/tmp/ao-solo-*/r{rep}/.ao-state/*watchdog*.log")
     for pat in pats:
         for wl in glob.glob(pat):
             try:
diff --git a/engine b/engine
index c6c7ce8..90375f0 160000
--- a/engine
+++ b/engine
@@ -1 +1 @@
-Subproject commit c6c7ce8640f36d9cae58d3f320b3922963c6809a
+Subproject commit 90375f004edc3d08afebc2eeed9826707a6b07e4
diff --git a/plans/calc/review.md b/plans/calc/review.md
new file mode 100644
index 0000000..e30758f
--- /dev/null
+++ b/plans/calc/review.md
@@ -0,0 +1,23 @@
+# Phase `review` — comprehensive deferred verification
+
+No new features. The Builder has self-certified `lex`, `parse`, and `eval`, and the whole calculator
+is now in place. The Adversary now performs its **one comprehensive cold verification** of the
+entire build — the single adversary gate in the run.
+
+## Definition of Done
+
+- **D1 — full cold re-verify.** From a FRESH clone, the Adversary re-runs **every DoD item from every
+  prior phase** (tokenizer, parser AST shape, evaluator + CLI) and confirms each passes.
+- **D2 — full suite green.** `python -m unittest` passes, 0 failures.
+- **D3 — cross-feature break-it.** Hunt interactions across lex→parse→eval that a per-gate view
+  misses: nested unary + parens (`-(-(1+2))`→3), precedence chains (`2+3*4-5/5`→13), error
+  propagation lexer→evaluator (`1 @ 2`, `1/0`, `(1+` all error cleanly), whitespace + floats + parens
+  together, CLI exit codes for valid vs invalid input. File every defect found.
+- **D4 — findings cleared.** Every finding is fixed by the Builder and re-verified PASS by the
+  Adversary; no `## VETO` is left standing.
+
+## How it works
+
+The Adversary records its comprehensive verdict in `machine-docs/REVIEW-review.md`
+(`review(all): PASS`, or findings with repro). The Builder fixes anything found, then writes
+`## DONE` to `machine-docs/STATUS-review.md` only after the Adversary's comprehensive PASS.
diff --git a/run-harness-bench.sh b/run-harness-bench.sh
index c19dd84..1ec5fb2 100755
--- a/run-harness-bench.sh
+++ b/run-harness-bench.sh
@@ -63,6 +63,9 @@ PY
 
 gen_config() {  # <variant> <run> <prefix>
   local v="$1" run="$2" prefix="$3"
+  # builder-adversary-deferred adds a final comprehensive `review` phase (4 phases, not 3)
+  local review_phase=""
+  [ "$v" = "builder-adversary-deferred" ] && review_phase="  { id = \"review\", plan = \"$PLANS/review.md\", status = \"STATUS-review.md\" },"
   cat > "$run/agents.toml" <<EOF
 [watchdog]
 signal_interval = 15
@@ -118,6 +121,7 @@ phases = [
   { id = "lex",   plan = "$PLANS/lex.md",   status = "STATUS-lex.md" },
   { id = "parse", plan = "$PLANS/parse.md", status = "STATUS-parse.md" },
   { id = "eval",  plan = "$PLANS/eval.md",  status = "STATUS-eval.md" },
+$review_phase
 ]
 EOF
 }