feat: campaign mode — repeat each variant N times, aggregate distributions

run-harness-bench.sh now loops VARIANTS × BENCH_REPEATS (default 5), appends each run's row to RESULTS-campaign.md.data as soon as that run finishes (so partial results survive an interruption), and aggregates per-variant median/mean/min/max/stdev of total tokens plus median duration into RESULTS-campaign.md. After tallying, it frees each run's git data (the clones' .git directories, the bare origin, and the seed repo).
2026-06-14 22:19:10 +00:00
parent b46dca003c
commit 37032ee363
2 changed files with 95 additions and 97 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,3 +4,5 @@ __pycache__/
 *.pyc
 *.tmp
 RESULTS-harness.md.tmp
+RESULTS-campaign.md.data
+RESULTS-campaign.md.data.hdr
--- a/run-harness-bench.sh
+++ b/run-harness-bench.sh
@ -1,13 +1,16 @@
 #!/usr/bin/env bash
-# run-harness-bench.sh — FULL harness benchmark (real agents.py up loop, not headless single-pass).
+# run-harness-bench.sh — FULL harness benchmark CAMPAIGN.
 #
-# For each variant (builder-adversary, builder-adversary-min) this stands up a real Builder/Adversary
-# loop pair + watchdog over a shared work repo and lets them run autonomously through the multi-phase
-# calculator (plans/calc/{lex,parse,eval}.md) to SEQUENCE-COMPLETE. Both loops on Sonnet. Then it
-# clocks the tokens each loop used (summed from the Claude Code session transcripts) and re-runs the
-# final Definition-of-Done itself.
+# For each variant, run the real agents.py up Builder/Adversary loop pair + watchdog through the
+# multi-phase calculator (plans/calc/{lex,parse,eval}.md) to SEQUENCE-COMPLETE, REPEATS times, and
+# aggregate per-variant token distributions (median/mean/min/max/stdev) so the run-to-run variance
+# is measured, not guessed. Both loops on Sonnet. Tokens summed from each loop's session transcript.
 #
-# Long, autonomous, nondeterministic (N=1). Per-variant wall-clock timeout below. Usage: ./run-harness-bench.sh
+# Every run appends one row to RESULTS-campaign.md.data immediately (partial results survive a kill).
+# At the end, aggregates into RESULTS-campaign.md.
+#
+# Usage:  ./run-harness-bench.sh         (defaults: all 4 variants, 5 repeats each)
+#         BENCH_REPEATS=3 ./run-harness-bench.sh builder-adversary builder-adversary-stateless
 set -u

 BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
@ -15,18 +18,19 @@ ENGINE="$BENCH_DIR/engine"
 PLANS="$BENCH_DIR/plans/calc"
 AGENTS_PY="$ENGINE/agents.py"
 MODEL="claude-sonnet-4-6"
-RUNROOT="$(mktemp -d /tmp/ao-harness-XXXXXX)"   # no dot → clean transcript-dir mapping
-RESULTS="$BENCH_DIR/RESULTS-harness.md"
-TIMEOUT="${BENCH_TIMEOUT:-3000}"                # seconds per variant
+RUNROOT="$(mktemp -d /tmp/ao-campaign-XXXXXX)"
+RESULTS="$BENCH_DIR/RESULTS-campaign.md"
+DATA="$RESULTS.data"
+REPEATS="${BENCH_REPEATS:-5}"
+TIMEOUT="${BENCH_TIMEOUT:-1800}"        # seconds per run (calc runs finish in ~10-15 min)
 POLL=60
 GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
 VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless builder-adversary-lean)
-[ $# -gt 0 ] && VARIANTS=("$@")   # run only the variants named on the command line, if any
+[ $# -gt 0 ] && VARIANTS=("$@")

-# pre-trust a work dir in ~/.claude.json so interactive claude (in tmux) skips the workspace-trust
-# dialog (--dangerously-skip-permissions only skips it for redirected/headless output). Atomic merge:
-# add only this dir's entry, preserve everything else (the file is shared global state).
-trust_dir() {
+log() { echo "[$(date -u +%H:%M:%S)] $*"; }
+
+trust_dir() {  # let interactive claude (tmux) skip the workspace-trust dialog; atomic merge
  python3 - "$1" <<'PY'
 import json,os,sys,tempfile
 p=os.path.expanduser("~/.claude.json"); d=sys.argv[1]
@ -38,14 +42,11 @@ json.dump(cfg,open(tmp,"w"),indent=2); os.replace(tmp,p)
 PY
 }

-log() { echo "[$(date -u +%H:%M:%S)] $*"; }
-
-# transcript token sum for an agent's working dir -> "in out cache_create cache_read"
+# transcript token sum for a working dir -> "in out cache_create cache_read"
 collect_tokens() {
  python3 - "$1" <<'PY'
 import json,sys,os,glob
-wd=sys.argv[1].rstrip('/')
-name=wd.replace('/','-').replace('.','-')        # '/tmp/x' -> '-tmp-x'
+wd=sys.argv[1].rstrip('/'); name=wd.replace('/','-').replace('.','-')
 tdir=os.path.expanduser("~/.claude/projects/"+name)
 ti=to=tcc=tcr=0
 for f in glob.glob(tdir+"/*.jsonl"):
@ -121,112 +122,107 @@ phases = [
 EOF
 }

-declare -A SUM_TOK SUM_OK SUM_PHASES
-
-run_variant() {
-  local v="$1"
-  local run="$RUNROOT/$v"
-  local rtag; rtag=$(basename "$RUNROOT"); rtag=${rtag##*-}        # mktemp suffix → unique per invocation
-  local prefix="b${rtag}$(echo "$v" | cksum | cut -c1-2)-"          # unique per (run, variant); avoids tmux collisions
+run_one() {  # <variant> <rep>
+  local v="$1" rep="$2"
+  local run="$RUNROOT/$v/r$rep"
+  local rtag; rtag=$(basename "$RUNROOT"); rtag=${rtag##*-}
+  local prefix="c${rtag:0:4}$(echo "$v.$rep" | cksum | cut -c1-3)-"
  mkdir -p "$run"
-  log "===== $v  (run dir: $run, prefix: $prefix) ====="
+  log "===== $v  rep $rep/$REPEATS  (prefix $prefix) ====="

-  # shared bare 'origin' + two clones
  git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
-  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main   # so clones check out 'main' (we push main, not master)
+  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main
  git "${GIT_ID[@]}" init -q -b main "$run/seed"
-  ( cd "$run/seed" && mkdir -p machine-docs && echo "# calc work repo" > README.md \
-      && : > machine-docs/.gitkeep \
-      && git "${GIT_ID[@]}" add -A && git "${GIT_ID[@]}" commit -q -m "chore: seed work repo" \
+  ( cd "$run/seed" && mkdir -p machine-docs && echo "# calc work repo" > README.md && : > machine-docs/.gitkeep \
+      && git "${GIT_ID[@]}" add -A && git "${GIT_ID[@]}" commit -q -m "chore: seed" \
      && git "${GIT_ID[@]}" remote add origin "$run/origin.git" && git "${GIT_ID[@]}" push -q -u origin main )
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
  for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done
-  trust_dir "$run/work"; trust_dir "$run/work-adv"   # let interactive claude skip the trust dialog
-
+  trust_dir "$run/work"; trust_dir "$run/work-adv"
  gen_config "$v" "$run" "$prefix"

-  log "[$v] agents.py up …"
  python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1
-  log "[$v] up done; status:"; python3 "$AGENTS_PY" status --config "$run/agents.toml" 2>&1 | sed 's/^/    /'

-  # poll for SEQUENCE-COMPLETE or timeout
  local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 done="no"
  while [ $t -lt "$TIMEOUT" ]; do
-    if [ -f "$marker" ]; then done="yes"; break; fi
+    [ -f "$marker" ] && { done="yes"; break; }
    sleep "$POLL"; t=$((t+POLL))
-    local idx commits
-    idx="$(cat "$run/.ao-state/state/phase-idx" 2>/dev/null || echo '?')"
-    commits="$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo '?')"
-    log "[$v] t=${t}s phase-idx=$idx origin-commits=$commits"
  done
-  log "[$v] loop finished (sequence-complete=$done after ${t}s); tearing down"
  python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1

-  # final DoD check from the adversary's clone (pull latest first)
  ( cd "$run/work-adv" && git "${GIT_ID[@]}" pull -q --no-rebase origin main 2>/dev/null )
  local tests=no cli=no
-  ( cd "$run/work-adv" && python -m unittest -q ) >"$run/final-unittest.txt" 2>&1 && tests=yes
+  ( cd "$run/work-adv" && python -m unittest -q ) >/dev/null 2>&1 && tests=yes
  local out; out="$( cd "$run/work-adv" && python calc.py '2+3*4' 2>/dev/null )"
  [ "$out" = "14" ] && cli=yes
-  local reviews; reviews="$(grep -rhoiE '(lex|parse|eval)/D[0-9]+:?\s*PASS' "$run/work-adv/machine-docs/" 2>/dev/null | sort -u | wc -l)"
-  # a standing veto is the "## VETO" marker (per the prompts) — NOT the word "veto" (matches "No veto")
-  local veto="no"; grep -rqiE '##[[:space:]]*VETO' "$run/work-adv/machine-docs/" 2>/dev/null && veto=yes
-  SUM_PHASES[$v]="$(cat "$run/.ao-state/state/phase-idx" 2>/dev/null || echo '?')"
-
+  local veto=no; grep -rqiE '##[[:space:]]*VETO' "$run/work-adv/machine-docs/" 2>/dev/null && veto=yes
  local success=NO
  [ "$done" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && [ "$veto" = no ] && success=YES
-  SUM_OK[$v]=$success

-  # tokens
  read -r bi bo bcc bcr <<<"$(collect_tokens "$run/work")"
  read -r ai ao acc acr <<<"$(collect_tokens "$run/work-adv")"
-  local btok=$((bi+bo+bcc+bcr)) atok=$((ai+ao+acc+acr)) vtok=$(( bi+bo+bcc+bcr + ai+ao+acc+acr ))
-  SUM_TOK[$v]=$vtok
+  local btok=$((bi+bo+bcc+bcr)) atok=$((ai+ao+acc+acr)) total=$(( bi+bo+bcc+bcr + ai+ao+acc+acr ))

-  {
-    echo "### $v"
-    echo "- **success:** $success  (sequence-complete=$done, tests=$tests, cli('2+3*4'→'$out')=$cli, gates-passed=$reviews, veto=$veto, final phase-idx=${SUM_PHASES[$v]})"
-    echo "- **builder loop:** in=$bi out=$bo cache_create=$bcc cache_read=$bcr → **${btok}** tok"
-    echo "- **adversary loop:** in=$ai out=$ao cache_create=$acc cache_read=$acr → **${atok}** tok"
-    echo "- **total:** **${vtok}** tokens"
-    echo
-  } >>"$RESULTS.tmp"
-  log "[$v] DONE success=$success tokens=$vtok gates-passed=$reviews"
+  # append one row immediately (survives a kill): variant rep success btok atok total dur
+  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' "$v" "$rep" "$success" "$btok" "$atok" "$total" "$t" >>"$DATA"
+  log "  -> $v r$rep: success=$success total=$total dur=${t}s"
+  # free disk now that tokens are tallied: drop the clones' .git dirs, the bare origin and the seed (work trees stay; session transcripts are not touched here)
+  rm -rf "$run/work/.git" "$run/work-adv/.git" "$run/origin.git" "$run/seed" 2>/dev/null
 }

-prompt_chars() { cat "$ENGINE/examples/$1/prompts/kickoff.md" "$ENGINE/examples/$1/prompts/$2.md" | wc -c | tr -d ' '; }
+# ---- collect ----
+: > "$DATA"
+printf 'variant\trep\tsuccess\tbuilder\tadversary\ttotal\tduration_s\n' > "$DATA.hdr"
+for v in "${VARIANTS[@]}"; do
+  for rep in $(seq 1 "$REPEATS"); do run_one "$v" "$rep"; done
+done

-: >"$RESULTS.tmp"
-for v in "${VARIANTS[@]}"; do run_variant "$v"; done
+# ---- aggregate: turn the per-run TSV rows in $DATA into the markdown report $RESULTS ----
+python3 - "$DATA" "$RESULTS" "$MODEL" "$REPEATS" "$(git -C "$ENGINE" rev-parse --short HEAD)" "$RUNROOT" <<'PY'
+import sys, statistics as st
+data, out, model, reps, eng, runroot = sys.argv[1:7]  # TSV path, report path, model id, repeats, engine short SHA, run root
+rows=[]
+for line in open(data):
+    p=line.rstrip("\n").split("\t")
+    if len(p)!=7: continue  # ignore any line that is not a complete 7-field row
+    v,rep,ok,b,a,tot,dur=p
+    rows.append((v,int(rep),ok,int(b),int(a),int(tot),int(dur)))
+variants=[]  # distinct variant names, in first-seen order
+for r in rows:
+    if r[0] not in variants: variants.append(r[0])
+def fmt(n): return f"{n:,}"  # thousands separators
+def stats(xs):  # summary of a list of ints (results truncated to int); None for an empty list
+    if not xs: return None
+    return dict(n=len(xs), min=min(xs), max=max(xs), mean=int(st.mean(xs)),
+                median=int(st.median(xs)), stdev=int(st.pstdev(xs)) if len(xs)>1 else 0)  # population stdev
+with open(out,"w") as f:
+    f.write(f"# Full-harness benchmark — campaign ({reps}× per variant)\n\n")  # title follows the configured repeat count
+    f.write(f"Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
+            f"to SEQUENCE-COMPLETE, **{reps}× per variant**. Both loops on **{model}**. Engine `{eng}`. "
+            f"Tokens summed from each loop's session transcript.\n\n")
+    f.write("## Per-variant total tokens (across repeats)\n\n")
+    f.write("| variant | runs | success | median | mean | min | max | stdev | spread (max/min) |\n")
+    f.write("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|\n")
+    for v in variants:
+        tots=[r[5] for r in rows if r[0]==v]
+        oks=sum(1 for r in rows if r[0]==v and r[2]=="YES")
+        s=stats(tots)
+        spread=f"{s['max']/s['min']:.2f}x" if s and s['min'] else "—"  # no ratio when min is 0
+        f.write(f"| {v} | {s['n']} | {oks}/{s['n']} | {fmt(s['median'])} | {fmt(s['mean'])} | "
+                f"{fmt(s['min'])} | {fmt(s['max'])} | {fmt(s['stdev'])} | {spread} |\n")
+    f.write("\n## Median duration\n\n| variant | median dur (s) |\n|---|--:|\n")
+    for v in variants:
+        durs=[r[6] for r in rows if r[0]==v]
+        f.write(f"| {v} | {int(st.median(durs)) if durs else '—'} |\n")
+    f.write("\n## All runs (raw)\n\n| variant | rep | success | builder | adversary | total | dur(s) |\n")
+    f.write("|---|:--:|:--:|--:|--:|--:|--:|\n")
+    for r in rows:
+        f.write(f"| {r[0]} | {r[1]} | {r[2]} | {fmt(r[3])} | {fmt(r[4])} | {fmt(r[5])} | {r[6]} |\n")
+    f.write(f"\n_Run root: `{runroot}`. Raw data: `RESULTS-campaign.md.data`._\n")
+print("wrote", out)
+PY

-{
-  echo "# Full-harness benchmark — prompt variants"
-  echo
-  echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the"
-  echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned"
-  echo "at \`$(git -C "$ENGINE" rev-parse --short HEAD)\`. Both loops on **$MODEL**. Per-variant timeout"
-  echo "${TIMEOUT}s. Tokens summed from the Claude Code session transcripts of each loop's clone."
-  echo
-  echo "## Static prompt size (chars: kickoff + role)"
-  echo "| version | builder | adversary |"
-  echo "|---|--:|--:|"
-  for v in "${VARIANTS[@]}"; do echo "| $v | $(prompt_chars "$v" builder) | $(prompt_chars "$v" adversary) |"; done
-  echo
-  echo "## Per-variant"
-  echo
-  cat "$RESULTS.tmp"
-  echo "## Summary"
-  echo "| version | success | total tokens |"
-  echo "|---|:--:|--:|"
-  for v in "${VARIANTS[@]}"; do echo "| $v | ${SUM_OK[$v]:-?} | ${SUM_TOK[$v]:-?} |"; done
-  echo
-  echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._"
-  echo "_Run dirs: \`$RUNROOT\`_"
-} >"$RESULTS"
-rm -f "$RESULTS.tmp"
-
-echo; echo "===== ALL DONE ====="
-for v in "${VARIANTS[@]}"; do echo "$v: success=${SUM_OK[$v]:-?} tokens=${SUM_TOK[$v]:-?}"; done
-echo "Results: $RESULTS"
-echo "Run dirs: $RUNROOT"
+echo; echo "===== CAMPAIGN DONE ====="
+cat "$RESULTS"
+echo "Run root: $RUNROOT"