Files
agent-orchestrator-benchmark/run-harness-bench.sh
mfowler 37032ee363 feat: campaign mode — repeat each variant N times, aggregate distributions
run-harness-bench.sh now loops VARIANTS × BENCH_REPEATS (default 5), writes each
run's row to RESULTS-campaign.md.data immediately (survives interruption), and
aggregates per-variant median/mean/min/max/stdev + median duration into
RESULTS-campaign.md. Frees each run's repo/transcripts after tallying.
2026-06-14 22:19:10 +00:00

229 lines
9.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# run-harness-bench.sh — FULL harness benchmark CAMPAIGN.
#
# For each variant, run the real agents.py up Builder/Adversary loop pair + watchdog through the
# multi-phase calculator (plans/calc/{lex,parse,eval}.md) to SEQUENCE-COMPLETE, REPEATS times, and
# aggregate per-variant token distributions (median/mean/min/max/stdev) so the run-to-run variance
# is measured, not guessed. Both loops on Sonnet. Tokens summed from each loop's session transcript.
#
# Every run appends one row to RESULTS-campaign.md.data immediately (partial results survive a kill).
# At the end, aggregates into RESULTS-campaign.md.
#
# Usage: ./run-harness-bench.sh [variant ...] (defaults: all 4 variants, 5 repeats each)
#        BENCH_REPEATS=3 ./run-harness-bench.sh builder-adversary builder-adversary-stateless
# ---- configuration ----
# Only unset-variable checking is enabled (no -e / pipefail).
set -u

# Paths are resolved relative to this script's own directory.
BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
ENGINE="$BENCH_DIR/engine"
PLANS="$BENCH_DIR/plans/calc"
AGENTS_PY="$ENGINE/agents.py"
MODEL="claude-sonnet-4-6"
RESULTS="$BENCH_DIR/RESULTS-campaign.md"
DATA="$RESULTS.data"

# Tunables (environment overrides): repeats per variant and per-run timeout.
REPEATS="${BENCH_REPEATS:-5}"
TIMEOUT="${BENCH_TIMEOUT:-1800}" # seconds per run (calc runs finish in ~10-15 min)
POLL=60

# Reject non-numeric / zero values up front: REPEATS feeds `seq` and TIMEOUT feeds
# an integer `[ -lt ]` test, so a bad value would otherwise yield zero runs or a
# broken wait loop and an empty report.
if ! [[ "$REPEATS" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: BENCH_REPEATS must be a positive integer (got '$REPEATS')" >&2
  exit 2
fi
if ! [[ "$TIMEOUT" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: BENCH_TIMEOUT must be a positive integer number of seconds (got '$TIMEOUT')" >&2
  exit 2
fi

# Per-campaign scratch root. Abort if it cannot be created: an empty RUNROOT
# would make every per-run path (including the rm -rf targets) root-relative.
RUNROOT="$(mktemp -d /tmp/ao-campaign-XXXXXX)" || {
  echo "error: could not create campaign run root under /tmp" >&2
  exit 1
}

# Git identity passed via -c so commits work without any global git config.
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)

# Variants to run; positional arguments replace the default list.
VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless builder-adversary-lean)
if [ $# -gt 0 ]; then
  VARIANTS=("$@")
fi
# Print a message on stdout, prefixed with the current UTC time as [HH:MM:SS].
# All arguments are joined with spaces into one line.
log() {
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
# trust_dir <dir>
# Let interactive claude (tmux) skip the workspace-trust dialog for <dir> by
# merging a project entry into ~/.claude.json. The file is rewritten via a temp
# file in the same directory plus os.replace, so it is swapped in one step.
# Returns python3's exit status (non-zero if the existing config is not valid JSON).
trust_dir() {
  python3 - "$1" <<'PY'
import json, os, sys, tempfile

cfg_path = os.path.expanduser("~/.claude.json")
project = sys.argv[1]

# A missing or empty config starts from {} instead of crashing. Malformed JSON
# still raises, so a damaged user config is never silently overwritten.
try:
    with open(cfg_path, encoding="utf-8") as fh:
        text = fh.read()
except FileNotFoundError:
    text = ""
cfg = json.loads(text) if text.strip() else {}

entry = cfg.setdefault("projects", {}).setdefault(project, {})
entry["hasTrustDialogAccepted"] = True
entry.setdefault("allowedTools", [])  # keep an existing tool list untouched
entry["hasCompletedProjectOnboarding"] = True

fd, tmp = tempfile.mkstemp(dir=os.path.dirname(cfg_path))
try:
    # Close (and therefore flush) the temp file before it replaces the config.
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        json.dump(cfg, fh, indent=2)
    os.replace(tmp, cfg_path)
except BaseException:
    # Do not leave a stray temp file next to the config on failure.
    try:
        os.unlink(tmp)
    except OSError:
        pass
    raise
PY
}
# collect_tokens <working-dir>
# transcript token sum for a working dir -> "in out cache_create cache_read"
# Reads every *.jsonl under ~/.claude/projects/<name>, where <name> is the working
# dir with '/' and '.' replaced by '-', and sums the usage counters of records
# whose "type" is "assistant". Prints "0 0 0 0" when no transcript is found.
collect_tokens() {
  python3 - "$1" <<'PY'
import glob, json, os, sys

workdir = sys.argv[1].rstrip("/")
project = workdir.replace("/", "-").replace(".", "-")
tdir = os.path.expanduser("~/.claude/projects/" + project)

FIELDS = ("input_tokens", "output_tokens",
          "cache_creation_input_tokens", "cache_read_input_tokens")
totals = [0, 0, 0, 0]

# glob.escape keeps glob metacharacters in the directory name literal.
for path in glob.glob(os.path.join(glob.escape(tdir), "*.jsonl")):
    with open(path, errors="ignore") as fh:
        for line in fh:
            try:
                rec = json.loads(line)
            except Exception:
                continue  # unparsable line: skip it
            # A line can be valid JSON without being an object; skip it rather
            # than crash and lose the whole tally for this run.
            if not isinstance(rec, dict) or rec.get("type") != "assistant":
                continue
            msg = rec.get("message")
            usage = msg.get("usage") if isinstance(msg, dict) else None
            if not isinstance(usage, dict):
                continue
            for i, key in enumerate(FIELDS):
                val = usage.get(key)
                # Missing, null, or non-integer counters count as 0.
                if isinstance(val, int) and not isinstance(val, bool):
                    totals[i] += val

print(*totals)
PY
}
# gen_config <variant> <run> <prefix>
# Writes <run>/agents.toml for one benchmark run: watchdog timings, defaults
# (session prefix, log dir <run>/.ao-state, model $MODEL), the claude backend,
# a builder agent in <run>/work and an adversary agent in <run>/work-adv, and a
# three-phase loop (lex, parse, eval) whose plans come from $PLANS and whose
# prompts come from $ENGINE/examples/<variant>/prompts.
gen_config() {
  local variant run_dir sess_prefix
  variant="$1"
  run_dir="$2"
  sess_prefix="$3"
  # Unquoted heredoc: shell variables are expanded, and each "\\" pair collapses to
  # a single backslash in the file that is written.
  cat <<EOF >"$run_dir/agents.toml"
[watchdog]
signal_interval = 15
heavy_interval = 60
limit_probe_fallback = 300
limit_reset_slack = 45
stall_grace = 180
[defaults]
session_prefix = "$sess_prefix"
log_dir = "$run_dir/.ao-state"
backend = "claude"
model = "$MODEL"
watch = "heal"
[backend.claude]
bin = "claude"
flags = "--dangerously-skip-permissions"
remote_control = true
supports_resume = true
prompt_delivery = "arg"
process_name = "claude"
submit_key = "Enter"
stall_idle = 300
active_re = "esc to interrupt|Running tool|⠇|⠙|· \\\\d+"
limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)"
fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified"
[[agent]]
name = "builder"
kind = "loop"
role = "builder"
dir = "$run_dir/work"
watch = "heal+stall"
[[agent]]
name = "adversary"
session = "${sess_prefix}adv"
kind = "loop"
role = "adversary"
dir = "$run_dir/work-adv"
watch = "heal+stall"
[loop]
state_file = "phase-idx"
resume_phase = true
auto_advance = true
done_marker = "## DONE"
kickoff_template = "$ENGINE/examples/$variant/prompts/kickoff.md"
roles_dir = "$ENGINE/examples/$variant/prompts"
handoff = { repo = "$run_dir/work", claim_pings = "adversary", review_pings = "builder", inboxes = ["ADVERSARY-INBOX.md", "BUILDER-INBOX.md"], claim_pattern = "^claim", review_pattern = "^review", state_subdir = "machine-docs" }
phases = [
{ id = "lex", plan = "$PLANS/lex.md", status = "STATUS-lex.md" },
{ id = "parse", plan = "$PLANS/parse.md", status = "STATUS-parse.md" },
{ id = "eval", plan = "$PLANS/eval.md", status = "STATUS-eval.md" },
]
EOF
}
# run_one <variant> <rep>
# Execute one benchmark run end to end and append its result row to $DATA.
#   1. Build a throw-away bare origin, a seed commit on main, and two clones
#      (work for the builder, work-adv for the adversary) under $RUNROOT/<variant>/r<rep>.
#   2. Pre-trust both clones, generate agents.toml, and start the loops (agents.py up).
#   3. Poll every $POLL seconds for .ao-state/SEQUENCE-COMPLETE, up to $TIMEOUT.
#   4. Stop the loops (agents.py down) and verify in work-adv: unit tests pass,
#      `calc.py '2+3*4'` prints 14, and no "## VETO" appears under machine-docs.
#   5. Sum both loops' transcript tokens and append one TSV row:
#      variant rep success builder adversary total duration_s
# Globals read: RUNROOT REPEATS GIT_ID AGENTS_PY TIMEOUT POLL DATA
# The duration column is the accumulated poll sleep time, so it is a multiple of $POLL.
run_one() {
  local v="$1" rep="$2"
  local run="$RUNROOT/$v/r$rep"

  # Session prefix: "c" + first 4 chars of the run-root suffix + first 3 chars
  # of a checksum of "<variant>.<rep>" + "-".
  local rtag prefix
  rtag=$(basename "$RUNROOT")
  rtag=${rtag##*-}
  prefix="c${rtag:0:4}$(echo "$v.$rep" | cksum | cut -c1-3)-"

  mkdir -p "$run"
  log "===== $v rep $rep/$REPEATS (prefix $prefix) ====="

  # --- repositories: bare origin <- seed commit, then one clone per loop ---
  git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main
  git "${GIT_ID[@]}" init -q -b main "$run/seed"
  (
    cd "$run/seed" \
      && mkdir -p machine-docs \
      && echo "# calc work repo" > README.md \
      && : > machine-docs/.gitkeep \
      && git "${GIT_ID[@]}" add -A \
      && git "${GIT_ID[@]}" commit -q -m "chore: seed" \
      && git "${GIT_ID[@]}" remote add origin "$run/origin.git" \
      && git "${GIT_ID[@]}" push -q -u origin main
  )
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
  local c
  for c in work work-adv; do
    # Persist the identity in each clone so git commands run inside it can commit.
    ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench )
  done
  trust_dir "$run/work"
  trust_dir "$run/work-adv"
  gen_config "$v" "$run" "$prefix"

  # --- start the loops; a failed start is reported instead of passing silently ---
  if ! python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1; then
    log "  WARNING: agents.py up exited non-zero for $v r$rep (see $run/up.log)"
  fi

  # --- wait for completion marker or timeout ---
  local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 finished="no"
  while [ "$t" -lt "$TIMEOUT" ]; do
    [ -f "$marker" ] && { finished="yes"; break; }
    sleep "$POLL"
    t=$((t + POLL))
  done
  python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1

  # --- verify the result from the adversary's clone, refreshed from origin ---
  ( cd "$run/work-adv" && git "${GIT_ID[@]}" pull -q --no-rebase origin main 2>/dev/null )
  local tests=no cli=no veto=no success=NO out
  # python3 (not bare `python`) to match the interpreter used everywhere else in
  # this script; hosts without a `python` alias would otherwise fail every run.
  ( cd "$run/work-adv" && python3 -m unittest -q ) >/dev/null 2>&1 && tests=yes
  out="$( cd "$run/work-adv" && python3 calc.py '2+3*4' 2>/dev/null )"
  [ "$out" = "14" ] && cli=yes
  grep -rqiE '##[[:space:]]*VETO' "$run/work-adv/machine-docs/" 2>/dev/null && veto=yes
  [ "$finished" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && [ "$veto" = no ] && success=YES

  # --- token tally (builder transcript = work, adversary transcript = work-adv) ---
  local bi bo bcc bcr ai ao acc acr
  read -r bi bo bcc bcr <<<"$(collect_tokens "$run/work")"
  read -r ai ao acc acr <<<"$(collect_tokens "$run/work-adv")"
  local btok=$((bi + bo + bcc + bcr)) atok=$((ai + ao + acc + acr))
  local total=$((btok + atok))

  # append one row immediately (survives a kill): variant rep success btok atok total dur
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' "$v" "$rep" "$success" "$btok" "$atok" "$total" "$t" >>"$DATA"
  log " -> $v r$rep: success=$success total=$total dur=${t}s"

  # free disk: drop this run's git data (both clones' .git, the bare origin, the seed).
  # The work trees and the session transcripts under ~/.claude/projects are NOT removed here.
  rm -rf -- "${run:?}/work/.git" "${run:?}/work-adv/.git" "${run:?}/origin.git" "${run:?}/seed" 2>/dev/null
}
# ---- collect ----
# Start the campaign with an empty data file. The column names go to a sidecar
# "$DATA.hdr" file rather than into $DATA itself.
: >"$DATA"
printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
  variant rep success builder adversary total duration_s >"$DATA.hdr"

# Run every variant REPEATS times, one run at a time.
for v in "${VARIANTS[@]}"; do
  for rep in $(seq 1 "$REPEATS"); do
    run_one "$v" "$rep"
  done
done
# ---- aggregate ----
# aggregate_results <data.tsv> <out.md> <model> <repeats> <engine-rev> <runroot>
# Read the 7-column TSV rows written by run_one and write a Markdown report:
# per-variant token statistics (median/mean/min/max/population stdev, max/min spread),
# per-variant median duration, and the raw rows. Lines that do not have exactly
# 7 tab-separated fields are ignored; rows with non-integer numeric fields are
# skipped with a warning on stderr. Prints "wrote <out.md>" on success.
aggregate_results() {
  python3 - "$@" <<'PY'
import sys
import statistics as st

data, out, model, reps, eng, runroot = sys.argv[1:7]

rows = []
with open(data, encoding="utf-8") as fh:
    for lineno, line in enumerate(fh, 1):
        p = line.rstrip("\n").split("\t")
        if len(p) != 7:
            continue
        v, rep, ok, b, a, tot, dur = p
        try:
            rows.append((v, int(rep), ok, int(b), int(a), int(tot), int(dur)))
        except ValueError:
            # One damaged row must not cost the whole report.
            print(f"warning: skipping malformed row {lineno} in {data}", file=sys.stderr)

# Variants in first-seen order.
variants = list(dict.fromkeys(r[0] for r in rows))


def fmt(n):
    return f"{n:,}"


def stats(xs):
    if not xs:
        return None
    return dict(n=len(xs), min=min(xs), max=max(xs), mean=int(st.mean(xs)),
                median=int(st.median(xs)),
                stdev=int(st.pstdev(xs)) if len(xs) > 1 else 0)


# The report contains non-ASCII characters, so write UTF-8 regardless of locale.
with open(out, "w", encoding="utf-8") as f:
    # The title reflects the actual repeat count instead of a hard-coded 5.
    f.write(f"# Full-harness benchmark — campaign ({reps}× per variant)\n\n")
    f.write(f"Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
            f"to SEQUENCE-COMPLETE, **{reps}× per variant**. Both loops on **{model}**. Engine `{eng}`. "
            f"Tokens summed from each loop's session transcript.\n\n")
    f.write("## Per-variant total tokens (across repeats)\n\n")
    f.write("| variant | runs | success | median | mean | min | max | stdev | spread (max/min) |\n")
    f.write("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|\n")
    for v in variants:
        tots = [r[5] for r in rows if r[0] == v]
        oks = sum(1 for r in rows if r[0] == v and r[2] == "YES")
        s = stats(tots)
        # Spread is undefined when the smallest run recorded 0 tokens.
        spread = f"{s['max']/s['min']:.2f}x" if s and s['min'] else "—"
        f.write(f"| {v} | {s['n']} | {oks}/{s['n']} | {fmt(s['median'])} | {fmt(s['mean'])} | "
                f"{fmt(s['min'])} | {fmt(s['max'])} | {fmt(s['stdev'])} | {spread} |\n")
    f.write("\n## Median duration\n\n| variant | median dur (s) |\n|---|--:|\n")
    for v in variants:
        durs = [r[6] for r in rows if r[0] == v]
        f.write(f"| {v} | {int(st.median(durs)) if durs else '—'} |\n")
    f.write("\n## All runs (raw)\n\n| variant | rep | success | builder | adversary | total | dur(s) |\n")
    f.write("|---|:--:|:--:|--:|--:|--:|--:|\n")
    for r in rows:
        f.write(f"| {r[0]} | {r[1]} | {r[2]} | {fmt(r[3])} | {fmt(r[4])} | {fmt(r[5])} | {r[6]} |\n")
    f.write(f"\n_Run root: `{runroot}`. Raw data: `RESULTS-campaign.md.data`._\n")
print("wrote", out)
PY
}

# Engine revision falls back to "unknown" when $ENGINE is not a git checkout.
aggregate_results "$DATA" "$RESULTS" "$MODEL" "$REPEATS" \
  "$(git -C "$ENGINE" rev-parse --short HEAD 2>/dev/null || echo unknown)" "$RUNROOT"
# Final banner: blank line, marker, the generated report, and the run root path.
printf '\n%s\n' "===== CAMPAIGN DONE ====="
cat "$RESULTS"
printf '%s\n' "Run root: $RUNROOT"