Files
agent-orchestrator-benchmark/run-harness-bench.sh

249 lines
11 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# run-harness-bench.sh — FULL harness benchmark CAMPAIGN.
#
# For each variant, run the real agents.py up Builder/Adversary loop pair + watchdog through the
# multi-phase calculator (plans/calc/{lex,parse,eval}.md) to SEQUENCE-COMPLETE, REPEATS times, and
# aggregate per-variant token distributions (median/mean/min/max/stdev) so the run-to-run variance
# is measured, not guessed. Both loops on Sonnet. Tokens summed from each loop's session transcript.
#
# Every run appends one row to RESULTS-campaign.md.data immediately (partial results survive a kill).
# At the end, aggregates into RESULTS-campaign.md.
#
# Usage: ./run-harness-bench.sh (defaults: all 4 variants, 5 repeats each)
# BENCH_REPEATS=3 ./run-harness-bench.sh builder-adversary builder-adversary-stateless
set -u
BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
ENGINE="$BENCH_DIR/engine"
PLANS="$BENCH_DIR/plans/calc"
AGENTS_PY="$ENGINE/agents.py"
MODEL="claude-sonnet-4-6"
RUNROOT="$(mktemp -d /tmp/ao-campaign-XXXXXX)"
RESULTS="$BENCH_DIR/RESULTS-campaign.md"
DATA="$RESULTS.data"
REPEATS="${BENCH_REPEATS:-5}"
TIMEOUT="${BENCH_TIMEOUT:-1800}" # seconds per run (calc runs finish in ~10-15 min)
POLL=60
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless builder-adversary-lean)
[ $# -gt 0 ] && VARIANTS=("$@")
log() { echo "[$(date -u +%H:%M:%S)] $*"; }
trust_dir() { # let interactive claude (tmux) skip the workspace-trust dialog; atomic merge
python3 - "$1" <<'PY'
import json,os,sys,tempfile
p=os.path.expanduser("~/.claude.json"); d=sys.argv[1]
cfg=json.load(open(p))
e=cfg.setdefault("projects",{}).setdefault(d,{})
e["hasTrustDialogAccepted"]=True; e.setdefault("allowedTools",[]); e["hasCompletedProjectOnboarding"]=True
fd,tmp=tempfile.mkstemp(dir=os.path.dirname(p)); os.close(fd)
json.dump(cfg,open(tmp,"w"),indent=2); os.replace(tmp,p)
PY
}
# transcript token sum for a working dir -> "in out cache_create cache_read"
collect_tokens() {
python3 - "$1" <<'PY'
import json,sys,os,glob
wd=sys.argv[1].rstrip('/'); name=wd.replace('/','-').replace('.','-')
tdir=os.path.expanduser("~/.claude/projects/"+name)
ti=to=tcc=tcr=0
for f in glob.glob(tdir+"/*.jsonl"):
for line in open(f, errors="ignore"):
try: o=json.loads(line)
except Exception: continue
if o.get("type")=="assistant":
u=(o.get("message",{}) or {}).get("usage",{}) or {}
ti+=u.get("input_tokens",0) or 0; to+=u.get("output_tokens",0) or 0
tcc+=u.get("cache_creation_input_tokens",0) or 0; tcr+=u.get("cache_read_input_tokens",0) or 0
print(ti,to,tcc,tcr)
PY
}
gen_config() { # <variant> <run> <prefix>
local v="$1" run="$2" prefix="$3"
cat > "$run/agents.toml" <<EOF
[watchdog]
signal_interval = 15
heavy_interval = 60
limit_probe_fallback = 300
limit_reset_slack = 45
stall_grace = 180
[defaults]
session_prefix = "$prefix"
log_dir = "$run/.ao-state"
backend = "claude"
model = "$MODEL"
watch = "heal"
[backend.claude]
bin = "claude"
flags = "--dangerously-skip-permissions"
remote_control = true
supports_resume = true
prompt_delivery = "arg"
process_name = "claude"
submit_key = "Enter"
stall_idle = 300
active_re = "esc to interrupt|Running tool|⠇|⠙|· \\\\d+"
limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)"
fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified"
[[agent]]
name = "builder"
kind = "loop"
role = "builder"
dir = "$run/work"
watch = "heal+stall"
[[agent]]
name = "adversary"
session = "${prefix}adv"
kind = "loop"
role = "adversary"
dir = "$run/work-adv"
watch = "heal+stall"
[loop]
state_file = "phase-idx"
resume_phase = true
auto_advance = true
done_marker = "## DONE"
kickoff_template = "$ENGINE/examples/$v/prompts/kickoff.md"
roles_dir = "$ENGINE/examples/$v/prompts"
handoff = { repo = "$run/work", claim_pings = "adversary", review_pings = "builder", inboxes = ["ADVERSARY-INBOX.md", "BUILDER-INBOX.md"], claim_pattern = "^claim", review_pattern = "^review", state_subdir = "machine-docs" }
phases = [
{ id = "lex", plan = "$PLANS/lex.md", status = "STATUS-lex.md" },
{ id = "parse", plan = "$PLANS/parse.md", status = "STATUS-parse.md" },
{ id = "eval", plan = "$PLANS/eval.md", status = "STATUS-eval.md" },
]
EOF
}
run_one() { # <variant> <rep>
local v="$1" rep="$2"
local run="$RUNROOT/$v/r$rep"
local rtag; rtag=$(basename "$RUNROOT"); rtag=${rtag##*-}
local prefix="c${rtag:0:4}$(echo "$v.$rep" | cksum | cut -c1-3)-"
mkdir -p "$run"
log "===== $v rep $rep/$REPEATS (prefix $prefix) ====="
git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main
git "${GIT_ID[@]}" init -q -b main "$run/seed"
( cd "$run/seed" && mkdir -p machine-docs && echo "# calc work repo" > README.md && : > machine-docs/.gitkeep \
&& git "${GIT_ID[@]}" add -A && git "${GIT_ID[@]}" commit -q -m "chore: seed" \
&& git "${GIT_ID[@]}" remote add origin "$run/origin.git" && git "${GIT_ID[@]}" push -q -u origin main )
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done
trust_dir "$run/work"; trust_dir "$run/work-adv"
gen_config "$v" "$run" "$prefix"
python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1
local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 done="no"
while [ $t -lt "$TIMEOUT" ]; do
[ -f "$marker" ] && { done="yes"; break; }
sleep "$POLL"; t=$((t+POLL))
done
python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1
( cd "$run/work-adv" && git "${GIT_ID[@]}" pull -q --no-rebase origin main 2>/dev/null )
local tests=no cli=no
( cd "$run/work-adv" && python -m unittest -q ) >/dev/null 2>&1 && tests=yes
local out; out="$( cd "$run/work-adv" && python calc.py '2+3*4' 2>/dev/null )"
[ "$out" = "14" ] && cli=yes
# standing veto = an all-caps "## VETO <reason>" header; NOT "## Veto log" (a ledger header)
local veto=no; grep -rqE '^##[[:space:]]*VETO[[:space:]]' "$run/work-adv/machine-docs/" 2>/dev/null && veto=yes
local success=NO
[ "$done" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && [ "$veto" = no ] && success=YES
read -r bi bo bcc bcr <<<"$(collect_tokens "$run/work")"
read -r ai ao acc acr <<<"$(collect_tokens "$run/work-adv")"
local btok=$((bi+bo+bcc+bcr)) atok=$((ai+ao+acc+acr)) total=$(( bi+bo+bcc+bcr + ai+ao+acc+acr ))
local commits loc
commits=$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo 0)
loc=$(cat "$run/work-adv"/calc/*.py 2>/dev/null | grep -cve '^[[:space:]]*$')
# append one row immediately (survives a kill): variant rep success btok atok total dur commits loc
printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' "$v" "$rep" "$success" "$btok" "$atok" "$total" "$t" "$commits" "$loc" >>"$DATA"
log " -> $v r$rep: success=$success total=$total dur=${t}s commits=$commits loc=$loc"
# NOTE: repos are KEPT under "$run" (work/, work-adv/, origin.git) for later analysis — not deleted.
}
# ---- collect ----
# append-mode: data file is NOT wiped (clear manually for a fresh campaign)
printf 'variant\trep\tsuccess\tbuilder\tadversary\ttotal\tduration_s\tcommits\tloc\n' > "$DATA.hdr"
for v in "${VARIANTS[@]}"; do
for rep in $(seq 1 "$REPEATS"); do run_one "$v" "$rep"; done
done
# ---- aggregate ----
python3 - "$DATA" "$RESULTS" "$MODEL" "$REPEATS" "$(git -C "$ENGINE" rev-parse --short HEAD)" "$RUNROOT" <<'PY'
import sys, statistics as st
data, out, model, reps, eng, runroot = sys.argv[1:7]
rows=[]
for line in open(data):
p=line.rstrip("\n").split("\t")
if len(p)!=9: continue
v,rep,ok,b,a,tot,dur,commits,loc=p
rows.append(dict(v=v, rep=int(rep), ok=ok, builder=int(b), adversary=int(a),
total=int(tot), dur=int(dur), commits=int(commits), loc=int(loc)))
variants=[]
for r in rows:
if r["v"] not in variants: variants.append(r["v"])
def fmt(n): return f"{n:,}"
def stats(xs):
if not xs: return None
return dict(n=len(xs), min=min(xs), max=max(xs), mean=int(st.mean(xs)),
median=int(st.median(xs)), stdev=int(st.pstdev(xs)) if len(xs)>1 else 0)
def pearson(a,b):
if len(a)<2: return float('nan')
ma,mb=st.mean(a),st.mean(b)
den=(sum((x-ma)**2 for x in a)*sum((y-mb)**2 for y in b))**0.5
return (sum((x-ma)*(y-mb) for x,y in zip(a,b))/den) if den else float('nan')
ok_rows=[r for r in rows if r["ok"]=="YES"]
with open(out,"w") as f:
f.write("# Full-harness benchmark — campaign\n\n")
f.write(f"Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
f"to SEQUENCE-COMPLETE, **{reps}× per variant**. Both loops on **{model}**. Engine `{eng}`. "
f"Tokens summed from each loop's session transcript; commits = work-repo commit count; "
f"LOC = non-blank lines in `calc/*.py` (code + tests). Stats over SUCCESSFUL runs.\n\n")
f.write("## Per-variant — total tokens (successful runs)\n\n")
f.write("| variant | runs(ok) | median | mean | min | max | stdev | spread |\n")
f.write("|---|:--:|--:|--:|--:|--:|--:|--:|\n")
for v in variants:
tots=[r["total"] for r in ok_rows if r["v"]==v]
n=sum(1 for r in rows if r["v"]==v); s=stats(tots)
if not s: f.write(f"| {v} | 0/{n} | — | — | — | — | — | — |\n"); continue
spread=f"{s['max']/s['min']:.2f}x" if s['min'] else "—"
f.write(f"| {v} | {s['n']}/{n} | {fmt(s['median'])} | {fmt(s['mean'])} | {fmt(s['min'])} | "
f"{fmt(s['max'])} | {fmt(s['stdev'])} | {spread} |\n")
f.write("\n## Per-variant — medians (commits / LOC / duration)\n\n")
f.write("| variant | median commits | median LOC | median dur(s) |\n|---|--:|--:|--:|\n")
for v in variants:
sub=[r for r in ok_rows if r["v"]==v]
if not sub: f.write(f"| {v} | — | — | — |\n"); continue
f.write(f"| {v} | {int(st.median([r['commits'] for r in sub]))} | "
f"{int(st.median([r['loc'] for r in sub]))} | {int(st.median([r['dur'] for r in sub]))} |\n")
f.write("\n## Correlations with total tokens (pooled over all successful runs)\n\n")
f.write(f"n = {len(ok_rows)} successful runs.\n\n| tokens vs | Pearson r |\n|---|--:|\n")
for k,lab in [("dur","duration"),("commits","commits"),("loc","LOC (code+tests)")]:
f.write(f"| {lab} | {pearson([r['total'] for r in ok_rows],[r[k] for r in ok_rows]):+.2f} |\n")
f.write("\n## All runs (raw)\n\n")
f.write("| variant | rep | ok | builder | adversary | total | dur(s) | commits | LOC |\n")
f.write("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|\n")
for r in rows:
f.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {fmt(r['builder'])} | {fmt(r['adversary'])} | "
f"{fmt(r['total'])} | {r['dur']} | {r['commits']} | {r['loc']} |\n")
f.write(f"\n_Run root (repos kept here for analysis): `{runroot}`. Raw data: `RESULTS-campaign.md.data`._\n")
print("wrote", out)
PY
echo; echo "===== CAMPAIGN DONE ====="
cat "$RESULTS"
echo "Run root: $RUNROOT"