#!/usr/bin/env bash
# run-harness-bench.sh — FULL harness benchmark CAMPAIGN.
#
# For each variant, run the real `agents.py up` Builder/Adversary loop pair + watchdog through the
# multi-phase calculator (plans/calc/{lex,parse,eval}.md) to SEQUENCE-COMPLETE, REPEATS times, and
# aggregate per-variant token distributions (median/mean/min/max/stdev) so the run-to-run variance
# is measured, not guessed. Both loops on Sonnet. Tokens summed from each loop's session transcript.
#
# Every run appends one row to RESULTS-campaign.md.data immediately (partial results survive a kill).
# At the end, aggregates into RESULTS-campaign.md.
#
# Usage: ./run-harness-bench.sh                 (defaults: all 4 variants, 5 repeats each)
#        BENCH_REPEATS=3 ./run-harness-bench.sh builder-adversary builder-adversary-stateless
set -u

# Absolute directory of this script; every other path is derived from it so the
# campaign can be launched from any working directory.
BENCH_DIR="$(cd "$(dirname "$0")" && pwd)" || { echo "cannot resolve script directory" >&2; exit 1; }
ENGINE="$BENCH_DIR/engine"
PLANS="$BENCH_DIR/plans/calc"
AGENTS_PY="$ENGINE/agents.py"
MODEL="claude-sonnet-4-6"
RESULTS="$BENCH_DIR/RESULTS-campaign.md"
DATA="$RESULTS.data"
REPEATS="${BENCH_REPEATS:-5}"
TIMEOUT="${BENCH_TIMEOUT:-1800}" # seconds per run (calc runs finish in ~10-15 min)
POLL=60
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)

# Both knobs feed integer comparisons / seq later on; reject anything that is
# not a plain non-negative integer up front instead of failing mid-campaign.
case "$REPEATS" in
  '' | *[!0-9]*) echo "BENCH_REPEATS must be a non-negative integer (got '$REPEATS')" >&2; exit 2 ;;
esac
case "$TIMEOUT" in
  '' | *[!0-9]*) echo "BENCH_TIMEOUT must be a non-negative integer (got '$TIMEOUT')" >&2; exit 2 ;;
esac

# Scratch root holding every run's repos. If mktemp failed and we carried on,
# RUNROOT would be empty and the runs would be created directly under "/".
RUNROOT="$(mktemp -d /tmp/ao-campaign-XXXXXX)" || { echo "mktemp failed; cannot create run root" >&2; exit 1; }

# Variants to benchmark; positional arguments replace the default list.
VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless builder-adversary-lean)
if [ $# -gt 0 ]; then
  VARIANTS=("$@")
fi

# Print the arguments as one line on stdout, prefixed with the current UTC
# time in the form "[HH:MM:SS] ".
log() {
  local stamp
  stamp="$(date -u +%H:%M:%S)"
  printf '%s\n' "[$stamp] $*"
}

# Let interactive claude (tmux) skip the workspace-trust dialog for a directory.
# Arguments: $1 - directory path, used verbatim as the key under "projects".
# Merges three keys into that project's entry in ~/.claude.json
# (hasTrustDialogAccepted, allowedTools, hasCompletedProjectOnboarding) and
# keeps everything else in the file. The new content is written to a temp file
# in the same directory and swapped in with os.replace, so readers never see a
# half-written file; there is no locking, so concurrent writers can still lose
# each other's updates.
# Returns python3's exit status (non-zero if the existing file is not valid JSON,
# in which case the file is left untouched).
trust_dir() {
  python3 - "$1" <<'PY'
import json
import os
import sys
import tempfile

path = os.path.expanduser("~/.claude.json")
workdir = sys.argv[1]

# A missing config simply means no project has been registered yet: start empty.
# Any other read/parse error is fatal so a damaged file is never overwritten.
try:
    with open(path) as fh:
        cfg = json.load(fh)
except FileNotFoundError:
    cfg = {}

entry = cfg.setdefault("projects", {}).setdefault(workdir, {})
entry["hasTrustDialogAccepted"] = True
entry.setdefault("allowedTools", [])  # keep an existing allow-list as is
entry["hasCompletedProjectOnboarding"] = True

fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
try:
    with os.fdopen(fd, "w") as fh:
        json.dump(cfg, fh, indent=2)
    os.replace(tmp, path)
except BaseException:
    # Do not leave stray temp files next to the config when the write fails.
    try:
        os.unlink(tmp)
    except OSError:
        pass
    raise
PY
}

# transcript token sum for a working dir -> "in out cache_create cache_read"
# Arguments: $1 - working directory of one loop (a trailing "/" is ignored).
# Reads every *.jsonl under ~/.claude/projects/<name>, where <name> is the
# working dir with each "/" and "." replaced by "-" (presumably how claude names
# its per-project transcript dirs — confirm against the CLI version in use).
# Only records whose "type" is "assistant" are counted; their message.usage
# counters are added up. Lines that are not JSON, not JSON objects, or that
# lack a usable usage object are skipped. Prints "0 0 0 0" when nothing matches.
collect_tokens() {
  python3 - "$1" <<'PY'
import glob
import json
import os
import sys

FIELDS = (
    "input_tokens",
    "output_tokens",
    "cache_creation_input_tokens",
    "cache_read_input_tokens",
)

workdir = sys.argv[1].rstrip('/')
name = workdir.replace('/', '-').replace('.', '-')
tdir = os.path.expanduser("~/.claude/projects/" + name)

totals = [0] * len(FIELDS)
# glob.escape keeps characters such as "[" in the path from acting as patterns.
for path in glob.glob(glob.escape(tdir) + "/*.jsonl"):
    with open(path, errors="ignore") as fh:
        for line in fh:
            try:
                rec = json.loads(line)
            except Exception:
                continue
            # A line can be valid JSON without being an object (list, string, ...).
            if not isinstance(rec, dict) or rec.get("type") != "assistant":
                continue
            msg = rec.get("message")
            usage = msg.get("usage") if isinstance(msg, dict) else None
            if not isinstance(usage, dict):
                continue
            for i, key in enumerate(FIELDS):
                val = usage.get(key)
                if isinstance(val, int):  # null / missing / non-numeric count as 0
                    totals[i] += val
print(*totals)
PY
}

# Escape a value for use inside a TOML basic ("...") string: backslash first,
# then double quote. Prints the escaped value without a trailing newline.
_toml_escape() {
  local s="$1" bs='\' dq='"'
  # Quoted pattern/replacement keeps both characters literal on every bash version.
  s=${s//"$bs"/"$bs$bs"}
  s=${s//"$dq"/"$bs$dq"}
  printf '%s' "$s"
}

# Write the agents.py configuration for one run to <run>/agents.toml.
# Arguments: $1 - variant name (selects $ENGINE/examples/<variant>/prompts)
#            $2 - run directory (must already exist; work/ and work-adv/ live in it)
#            $3 - session prefix for this run
# Globals read: MODEL, ENGINE, PLANS.
# Every interpolated value is TOML-escaped so a quote or backslash in a path
# cannot corrupt the generated file. Returns cat's exit status.
gen_config() { # <variant> <run> <prefix>
  local run="$2"
  local t_v t_run t_prefix t_model t_engine t_plans
  t_v="$(_toml_escape "$1")"
  t_run="$(_toml_escape "$2")"
  t_prefix="$(_toml_escape "$3")"
  t_model="$(_toml_escape "$MODEL")"
  t_engine="$(_toml_escape "$ENGINE")"
  t_plans="$(_toml_escape "$PLANS")"
  # Unquoted heredoc: "\\\\d" below reaches the file as "\\d", i.e. the regex \d.
  cat > "$run/agents.toml" <<EOF
[watchdog]
signal_interval = 15
heavy_interval = 60
limit_probe_fallback = 300
limit_reset_slack = 45
stall_grace = 180

[defaults]
session_prefix = "$t_prefix"
log_dir = "$t_run/.ao-state"
backend = "claude"
model = "$t_model"
watch = "heal"

[backend.claude]
bin = "claude"
flags = "--dangerously-skip-permissions"
remote_control = true
supports_resume = true
prompt_delivery = "arg"
process_name = "claude"
submit_key = "Enter"
stall_idle = 300
active_re = "esc to interrupt|Running tool|⠇|⠙|· \\\\d+"
limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)"
fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified"

[[agent]]
name = "builder"
kind = "loop"
role = "builder"
dir = "$t_run/work"
watch = "heal+stall"

[[agent]]
name = "adversary"
session = "${t_prefix}adv"
kind = "loop"
role = "adversary"
dir = "$t_run/work-adv"
watch = "heal+stall"

[loop]
state_file = "phase-idx"
resume_phase = true
auto_advance = true
done_marker = "## DONE"
kickoff_template = "$t_engine/examples/$t_v/prompts/kickoff.md"
roles_dir = "$t_engine/examples/$t_v/prompts"
handoff = { repo = "$t_run/work", claim_pings = "adversary", review_pings = "builder", inboxes = ["ADVERSARY-INBOX.md", "BUILDER-INBOX.md"], claim_pattern = "^claim", review_pattern = "^review", state_subdir = "machine-docs" }
phases = [
  { id = "lex", plan = "$t_plans/lex.md", status = "STATUS-lex.md" },
  { id = "parse", plan = "$t_plans/parse.md", status = "STATUS-parse.md" },
  { id = "eval", plan = "$t_plans/eval.md", status = "STATUS-eval.md" },
]
EOF
}

# Run ONE benchmark repetition and append its result row to $DATA.
# Arguments: $1 - variant name, $2 - repetition number.
# Globals read: RUNROOT, REPEATS, TIMEOUT, POLL, GIT_ID, AGENTS_PY, DATA.
# Calls: log, trust_dir, gen_config, collect_tokens.
# Steps:
#   1. create a bare origin repo, seed it with one commit on main, and clone it
#      twice (work/ for the builder, work-adv/ for the adversary);
#   2. `agents.py up`, then poll every $POLL s for .ao-state/SEQUENCE-COMPLETE
#      until $TIMEOUT s have elapsed, then `agents.py down`;
#   3. in work-adv (after pulling main): run the unit tests, run the CLI on
#      '2+3*4' expecting 14, and look for a standing "## VETO" header;
#   4. sum transcript tokens for both loops and append one TSV row:
#      variant rep success builder adversary total duration commits loc.
# success is YES only if the marker appeared, tests pass, the CLI answers 14 and
# no veto stands. Setup failures are not fatal: the run is still recorded (as NO).
run_one() { # <variant> <rep>
  local v="$1" rep="$2"
  local run="$RUNROOT/$v/r$rep"

  # Session prefix: "c" + first 4 chars of the run-root suffix + 3 checksum
  # digits of "<variant>.<rep>", so sessions of different runs do not share names.
  local rtag digest prefix
  rtag="$(basename "$RUNROOT")"
  rtag="${rtag##*-}"
  digest="$(echo "$v.$rep" | cksum | cut -c1-3)"
  prefix="c${rtag:0:4}${digest}-"

  mkdir -p "$run"
  log "===== $v rep $rep/$REPEATS (prefix $prefix) ====="

  git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main
  git "${GIT_ID[@]}" init -q -b main "$run/seed"
  (
    cd "$run/seed" \
      && mkdir -p machine-docs \
      && echo "# calc work repo" > README.md \
      && : > machine-docs/.gitkeep \
      && git "${GIT_ID[@]}" add -A \
      && git "${GIT_ID[@]}" commit -q -m "chore: seed" \
      && git "${GIT_ID[@]}" remote add origin "$run/origin.git" \
      && git "${GIT_ID[@]}" push -q -u origin main
  )
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
  local clone
  for clone in work work-adv; do
    ( cd "$run/$clone" && git config user.email bench@example.com && git config user.name bench )
  done
  trust_dir "$run/work"
  trust_dir "$run/work-adv"
  gen_config "$v" "$run" "$prefix"

  # NOTE(review): the meaning of agents.py's exit status is not visible here, so
  # a non-zero status is only reported; the run still polls for the marker.
  if ! python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1; then
    log "  !! agents.py up exited non-zero (see $run/up.log)"
  fi

  local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 finished=no
  while [ "$t" -lt "$TIMEOUT" ]; do
    if [ -f "$marker" ]; then
      finished=yes
      break
    fi
    sleep "$POLL"
    t=$((t + POLL))
  done
  # The loop tests the marker before each sleep only; look once more so a run
  # that completed during the final poll interval is not recorded as timed out.
  if [ -f "$marker" ]; then
    finished=yes
  fi
  python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1

  ( cd "$run/work-adv" && git "${GIT_ID[@]}" pull -q --no-rebase origin main 2>/dev/null )

  # python3 (not bare "python") to match every other interpreter call in this script.
  local tests=no cli=no out
  if ( cd "$run/work-adv" && python3 -m unittest -q ) >/dev/null 2>&1; then
    tests=yes
  fi
  out="$( cd "$run/work-adv" && python3 calc.py '2+3*4' 2>/dev/null )"
  if [ "$out" = "14" ]; then
    cli=yes
  fi

  # standing veto = an all-caps "## VETO <reason>" header; NOT "## Veto log" (a ledger header)
  local veto=no
  if grep -rqE '^##[[:space:]]*VETO[[:space:]]' "$run/work-adv/machine-docs/" 2>/dev/null; then
    veto=yes
  fi

  local success=NO
  if [ "$finished" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && [ "$veto" = no ]; then
    success=YES
  fi

  # Token counters are function-local so one run can never leak into the next.
  local bi bo bcc bcr ai ao acc acr
  read -r bi bo bcc bcr <<<"$(collect_tokens "$run/work")"
  read -r ai ao acc acr <<<"$(collect_tokens "$run/work-adv")"
  local btok=$(( ${bi:-0} + ${bo:-0} + ${bcc:-0} + ${bcr:-0} ))
  local atok=$(( ${ai:-0} + ${ao:-0} + ${acc:-0} + ${acr:-0} ))
  local total=$(( btok + atok ))

  local commits loc
  commits=$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo 0)
  # non-blank lines across calc/*.py; grep -c prints 0 when there are no files
  loc=$(cat "$run/work-adv"/calc/*.py 2>/dev/null | grep -cve '^[[:space:]]*$')

  # append one row immediately (survives a kill): variant rep success btok atok total dur commits loc
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "$v" "$rep" "$success" "$btok" "$atok" "$total" "$t" "$commits" "$loc" >>"$DATA"
  log "  -> $v r$rep: success=$success total=$total dur=${t}s commits=$commits loc=$loc"
  # NOTE: repos are KEPT under "$run" (work/, work-adv/, origin.git) for later analysis — not deleted.
}

# ---- collect ----
# append-mode: data file is NOT wiped (clear manually for a fresh campaign)
# The column names go to a separate "$DATA.hdr" file, so "$DATA" itself holds
# nothing but result rows.
printf 'variant\trep\tsuccess\tbuilder\tadversary\ttotal\tduration_s\tcommits\tloc\n' > "$DATA.hdr"

# Variants run one after another, each repeated REPEATS times.
for v in "${VARIANTS[@]}"; do
  for rep in $(seq 1 "$REPEATS"); do
    run_one "$v" "$rep"
  done
done

# ---- aggregate ----
# Build the Markdown report from the raw per-run rows.
# Arguments: $1 - TSV data file (9 columns: variant rep success builder adversary
#                 total duration_s commits loc); a missing file means "no rows"
#            $2 - report file to (over)write
#            $3 - model name, $4 - repeats per variant, $5 - engine revision,
#            $6 - run root (all four are only quoted in the report text)
# Rows with the wrong column count are ignored; rows whose numeric columns do
# not parse are skipped with a note on stderr instead of aborting the report.
# Token statistics (population stdev), medians and correlations cover rows whose
# success column is "YES"; the raw table lists every parsed row.
# Returns 2 on bad arguments, otherwise python3's exit status.
aggregate_results() { # <data> <results> <model> <repeats> <engine-rev> <runroot>
  if [ $# -ne 6 ] || [ -z "$1" ] || [ -z "$2" ]; then
    echo "aggregate_results: need <data> <results> <model> <repeats> <engine-rev> <runroot>" >&2
    return 2
  fi
  python3 - "$@" <<'PY'
import math
import statistics as st
import sys

data, out, model, reps, eng, runroot = sys.argv[1:7]

rows = []
try:
    src = open(data, encoding="utf-8", errors="replace")
except FileNotFoundError:
    src = None  # nothing recorded yet: still emit an (empty) report
if src is not None:
    with src:
        for line in src:
            p = line.rstrip("\n").split("\t")
            if len(p) != 9:
                continue
            v, rep, ok, b, a, tot, dur, commits, loc = p
            try:
                rows.append(dict(v=v, rep=int(rep), ok=ok, builder=int(b), adversary=int(a),
                                 total=int(tot), dur=int(dur), commits=int(commits), loc=int(loc)))
            except ValueError:
                print(f"skipping malformed row: {line.rstrip()!r}", file=sys.stderr)

# Variants in order of first appearance in the data file.
variants = []
for r in rows:
    if r["v"] not in variants:
        variants.append(r["v"])


def fmt(n):
    return f"{n:,}"


def stats(xs):
    if not xs:
        return None
    return dict(n=len(xs), min=min(xs), max=max(xs), mean=int(st.mean(xs)),
                median=int(st.median(xs)), stdev=int(st.pstdev(xs)) if len(xs) > 1 else 0)


def pearson(a, b):
    if len(a) < 2:
        return float('nan')
    ma, mb = st.mean(a), st.mean(b)
    den = (sum((x - ma) ** 2 for x in a) * sum((y - mb) ** 2 for y in b)) ** 0.5
    return (sum((x - ma) * (y - mb) for x, y in zip(a, b)) / den) if den else float('nan')


def fmt_r(r):
    # An undefined correlation is shown like the other empty cells.
    return "—" if math.isnan(r) else f"{r:+.2f}"


ok_rows = [r for r in rows if r["ok"] == "YES"]

# The report contains non-ASCII characters; do not depend on the locale encoding.
with open(out, "w", encoding="utf-8") as f:
    f.write("# Full-harness benchmark — campaign\n\n")
    f.write(f"Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
            f"to SEQUENCE-COMPLETE, **{reps}× per variant**. Both loops on **{model}**. Engine `{eng}`. "
            f"Tokens summed from each loop's session transcript; commits = work-repo commit count; "
            f"LOC = non-blank lines in `calc/*.py` (code + tests). Stats over SUCCESSFUL runs.\n\n")

    f.write("## Per-variant — total tokens (successful runs)\n\n")
    f.write("| variant | runs(ok) | median | mean | min | max | stdev | spread |\n")
    f.write("|---|:--:|--:|--:|--:|--:|--:|--:|\n")
    for v in variants:
        tots = [r["total"] for r in ok_rows if r["v"] == v]
        n = sum(1 for r in rows if r["v"] == v)
        s = stats(tots)
        if not s:
            f.write(f"| {v} | 0/{n} | — | — | — | — | — | — |\n")
            continue
        spread = f"{s['max']/s['min']:.2f}x" if s['min'] else "—"
        f.write(f"| {v} | {s['n']}/{n} | {fmt(s['median'])} | {fmt(s['mean'])} | {fmt(s['min'])} | "
                f"{fmt(s['max'])} | {fmt(s['stdev'])} | {spread} |\n")

    f.write("\n## Per-variant — medians (commits / LOC / duration)\n\n")
    f.write("| variant | median commits | median LOC | median dur(s) |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            f.write(f"| {v} | — | — | — |\n")
            continue
        f.write(f"| {v} | {int(st.median([r['commits'] for r in sub]))} | "
                f"{int(st.median([r['loc'] for r in sub]))} | {int(st.median([r['dur'] for r in sub]))} |\n")

    f.write("\n## Correlations with total tokens (pooled over all successful runs)\n\n")
    f.write(f"n = {len(ok_rows)} successful runs.\n\n| tokens vs | Pearson r |\n|---|--:|\n")
    for k, lab in [("dur", "duration"), ("commits", "commits"), ("loc", "LOC (code+tests)")]:
        f.write(f"| {lab} | {fmt_r(pearson([r['total'] for r in ok_rows], [r[k] for r in ok_rows]))} |\n")

    f.write("\n## All runs (raw)\n\n")
    f.write("| variant | rep | ok | builder | adversary | total | dur(s) | commits | LOC |\n")
    f.write("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|\n")
    for r in rows:
        f.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {fmt(r['builder'])} | {fmt(r['adversary'])} | "
                f"{fmt(r['total'])} | {r['dur']} | {r['commits']} | {r['loc']} |\n")

    f.write(f"\n_Run root (repos kept here for analysis): `{runroot}`. Raw data: `RESULTS-campaign.md.data`._\n")

print("wrote", out)
PY
}

# The engine revision is informational only; fall back to "unknown" when the
# engine directory is not a git checkout.
aggregate_results "$DATA" "$RESULTS" "$MODEL" "$REPEATS" \
  "$(git -C "$ENGINE" rev-parse --short HEAD 2>/dev/null || echo unknown)" "$RUNROOT"

# Final console summary: banner, the generated report, and where the kept
# per-run repos live.
echo
echo "===== CAMPAIGN DONE ====="
if [ -f "$RESULTS" ]; then
  cat "$RESULTS"
else
  # Aggregation failed or never ran; say so instead of a bare cat error.
  echo "(no report found at $RESULTS)" >&2
fi
echo "Run root: $RUNROOT"