#!/usr/bin/env bash
# run-solo-bench.sh — builder-SOLO control runs (no adversary), appended to the campaign data file.
#
# Same multi-phase calculator + same mechanics as run-harness-bench.sh, but a single builder agent
# that builds AND self-verifies (engine/examples/builder-solo). Appends rows
#   variant=builder-solo rep success builder 0 total dur commits loc
# to RESULTS-campaign.md.data so analyze.py folds it into the comparison. Run AFTER the main campaign
# finishes (so it doesn't compete for usage limits). Requires the benchmark engine pinned at a ref
# that ships examples/builder-solo.
#
# Usage: BENCH_REPEATS=5 ./run-solo-bench.sh
#
# Environment:
#   BENCH_REPEATS  number of repetitions to run (default 5)
#   BENCH_TIMEOUT  per-run wait for the completion marker, in seconds (default 1800)

# NOTE: `set -e` is intentionally not enabled: several steps are best-effort and
# `grep -c` exits 1 on a zero count, which must not abort a run.
set -u

BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
ENGINE="$BENCH_DIR/engine"
# shellcheck disable=SC2034 — not referenced in the visible code; presumably used by the agents.toml template (verify).
PLANS="$BENCH_DIR/plans/calc"
AGENTS_PY="$ENGINE/agents.py"
# shellcheck disable=SC2034 — not referenced in the visible code; presumably used by the agents.toml template (verify).
MODEL="claude-sonnet-4-6"
DATA="$BENCH_DIR/RESULTS-campaign.md.data"
REPEATS="${BENCH_REPEATS:-5}"
TIMEOUT="${BENCH_TIMEOUT:-1800}"
POLL=60
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
V="builder-solo"

# Print a UTC-timestamped progress line to stdout.
log() { printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"; }

# Print an error to stderr and abort the whole script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# --- preflight: fail fast instead of burning a full TIMEOUT per repetition ---
int_re='^(0|[1-9][0-9]*)$'
[[ "$REPEATS" =~ $int_re ]] || die "BENCH_REPEATS must be a non-negative integer (got '$REPEATS')"
[[ "$TIMEOUT" =~ $int_re ]] || die "BENCH_TIMEOUT must be a non-negative integer (got '$TIMEOUT')"
[[ -f "$AGENTS_PY" ]] || die "engine entry point not found: $AGENTS_PY"

# Created only after preflight so a bad invocation leaves no stray temp dir.
# An unchecked mktemp failure would leave RUNROOT empty and put run dirs at /rN.
RUNROOT="$(mktemp -d /tmp/ao-solo-XXXXXX)" || die "mktemp failed; cannot create run root"

#######################################
# Mark a directory as trusted/onboarded in ~/.claude.json.
# Sets projects[<dir>].hasTrustDialogAccepted and .hasCompletedProjectOnboarding
# to true and ensures an allowedTools list exists, then rewrites the file via a
# temp file + os.replace in the same directory.
# Arguments: $1 - project directory to register
# Returns:   non-zero if ~/.claude.json cannot be read or written
#######################################
trust_dir() {
  python3 - "$1" <<'PY'
import json,os,sys,tempfile
p=os.path.expanduser("~/.claude.json"); d=sys.argv[1]
with open(p) as fh:
    cfg=json.load(fh)
e=cfg.setdefault("projects",{}).setdefault(d,{})
e["hasTrustDialogAccepted"]=True; e.setdefault("allowedTools",[]); e["hasCompletedProjectOnboarding"]=True
fd,tmp=tempfile.mkstemp(dir=os.path.dirname(p))
# Close (and therefore flush) the temp file before it replaces the real config.
with os.fdopen(fd,"w") as fh:
    json.dump(cfg,fh,indent=2)
os.replace(tmp,p)
PY
}

#######################################
# Sum token usage recorded for a working directory.
# Reads every *.jsonl under ~/.claude/projects/<name>, where <name> is the
# directory path with '/' and '.' replaced by '-', and adds up input, output,
# cache-creation and cache-read token counts of entries whose type is "assistant".
# Arguments: $1 - working directory whose transcripts are summed
# Outputs:   the total as a single integer on stdout
#######################################
collect_tokens() {
  python3 - "$1" <<'PY'
import json,sys,os,glob
wd=sys.argv[1].rstrip('/'); name=wd.replace('/','-').replace('.','-')
tdir=os.path.expanduser("~/.claude/projects/"+name); ti=to=tcc=tcr=0
for f in glob.glob(tdir+"/*.jsonl"):
    with open(f, errors="ignore") as fh:
        for line in fh:
            try: o=json.loads(line)
            except Exception: continue
            # A line that parses to a non-object (list, number, ...) has no usage.
            if not isinstance(o,dict): continue
            if o.get("type")=="assistant":
                u=(o.get("message",{}) or {}).get("usage",{}) or {}
                ti+=u.get("input_tokens",0) or 0; to+=u.get("output_tokens",0) or 0
                tcc+=u.get("cache_creation_input_tokens",0) or 0; tcr+=u.get("cache_read_input_tokens",0) or 0
print(ti+to+tcc+tcr)
PY
}

#######################################
# Write the engine configuration to <run>/agents.toml.
# Arguments: $1 - run directory, $2 - per-run name prefix
#
# NOTE(review): the agents.toml heredoc template is missing from the copy of
# this file under review (everything between `cat > "$run/agents.toml" <` and
# the start of run_one was lost). It is NOT reconstructed here because its
# contents cannot be inferred. Restore the original template from version
# control before running; until then this function aborts rather than letting
# every repetition wait out the full timeout with an empty config.
#######################################
gen_config() {
  local run="$1" prefix="$2"
  die "gen_config: agents.toml template is missing from this script (run=$run prefix=$prefix); restore it before running"
}

#######################################
# Execute one builder-solo repetition and append its result row to $DATA.
# Sets up a bare origin + seeded clone, launches the engine, polls for the
# SEQUENCE-COMPLETE marker, tears the engine down, then verifies and measures.
# Globals:   RUNROOT, REPEATS, TIMEOUT, POLL, GIT_ID, AGENTS_PY, DATA, V (read)
# Arguments: $1 - repetition number
# Outputs:   progress lines on stdout; one tab-separated row appended to $DATA
#######################################
run_one() {
  # Separate statements: in `local a=$1 b=...$a` every word is expanded before
  # `local` assigns anything, so $a would read the caller's variable.
  local rep="$1"
  local run="$RUNROOT/r$rep"

  # Prefix = "s" + first 4 chars of the mktemp suffix + 3 digits of cksum(rep) + "-".
  local rtag="${RUNROOT##*/}"
  rtag="${rtag##*-}"
  local rep_sum
  rep_sum="$(echo "$rep" | cksum | cut -c1-3)"
  local prefix="s${rtag:0:4}${rep_sum}-"

  mkdir -p "$run"
  log "===== builder-solo rep $rep/$REPEATS (prefix $prefix) ====="

  # Bare origin whose HEAD points at main, seeded from a throwaway repo.
  git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main
  git "${GIT_ID[@]}" init -q -b main "$run/seed"
  (
    cd "$run/seed" \
      && mkdir -p machine-docs \
      && echo "# calc" > README.md \
      && : > machine-docs/.gitkeep \
      && git "${GIT_ID[@]}" add -A \
      && git "${GIT_ID[@]}" commit -q -m seed \
      && git "${GIT_ID[@]}" remote add origin "$run/origin.git" \
      && git "${GIT_ID[@]}" push -q -u origin main
  )

  # Working clone with a persistent identity for commits made inside it.
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
  ( cd "$run/work" && git config user.email bench@example.com && git config user.name bench )

  trust_dir "$run/work" || log "WARN: trust_dir failed for $run/work"
  gen_config "$run" "$prefix"
  python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1

  # Poll for the completion marker; t doubles as the reported duration, so it
  # has POLL-second granularity.
  local marker="$run/.ao-state/SEQUENCE-COMPLETE"
  local t=0 done="no"
  while (( t < TIMEOUT )); do
    if [[ -f "$marker" ]]; then
      done="yes"
      break
    fi
    sleep "$POLL"
    t=$((t + POLL))
  done
  python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1

  # Verification: unit tests pass and the CLI evaluates 2+3*4 to 14.
  # NOTE(review): these steps call `python`, not `python3` like the rest of the
  # script; left unchanged to stay consistent with the sibling harness — confirm.
  local tests=no cli=no
  if ( cd "$run/work" && python -m unittest -q ) >/dev/null 2>&1; then
    tests=yes
  fi
  local out
  out="$( cd "$run/work" && python calc.py '2+3*4' 2>/dev/null )"
  if [[ "$out" == "14" ]]; then
    cli=yes
  fi

  local success=NO
  if [[ "$done" == yes && "$tests" == yes && "$cli" == yes ]]; then
    success=YES
  fi

  # Metrics: total tokens, commits on origin's main, non-blank lines in calc/*.py.
  local total commits loc
  total=$(collect_tokens "$run/work")
  commits=$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo 0)
  loc=$(cat "$run/work"/calc/*.py 2>/dev/null | grep -cve '^[[:space:]]*$')

  # Columns: variant rep success builder 0 total dur commits loc
  # (builder == total because there is only one agent in this variant).
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "$V" "$rep" "$success" "$total" "0" "$total" "$t" "$commits" "$loc" >>"$DATA"
  log " -> builder-solo r$rep: success=$success total=$total dur=${t}s commits=$commits loc=$loc"
}

for ((rep = 1; rep <= REPEATS; rep++)); do
  run_one "$rep"
done

echo
echo "===== SOLO DONE ====="
echo "appended to $DATA ; run repos kept under $RUNROOT"
echo "Now run: python3 analyze.py to fold builder-solo into the comparison."