run-solo-bench.sh runs the builder-solo variant (single builder, self-verify, no adversary) 5× on the same calculator and appends rows to the shared campaign data file (adversary col = 0). It is a separate script so that the live campaign runner is untouched. analyze.py limit-detection now also covers the solo run layout. The engine example builder-solo is committed at a0f7652; the benchmark engine is to be re-pinned to it before running solo (after the main campaign completes).
152 lines
5.7 KiB
Bash
Executable File
152 lines
5.7 KiB
Bash
Executable File
#!/usr/bin/env bash
# run-solo-bench.sh — builder-SOLO control runs (no adversary), appended to the campaign data file.
#
# Same multi-phase calculator + same mechanics as run-harness-bench.sh, but a single builder agent
# that builds AND self-verifies (engine/examples/builder-solo). Appends rows
# variant=builder-solo rep success builder 0 total dur commits loc
# to RESULTS-campaign.md.data so analyze.py folds it into the comparison. Run AFTER the main campaign
# finishes (so it doesn't compete for usage limits). Requires the benchmark engine pinned at a ref
# that ships examples/builder-solo.
#
# Usage: BENCH_REPEATS=5 ./run-solo-bench.sh
|
|
set -u

# ---- Settings -------------------------------------------------------------
# Paths are resolved relative to the directory that contains this script.
BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
ENGINE="$BENCH_DIR/engine"
PLANS="$BENCH_DIR/plans/calc"
AGENTS_PY="$ENGINE/agents.py"

# Model name written into each generated agents.toml.
MODEL="claude-sonnet-4-6"

# Result rows are appended to this file, one tab-separated row per repetition.
DATA="$BENCH_DIR/RESULTS-campaign.md.data"

# Tunables, overridable from the environment.
REPEATS="${BENCH_REPEATS:-5}"    # number of repetitions
TIMEOUT="${BENCH_TIMEOUT:-1800}" # max seconds to wait for one repetition
POLL=60                          # seconds between completion-marker checks

# Both tunables are used in integer comparisons later on; a non-numeric value
# would make the wait loop error out and end immediately, so reject it up front.
if ! [[ "$REPEATS" =~ ^[0-9]+$ ]]; then
  echo "run-solo-bench: BENCH_REPEATS must be a non-negative integer (got '$REPEATS')" >&2
  exit 2
fi
if ! [[ "$TIMEOUT" =~ ^[0-9]+$ ]]; then
  echo "run-solo-bench: BENCH_TIMEOUT must be a non-negative integer (got '$TIMEOUT')" >&2
  exit 2
fi

# Fresh scratch root for this invocation; each repetition gets a subdirectory.
# Abort if it cannot be created: an empty RUNROOT would place runs under "/".
RUNROOT="$(mktemp -d /tmp/ao-solo-XXXXXX)" || {
  echo "run-solo-bench: could not create a run directory under /tmp" >&2
  exit 1
}

# Identity flags passed to git commands that create commits.
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)

# Variant label written in the first column of every result row.
V="builder-solo"
|
|
# log <message...> — write the message to stdout, prefixed with the current
# UTC wall-clock time in the form "[HH:MM:SS] ".
log() {
  local stamp
  stamp=$(date -u +%H:%M:%S)
  printf '[%s] %s\n' "$stamp" "$*"
}
|
|
|
|
# trust_dir <dir>
# Mark <dir> as trusted/onboarded in ~/.claude.json by setting, under
# projects[<dir>]: hasTrustDialogAccepted=true, hasCompletedProjectOnboarding=true
# and an allowedTools list (kept if already present, otherwise empty).
# Presumably this keeps the claude CLI from prompting in that directory — confirm.
# All other content of the config file is preserved. Returns python3's exit status.
trust_dir() {
  python3 - "$1" <<'PY'
import json
import os
import sys
import tempfile

cfg_path = os.path.expanduser("~/.claude.json")
project = sys.argv[1]

# A missing config file is treated as an empty config instead of crashing.
try:
    with open(cfg_path) as fh:
        cfg = json.load(fh)
except FileNotFoundError:
    cfg = {}

entry = cfg.setdefault("projects", {}).setdefault(project, {})
entry["hasTrustDialogAccepted"] = True
entry.setdefault("allowedTools", [])
entry["hasCompletedProjectOnboarding"] = True

# Write to a temp file in the same directory, close it, then rename over the
# original so a reader never sees a half-written config.
fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(cfg_path))
try:
    with os.fdopen(fd, "w") as fh:
        json.dump(cfg, fh, indent=2)
    os.replace(tmp_path, cfg_path)
except BaseException:
    # Do not leave the temp file behind when the write or rename fails.
    try:
        os.unlink(tmp_path)
    except OSError:
        pass
    raise
PY
}
|
|
# collect_tokens <workdir>
# Print the total token count recorded for <workdir> in the *.jsonl files under
# ~/.claude/projects/<slug>, where <slug> is <workdir> (trailing "/" removed)
# with every "/" and "." replaced by "-". For each record whose "type" is
# "assistant", the four usage counters (input, output, cache-creation input,
# cache-read input) of message.usage are summed. Prints 0 when nothing matches.
collect_tokens() {
  python3 - "$1" <<'PY'
import glob
import json
import os
import sys

workdir = sys.argv[1].rstrip('/')
slug = workdir.replace('/', '-').replace('.', '-')
log_dir = os.path.expanduser("~/.claude/projects/" + slug)

COUNTERS = (
    "input_tokens",
    "output_tokens",
    "cache_creation_input_tokens",
    "cache_read_input_tokens",
)

total = 0
for path in glob.glob(log_dir + "/*.jsonl"):
    try:
        fh = open(path, errors="ignore")
    except OSError:
        continue  # unreadable entry: skip it, keep counting the rest
    with fh:
        for line in fh:
            try:
                rec = json.loads(line)
            except Exception:
                continue
            # Valid JSON that is not an object (a bare number, a list, ...)
            # has no .get(); skip it instead of aborting the whole count.
            if not isinstance(rec, dict) or rec.get("type") != "assistant":
                continue
            msg = rec.get("message")
            usage = msg.get("usage") if isinstance(msg, dict) else None
            if not isinstance(usage, dict):
                continue
            for key in COUNTERS:
                val = usage.get(key)
                if isinstance(val, int) and not isinstance(val, bool):
                    total += val
print(total)
PY
}
|
|
|
|
# gen_config <run> <prefix>
# Write <run>/agents.toml for one repetition: watchdog timings, defaults
# (session prefix <prefix>, log dir <run>/.ao-state, claude backend, $MODEL),
# the claude backend settings, a single "builder" loop agent working in
# <run>/work, and a three-phase loop (lex, parse, eval) whose plans come from
# $PLANS and whose prompts come from $ENGINE/examples/builder-solo/prompts.
# Globals read: MODEL, ENGINE, PLANS. <run> must already exist.
# Returns the status of the file write.
gen_config() { # <run> <prefix>
  local run="${1:?gen_config: missing <run> directory}"
  local prefix="${2:?gen_config: missing <prefix>}"
  local solo_prompts="$ENGINE/examples/builder-solo/prompts"

  # Unquoted heredoc: $variables expand, and each "\\" pair collapses to one
  # backslash, so active_re ends up as  · \\d+  in the TOML file.
  cat > "$run/agents.toml" <<EOF
[watchdog]
signal_interval = 15
heavy_interval = 60
limit_probe_fallback = 300
limit_reset_slack = 45
stall_grace = 180

[defaults]
session_prefix = "$prefix"
log_dir = "$run/.ao-state"
backend = "claude"
model = "$MODEL"
watch = "heal"

[backend.claude]
bin = "claude"
flags = "--dangerously-skip-permissions"
remote_control = true
supports_resume = true
prompt_delivery = "arg"
process_name = "claude"
submit_key = "Enter"
stall_idle = 300
active_re = "esc to interrupt|Running tool|⠇|⠙|· \\\\d+"
limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)"
fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified"

[[agent]]
name = "builder"
kind = "loop"
role = "builder"
dir = "$run/work"
watch = "heal+stall"

[loop]
state_file = "phase-idx"
resume_phase = true
auto_advance = true
done_marker = "## DONE"
kickoff_template = "$solo_prompts/kickoff.md"
roles_dir = "$solo_prompts"
handoff = { repo = "$run/work", state_subdir = "machine-docs" }
phases = [
{ id = "lex", plan = "$PLANS/lex.md", status = "STATUS-lex.md" },
{ id = "parse", plan = "$PLANS/parse.md", status = "STATUS-parse.md" },
{ id = "eval", plan = "$PLANS/eval.md", status = "STATUS-eval.md" },
]
EOF
}
|
|
|
|
# run_one <rep>
# Execute one builder-solo repetition and append its result row to $DATA.
#
# Steps: create $RUNROOT/r<rep> with a bare origin repo, a seed commit on
# main and a work clone; mark the clone trusted; generate agents.toml; start
# the agents; poll every $POLL seconds (up to $TIMEOUT) for the
# SEQUENCE-COMPLETE marker; stop the agents; verify the result; collect
# metrics; append one tab-separated row:
#   variant  rep  success  builder-tokens  0  total-tokens  dur  commits  loc
# success is YES only when the marker appeared, the unit tests pass and
# `calc.py '2+3*4'` prints 14.
# Globals read: RUNROOT REPEATS GIT_ID AGENTS_PY TIMEOUT POLL V DATA.
# Uses: log, trust_dir, gen_config, collect_tokens.
run_one() { # <rep>
  local rep="$1" run="$RUNROOT/r$rep"

  # Session prefix: "s" + first 4 chars of the mktemp suffix + first 3 digits
  # of the rep's checksum + "-".
  local rtag rep_sum
  rtag=$(basename "$RUNROOT")
  rtag=${rtag##*-}
  rep_sum=$(echo "$rep" | cksum | cut -c1-3)
  local prefix="s${rtag:0:4}${rep_sum}-"

  mkdir -p "$run"
  log "===== builder-solo rep $rep/$REPEATS (prefix $prefix) ====="

  # Repositories: bare origin <- seed commit on main; the agent works in a clone.
  git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main
  git "${GIT_ID[@]}" init -q -b main "$run/seed"
  (
    cd "$run/seed" \
      && mkdir -p machine-docs \
      && echo "# calc" > README.md \
      && : > machine-docs/.gitkeep \
      && git "${GIT_ID[@]}" add -A \
      && git "${GIT_ID[@]}" commit -q -m seed \
      && git "${GIT_ID[@]}" remote add origin "$run/origin.git" \
      && git "${GIT_ID[@]}" push -q -u origin main
  )
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
  ( cd "$run/work" && git config user.email bench@example.com && git config user.name bench )
  trust_dir "$run/work"
  gen_config "$run" "$prefix"

  # Start the agents; a failed start is reported but the run is still recorded.
  python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1 \
    || log "  !! agents.py up exited non-zero (see $run/up.log)"

  # Wait for the completion marker; t is the elapsed wait in seconds.
  local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 completed=no
  while (( t < TIMEOUT )); do
    if [[ -f "$marker" ]]; then
      completed=yes
      break
    fi
    sleep "$POLL"
    t=$((t + POLL))
  done
  # The loop tests the marker only before each sleep, so look once more:
  # a run that finished during the final poll interval still counts.
  if [[ "$completed" == no && -f "$marker" ]]; then
    completed=yes
  fi

  python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1

  # Verification uses python3, the interpreter the rest of this script uses.
  local tests=no cli=no
  ( cd "$run/work" && python3 -m unittest -q ) >/dev/null 2>&1 && tests=yes
  local out
  out="$( cd "$run/work" && python3 calc.py '2+3*4' 2>/dev/null )"
  [[ "$out" == "14" ]] && cli=yes

  local success=NO
  if [[ "$completed" == yes && "$tests" == yes && "$cli" == yes ]]; then
    success=YES
  fi

  # Metrics: token total, commits on origin's main, non-blank lines in calc/*.py.
  local total commits loc
  total=$(collect_tokens "$run/work")
  commits=$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo 0)
  loc=$(cat "$run/work"/calc/*.py 2>/dev/null | grep -cve '^[[:space:]]*$')

  # Solo runs have no adversary: builder tokens equal the total, adversary is 0.
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "$V" "$rep" "$success" "$total" "0" "$total" "$t" "$commits" "$loc" >>"$DATA"
  log " -> builder-solo r$rep: success=$success total=$total dur=${t}s commits=$commits loc=$loc"
}
|
|
|
|
# Run the repetitions one after another, numbered 1..REPEATS.
# An arithmetic loop is used instead of `seq 1 N`: BSD/macOS seq counts
# downwards when N < 1, which would run reps 1 and 0 for BENCH_REPEATS=0.
for ((rep = 1; rep <= REPEATS; rep++)); do
  run_one "$rep"
done

# Final summary: where the rows went and where the per-run repos are kept.
echo
echo "===== SOLO DONE ====="
echo "appended to $DATA ; run repos kept under $RUNROOT"
echo "Now run: python3 analyze.py to fold builder-solo into the comparison."
|