From fc0608ede12833ac3fe689413eadb27f01b736bf Mon Sep 17 00:00:00 2001 From: mfowler Date: Mon, 15 Jun 2026 02:36:58 +0000 Subject: [PATCH] feat: builder-solo control runner (run after campaign) + limit-detect for it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run-solo-bench.sh runs the builder-solo variant (single builder, self-verify, no adversary) 5× on the same calculator and appends rows to the shared campaign data file (adversary col = 0). Separate script so the live campaign runner is untouched. analyze.py limit-detection now also covers the solo run layout. Engine example builder-solo committed at a0f7652; benchmark engine to be re-pinned to it before running solo (after the main campaign completes). --- analyze.py | 22 ++++--- run-solo-bench.sh | 151 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 7 deletions(-) create mode 100755 run-solo-bench.sh diff --git a/analyze.py b/analyze.py index 212a650..0005850 100755 --- a/analyze.py +++ b/analyze.py @@ -30,14 +30,22 @@ for line in open(DATA): # flag runs whose watchdog log shows a usage-limit hit — their duration (and thus tok/sec) is # inflated by the idle pause, even though the token total is unaffected. Look in the newest campaign # run-root (repos are kept). 
-_roots = sorted(glob.glob("/tmp/ao-campaign-*")) -_root = _roots[-1] if _roots else "" +_camp = sorted(glob.glob("/tmp/ao-campaign-*")) +_camp_root = _camp[-1] if _camp else "" +_solo = sorted(glob.glob("/tmp/ao-solo-*")) +_solo_root = _solo[-1] if _solo else "" def _limit_hit(v, rep): - for wl in glob.glob(f"{_root}/{v}/r{rep}/.ao-state/*watchdog*.log"): - try: - if "limit hit" in open(wl, errors="ignore").read(): return True - except OSError: - pass + pats = [] + if _camp_root: + pats.append(f"{_camp_root}/{v}/r{rep}/.ao-state/*watchdog*.log") # campaign layout + if v == "builder-solo" and _solo_root: + pats.append(f"{_solo_root}/r{rep}/.ao-state/*watchdog*.log") # solo layout (no variant subdir) + for pat in pats: + for wl in glob.glob(pat): + try: + if "limit hit" in open(wl, errors="ignore").read(): return True + except OSError: + pass return False for r in rows: r["limit"] = "LIMIT" if _limit_hit(r["v"], r["rep"]) else "" diff --git a/run-solo-bench.sh b/run-solo-bench.sh new file mode 100755 index 0000000..4a14abd --- /dev/null +++ b/run-solo-bench.sh @@ -0,0 +1,151 @@ +#!/usr/bin/env bash +# run-solo-bench.sh — builder-SOLO control runs (no adversary), appended to the campaign data file. +# +# Same multi-phase calculator + same mechanics as run-harness-bench.sh, but a single builder agent +# that builds AND self-verifies (engine/examples/builder-solo). Appends rows +# variant=builder-solo rep success builder 0 total dur commits loc +# to RESULTS-campaign.md.data so analyze.py folds it into the comparison. Run AFTER the main campaign +# finishes (so it doesn't compete for usage limits). Requires the benchmark engine pinned at a ref +# that ships examples/builder-solo. 
+# +# Usage: BENCH_REPEATS=5 ./run-solo-bench.sh +set -u + +BENCH_DIR="$(cd "$(dirname "$0")" && pwd)" +ENGINE="$BENCH_DIR/engine" +PLANS="$BENCH_DIR/plans/calc" +AGENTS_PY="$ENGINE/agents.py" +MODEL="claude-sonnet-4-6" +RUNROOT="$(mktemp -d /tmp/ao-solo-XXXXXX)" +DATA="$BENCH_DIR/RESULTS-campaign.md.data" +REPEATS="${BENCH_REPEATS:-5}" +TIMEOUT="${BENCH_TIMEOUT:-1800}" +POLL=60 +GIT_ID=(-c user.email=bench@example.com -c user.name=bench) +V="builder-solo" +log() { echo "[$(date -u +%H:%M:%S)] $*"; } + +trust_dir() { + python3 - "$1" <<'PY' +import json,os,sys,tempfile +p=os.path.expanduser("~/.claude.json"); d=sys.argv[1] +cfg=json.load(open(p)); e=cfg.setdefault("projects",{}).setdefault(d,{}) +e["hasTrustDialogAccepted"]=True; e.setdefault("allowedTools",[]); e["hasCompletedProjectOnboarding"]=True +fd,tmp=tempfile.mkstemp(dir=os.path.dirname(p)); os.close(fd) +json.dump(cfg,open(tmp,"w"),indent=2); os.replace(tmp,p) +PY +} +collect_tokens() { + python3 - "$1" <<'PY' +import json,sys,os,glob +wd=sys.argv[1].rstrip('/'); name=wd.replace('/','-').replace('.','-') +tdir=os.path.expanduser("~/.claude/projects/"+name); ti=to=tcc=tcr=0 +for f in glob.glob(tdir+"/*.jsonl"): + for line in open(f, errors="ignore"): + try: o=json.loads(line) + except Exception: continue + if o.get("type")=="assistant": + u=(o.get("message",{}) or {}).get("usage",{}) or {} + ti+=u.get("input_tokens",0) or 0; to+=u.get("output_tokens",0) or 0 + tcc+=u.get("cache_creation_input_tokens",0) or 0; tcr+=u.get("cache_read_input_tokens",0) or 0 +print(ti+to+tcc+tcr) +PY +} + +gen_config() { # + local run="$1" prefix="$2" + cat > "$run/agents.toml" < + local rep="$1" run="$RUNROOT/r$rep" + local rtag; rtag=$(basename "$RUNROOT"); rtag=${rtag##*-} + local prefix="s${rtag:0:4}$(echo "$rep" | cksum | cut -c1-3)-" + mkdir -p "$run" + log "===== builder-solo rep $rep/$REPEATS (prefix $prefix) =====" + + git "${GIT_ID[@]}" init -q --bare "$run/origin.git" + git -C "$run/origin.git" symbolic-ref HEAD 
refs/heads/main + git "${GIT_ID[@]}" init -q -b main "$run/seed" + ( cd "$run/seed" && mkdir -p machine-docs && echo "# calc" > README.md && : > machine-docs/.gitkeep \ + && git "${GIT_ID[@]}" add -A && git "${GIT_ID[@]}" commit -q -m seed \ + && git "${GIT_ID[@]}" remote add origin "$run/origin.git" && git "${GIT_ID[@]}" push -q -u origin main ) + git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work" + ( cd "$run/work" && git config user.email bench@example.com && git config user.name bench ) + trust_dir "$run/work" + gen_config "$run" "$prefix" + + python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1 + local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 done="no" + while [ $t -lt "$TIMEOUT" ]; do + [ -f "$marker" ] && { done="yes"; break; } + sleep "$POLL"; t=$((t+POLL)) + done + python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1 + + local tests=no cli=no + ( cd "$run/work" && python -m unittest -q ) >/dev/null 2>&1 && tests=yes + local out; out="$( cd "$run/work" && python calc.py '2+3*4' 2>/dev/null )" + [ "$out" = "14" ] && cli=yes + local success=NO + [ "$done" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && success=YES + local total commits loc + total=$(collect_tokens "$run/work") + commits=$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo 0) + loc=$(cat "$run/work"/calc/*.py 2>/dev/null | grep -cve '^[[:space:]]*$') + + printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' "$V" "$rep" "$success" "$total" "0" "$total" "$t" "$commits" "$loc" >>"$DATA" + log " -> builder-solo r$rep: success=$success total=$total dur=${t}s commits=$commits loc=$loc" +} + +for rep in $(seq 1 "$REPEATS"); do run_one "$rep"; done +echo; echo "===== SOLO DONE ====="; echo "appended to $DATA ; run repos kept under $RUNROOT" +echo "Now run: python3 analyze.py to fold builder-solo into the comparison."