feat: builder-solo control runner (run after campaign) + limit-detect for it

run-solo-bench.sh runs the builder-solo variant (single builder, self-verify, no adversary) 5× by default on the same calculator and appends rows to the shared campaign data file (adversary col = 0). It is a separate script so that the live campaign runner is untouched. analyze.py limit-detection now also covers the solo run layout. The engine example builder-solo is committed at a0f7652; the benchmark engine must be re-pinned to it before running solo (after the main campaign completes).
2026-06-15 02:36:58 +00:00
parent 25a77f5d3c
commit fc0608ede1
2 changed files with 166 additions and 7 deletions
--- a/analyze.py
+++ b/analyze.py
@ -30,14 +30,22 @@ for line in open(DATA):
 # flag runs whose watchdog log shows a usage-limit hit — their duration (and thus tok/sec) is
 # inflated by the idle pause, even though the token total is unaffected. Look in the newest campaign
 # run-root (repos are kept).
-_roots = sorted(glob.glob("/tmp/ao-campaign-*"))
-_root = _roots[-1] if _roots else ""
+_camp = sorted(glob.glob("/tmp/ao-campaign-*"))
+_camp_root = _camp[-1] if _camp else ""
+# Solo run-roots are created by `mktemp -d /tmp/ao-solo-XXXXXX` (random suffix), so name order
+# says nothing about age — pick the most recently modified directory instead.
+import os  # only needed for the mtime lookup below; a repeated import is harmless
+_solo = glob.glob("/tmp/ao-solo-*")
+_solo_root = max(_solo, key=os.path.getmtime) if _solo else ""
 def _limit_hit(v, rep):
-    for wl in glob.glob(f"{_root}/{v}/r{rep}/.ao-state/*watchdog*.log"):
-        try:
-            if "limit hit" in open(wl, errors="ignore").read(): return True
-        except OSError:
-            pass
+    pats = []
+    if _camp_root:
+        pats.append(f"{_camp_root}/{v}/r{rep}/.ao-state/*watchdog*.log")   # campaign layout
+    if v == "builder-solo" and _solo_root:
+        pats.append(f"{_solo_root}/r{rep}/.ao-state/*watchdog*.log")        # solo layout (no variant subdir)
+    for pat in pats:
+        for wl in glob.glob(pat):
+            try:
+                if "limit hit" in open(wl, errors="ignore").read(): return True
+            except OSError:
+                pass
    return False
 for r in rows:
    r["limit"] = "LIMIT" if _limit_hit(r["v"], r["rep"]) else ""
--- a/run-solo-bench.sh
+++ b/run-solo-bench.sh
@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+# run-solo-bench.sh — builder-SOLO control runs (no adversary), appended to the campaign data file.
+#
+# Same multi-phase calculator + same mechanics as run-harness-bench.sh, but a single builder agent
+# that builds AND self-verifies (engine/examples/builder-solo). Appends rows
+#   variant=builder-solo  rep  success  builder  0  total  dur  commits  loc
+# to RESULTS-campaign.md.data so analyze.py folds it into the comparison. Run AFTER the main campaign
+# finishes (so it doesn't compete for usage limits). Requires the benchmark engine pinned at a ref
+# that ships examples/builder-solo.
+#
+# Usage:  BENCH_REPEATS=5 ./run-solo-bench.sh
+set -u   # referencing an unset variable is an error
+
+# All paths are resolved relative to the directory this script lives in.
+BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
+ENGINE="$BENCH_DIR/engine"
+PLANS="$BENCH_DIR/plans/calc"
+AGENTS_PY="$ENGINE/agents.py"
+MODEL="claude-sonnet-4-6"
+# Fresh run root per invocation. Abort if it cannot be created: with an empty RUNROOT every
+# per-rep path below would collapse to "/r<rep>".
+RUNROOT="$(mktemp -d /tmp/ao-solo-XXXXXX)" || { echo "mktemp failed: cannot create run root" >&2; exit 1; }
+DATA="$BENCH_DIR/RESULTS-campaign.md.data"   # shared data file that result rows are appended to
+REPEATS="${BENCH_REPEATS:-5}"                # number of repetitions (override: BENCH_REPEATS)
+TIMEOUT="${BENCH_TIMEOUT:-1800}"             # per-run polling cap in seconds (override: BENCH_TIMEOUT)
+POLL=60                                      # seconds between completion-marker checks
+GIT_ID=(-c user.email=bench@example.com -c user.name=bench)   # git identity flags
+V="builder-solo"                             # variant label written in the first column
+# log <msg...> — print a message prefixed with the current UTC time.
+log() { echo "[$(date -u +%H:%M:%S)] $*"; }
+
+# trust_dir <dir> — register <dir> under "projects" in ~/.claude.json with
+# hasTrustDialogAccepted and hasCompletedProjectOnboarding set to true, and make sure an
+# allowedTools list exists for it. A missing config file is treated as empty rather than
+# crashing the helper. The file is rewritten through a temp file in the same directory and
+# os.replace, and the temp file is closed before the rename so its contents are fully flushed.
+trust_dir() {
+  python3 - "$1" <<'PY'
+import json,os,sys,tempfile
+p=os.path.expanduser("~/.claude.json"); d=sys.argv[1]
+try:
+    with open(p) as f: cfg=json.load(f)
+except FileNotFoundError:
+    cfg={}
+e=cfg.setdefault("projects",{}).setdefault(d,{})
+e["hasTrustDialogAccepted"]=True; e.setdefault("allowedTools",[]); e["hasCompletedProjectOnboarding"]=True
+fd,tmp=tempfile.mkstemp(dir=os.path.dirname(p))
+with os.fdopen(fd,"w") as f: json.dump(cfg,f,indent=2)
+os.replace(tmp,p)
+PY
+}
+# collect_tokens <workdir> — print the total token count recorded for <workdir>.
+# Scans every *.jsonl file under ~/.claude/projects/<name>, where <name> is <workdir> with each
+# '/' and '.' replaced by '-', and adds up the input, output, cache-creation and cache-read
+# token counts of all entries whose type is "assistant". Unparseable lines are skipped.
+collect_tokens() {
+  python3 - "$1" <<'PY'
+import glob, json, os, sys
+
+workdir = sys.argv[1].rstrip('/')
+project = workdir.replace('/', '-').replace('.', '-')
+transcripts = os.path.expanduser("~/.claude/projects/" + project)
+FIELDS = ("input_tokens", "output_tokens",
+          "cache_creation_input_tokens", "cache_read_input_tokens")
+
+grand_total = 0
+for path in glob.glob(transcripts + "/*.jsonl"):
+    for raw in open(path, errors="ignore"):
+        try:
+            entry = json.loads(raw)
+        except Exception:
+            continue
+        if entry.get("type") != "assistant":
+            continue
+        usage = (entry.get("message", {}) or {}).get("usage", {}) or {}
+        grand_total += sum(usage.get(key, 0) or 0 for key in FIELDS)
+print(grand_total)
+PY
+}
+
+# gen_config <run> <prefix> — write the per-run engine configuration to <run>/agents.toml.
+# The heredoc delimiter is unquoted, so $prefix, $run, $MODEL, $ENGINE and $PLANS are expanded
+# by the shell when the file is written, and each "\\" pair in the heredoc collapses to a single
+# backslash. The config declares one "builder" loop agent working in <run>/work, logs under
+# <run>/.ao-state, and three phases (lex, parse, eval) whose plans come from $PLANS.
+gen_config() {  # <run> <prefix>
+  local run="$1" prefix="$2"
+  cat > "$run/agents.toml" <<EOF
+[watchdog]
+signal_interval = 15
+heavy_interval  = 60
+limit_probe_fallback = 300
+limit_reset_slack = 45
+stall_grace = 180
+
+[defaults]
+session_prefix = "$prefix"
+log_dir = "$run/.ao-state"
+backend = "claude"
+model = "$MODEL"
+watch = "heal"
+
+[backend.claude]
+bin = "claude"
+flags = "--dangerously-skip-permissions"
+remote_control = true
+supports_resume = true
+prompt_delivery = "arg"
+process_name = "claude"
+submit_key = "Enter"
+stall_idle = 300
+active_re = "esc to interrupt|Running tool|⠇|⠙|· \\\\d+"
+limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)"
+fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified"
+
+[[agent]]
+name = "builder"
+kind = "loop"
+role = "builder"
+dir = "$run/work"
+watch = "heal+stall"
+
+[loop]
+state_file = "phase-idx"
+resume_phase = true
+auto_advance = true
+done_marker = "## DONE"
+kickoff_template = "$ENGINE/examples/builder-solo/prompts/kickoff.md"
+roles_dir = "$ENGINE/examples/builder-solo/prompts"
+handoff = { repo = "$run/work", state_subdir = "machine-docs" }
+phases = [
+  { id = "lex",   plan = "$PLANS/lex.md",   status = "STATUS-lex.md" },
+  { id = "parse", plan = "$PLANS/parse.md", status = "STATUS-parse.md" },
+  { id = "eval",  plan = "$PLANS/eval.md",  status = "STATUS-eval.md" },
+]
+EOF
+}
+
+# run_one <rep> — execute one builder-solo repetition and append its result row to $DATA.
+#   1. Create a bare origin repo, seed it with README.md + machine-docs/, and clone it to
+#      $RUNROOT/r<rep>/work.
+#   2. Trust the work dir, write agents.toml, bring the engine up, then poll for the
+#      .ao-state/SEQUENCE-COMPLETE marker every $POLL seconds for at most $TIMEOUT seconds.
+#   3. Bring the engine down, verify the result (unittest run + CLI check that '2+3*4' prints 14)
+#      and gather token, commit and line counts.
+# Row (tab-separated): variant rep success builder-tokens 0 total-tokens duration commits loc
+# success is YES only when the marker appeared AND the tests pass AND the CLI check passes.
+run_one() {  # <rep>
+  local rep="$1" run="$RUNROOT/r$rep"
+  local rtag; rtag=$(basename "$RUNROOT"); rtag=${rtag##*-}
+  # session prefix: "s" + first 4 chars of the run-root suffix + first 3 chars of cksum(rep) + "-"
+  local prefix="s${rtag:0:4}$(echo "$rep" | cksum | cut -c1-3)-"
+  mkdir -p "$run"
+  log "===== builder-solo rep $rep/$REPEATS (prefix $prefix) ====="
+
+  git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
+  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main
+  git "${GIT_ID[@]}" init -q -b main "$run/seed"
+  ( cd "$run/seed" && mkdir -p machine-docs && echo "# calc" > README.md && : > machine-docs/.gitkeep \
+      && git "${GIT_ID[@]}" add -A && git "${GIT_ID[@]}" commit -q -m seed \
+      && git "${GIT_ID[@]}" remote add origin "$run/origin.git" && git "${GIT_ID[@]}" push -q -u origin main )
+  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
+  ( cd "$run/work" && git config user.email bench@example.com && git config user.name bench )
+  trust_dir "$run/work"
+  gen_config "$run" "$prefix"
+
+  python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1
+  local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 done="no"
+  while [ $t -lt "$TIMEOUT" ]; do
+    [ -f "$marker" ] && { done="yes"; break; }
+    sleep "$POLL"; t=$((t+POLL))
+  done
+  # The marker may have appeared during the final sleep — check once more before giving up,
+  # otherwise a run that finished inside the last poll interval is recorded as not done.
+  [ "$done" = yes ] || { [ -f "$marker" ] && done="yes"; }
+  python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1
+
+  # Verification interpreter: keep using `python` where it exists, fall back to python3 on hosts
+  # that ship no `python` alias (there both checks would otherwise always fail).
+  local py=python
+  command -v "$py" >/dev/null 2>&1 || py=python3
+  local tests=no cli=no
+  ( cd "$run/work" && "$py" -m unittest -q ) >/dev/null 2>&1 && tests=yes
+  local out; out="$( cd "$run/work" && "$py" calc.py '2+3*4' 2>/dev/null )"
+  [ "$out" = "14" ] && cli=yes
+  local success=NO
+  [ "$done" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && success=YES
+  local total commits loc
+  total=$(collect_tokens "$run/work")
+  commits=$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo 0)
+  # non-blank lines across calc/*.py in the work tree (0 when there are none)
+  loc=$(cat "$run/work"/calc/*.py 2>/dev/null | grep -cve '^[[:space:]]*$')
+
+  # solo run: the builder column equals the total, the adversary column is the literal 0
+  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' "$V" "$rep" "$success" "$total" "0" "$total" "$t" "$commits" "$loc" >>"$DATA"
+  log "  -> builder-solo r$rep: success=$success total=$total dur=${t}s commits=$commits loc=$loc"
+}
+
+# Run the repetitions one after another, then say where the results went.
+rep=1
+while [ "$rep" -le "$REPEATS" ]; do
+  run_one "$rep"
+  rep=$((rep+1))
+done
+printf '\n%s\n' "===== SOLO DONE ====="
+echo "appended to $DATA ; run repos kept under $RUNROOT"
+echo "Now run:  python3 analyze.py   to fold builder-solo into the comparison."