#!/usr/bin/env bash
# run-harness-bench.sh — FULL harness benchmark (real agents.py up loop, not headless single-pass).
#
# For each variant (by default builder-adversary, builder-adversary-min,
# builder-adversary-stateless and builder-adversary-lean) this stands up a real Builder/Adversary
# loop pair + watchdog over a shared work repo and lets them run autonomously through the
# multi-phase calculator (plans/calc/{lex,parse,eval}.md) to SEQUENCE-COMPLETE. Both loops on
# Sonnet. Then it clocks the tokens each loop used (summed from the Claude Code session
# transcripts) and re-runs the final Definition-of-Done itself.
#
# Long, autonomous, nondeterministic (N=1). Per-variant wall-clock timeout below.
#
# Usage: ./run-harness-bench.sh [variant ...]
#   variant ...     run only the named variants (default: every entry of VARIANTS below)
#   BENCH_TIMEOUT   environment override for the per-variant timeout, in seconds (default 3000)
set -u

# Validate the timeout up front: the poll loop compares it with `[ -lt ]`, which errors out on a
# non-numeric value and would otherwise tear every variant down immediately.
TIMEOUT="${BENCH_TIMEOUT:-3000}"   # seconds per variant
case "$TIMEOUT" in
  *[!0-9]*)
    echo "BENCH_TIMEOUT must be a whole number of seconds (got '$TIMEOUT')" >&2
    exit 2
    ;;
esac

# Every other path is derived from BENCH_DIR and RUNROOT; an empty value would silently redirect
# the run to paths under "/", so stop if either cannot be established.
BENCH_DIR="$(cd -- "$(dirname -- "$0")" && pwd)" || {
  echo "cannot resolve the script directory from '$0'" >&2
  exit 1
}
ENGINE="$BENCH_DIR/engine"
PLANS="$BENCH_DIR/plans/calc"
AGENTS_PY="$ENGINE/agents.py"
MODEL="claude-sonnet-4-6"
RUNROOT="$(mktemp -d /tmp/ao-harness-XXXXXX)" || {   # no dot → clean transcript-dir mapping
  echo "mktemp could not create a run root under /tmp" >&2
  exit 1
}
RESULTS="$BENCH_DIR/RESULTS-harness.md"
POLL=60
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless builder-adversary-lean)
# run only the variants named on the command line, if any
if [ $# -gt 0 ]; then
  VARIANTS=("$@")
fi

# pre-trust a work dir in ~/.claude.json so interactive claude (in tmux) skips the workspace-trust
# dialog (--dangerously-skip-permissions only skips it for redirected/headless output). Atomic merge:
# add only this dir's entry, preserve everything else (the file is shared global state).
trust_dir() { python3 - "$1" <<'PY' import json,os,sys,tempfile p=os.path.expanduser("~/.claude.json"); d=sys.argv[1] cfg=json.load(open(p)) e=cfg.setdefault("projects",{}).setdefault(d,{}) e["hasTrustDialogAccepted"]=True; e.setdefault("allowedTools",[]); e["hasCompletedProjectOnboarding"]=True fd,tmp=tempfile.mkstemp(dir=os.path.dirname(p)); os.close(fd) json.dump(cfg,open(tmp,"w"),indent=2); os.replace(tmp,p) PY } log() { echo "[$(date -u +%H:%M:%S)] $*"; } # transcript token sum for an agent's working dir -> "in out cache_create cache_read" collect_tokens() { python3 - "$1" <<'PY' import json,sys,os,glob wd=sys.argv[1].rstrip('/') name=wd.replace('/','-').replace('.','-') # '/tmp/x' -> '-tmp-x' tdir=os.path.expanduser("~/.claude/projects/"+name) ti=to=tcc=tcr=0 for f in glob.glob(tdir+"/*.jsonl"): for line in open(f, errors="ignore"): try: o=json.loads(line) except Exception: continue if o.get("type")=="assistant": u=(o.get("message",{}) or {}).get("usage",{}) or {} ti+=u.get("input_tokens",0) or 0; to+=u.get("output_tokens",0) or 0 tcc+=u.get("cache_creation_input_tokens",0) or 0; tcr+=u.get("cache_read_input_tokens",0) or 0 print(ti,to,tcc,tcr) PY } gen_config() { # local v="$1" run="$2" prefix="$3" cat > "$run/agents.toml" < README.md \ && : > machine-docs/.gitkeep \ && git "${GIT_ID[@]}" add -A && git "${GIT_ID[@]}" commit -q -m "chore: seed work repo" \ && git "${GIT_ID[@]}" remote add origin "$run/origin.git" && git "${GIT_ID[@]}" push -q -u origin main ) git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work" git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv" for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done trust_dir "$run/work"; trust_dir "$run/work-adv" # let interactive claude skip the trust dialog gen_config "$v" "$run" "$prefix" log "[$v] agents.py up …" python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1 log "[$v] up done; status:"; python3 
"$AGENTS_PY" status --config "$run/agents.toml" 2>&1 | sed 's/^/ /' # poll for SEQUENCE-COMPLETE or timeout local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 done="no" while [ $t -lt "$TIMEOUT" ]; do if [ -f "$marker" ]; then done="yes"; break; fi sleep "$POLL"; t=$((t+POLL)) local idx commits idx="$(cat "$run/.ao-state/state/phase-idx" 2>/dev/null || echo '?')" commits="$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo '?')" log "[$v] t=${t}s phase-idx=$idx origin-commits=$commits" done log "[$v] loop finished (sequence-complete=$done after ${t}s); tearing down" python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1 # final DoD check from the adversary's clone (pull latest first) ( cd "$run/work-adv" && git "${GIT_ID[@]}" pull -q --no-rebase origin main 2>/dev/null ) local tests=no cli=no ( cd "$run/work-adv" && python -m unittest -q ) >"$run/final-unittest.txt" 2>&1 && tests=yes local out; out="$( cd "$run/work-adv" && python calc.py '2+3*4' 2>/dev/null )" [ "$out" = "14" ] && cli=yes local reviews; reviews="$(grep -rhoiE '(lex|parse|eval)/D[0-9]+:?\s*PASS' "$run/work-adv/machine-docs/" 2>/dev/null | sort -u | wc -l)" # a standing veto is the "## VETO" marker (per the prompts) — NOT the word "veto" (matches "No veto") local veto="no"; grep -rqiE '##[[:space:]]*VETO' "$run/work-adv/machine-docs/" 2>/dev/null && veto=yes SUM_PHASES[$v]="$(cat "$run/.ao-state/state/phase-idx" 2>/dev/null || echo '?')" local success=NO [ "$done" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && [ "$veto" = no ] && success=YES SUM_OK[$v]=$success # tokens read -r bi bo bcc bcr <<<"$(collect_tokens "$run/work")" read -r ai ao acc acr <<<"$(collect_tokens "$run/work-adv")" local btok=$((bi+bo+bcc+bcr)) atok=$((ai+ao+acc+acr)) vtok=$(( bi+bo+bcc+bcr + ai+ao+acc+acr )) SUM_TOK[$v]=$vtok { echo "### $v" echo "- **success:** $success (sequence-complete=$done, tests=$tests, cli('2+3*4'→'$out')=$cli, gates-passed=$reviews, veto=$veto, final 
phase-idx=${SUM_PHASES[$v]})" echo "- **builder loop:** in=$bi out=$bo cache_create=$bcc cache_read=$bcr → **${btok}** tok" echo "- **adversary loop:** in=$ai out=$ao cache_create=$acc cache_read=$acr → **${atok}** tok" echo "- **total:** **${vtok}** tokens" echo } >>"$RESULTS.tmp" log "[$v] DONE success=$success tokens=$vtok gates-passed=$reviews" } prompt_chars() { cat "$ENGINE/examples/$1/prompts/kickoff.md" "$ENGINE/examples/$1/prompts/$2.md" | wc -c | tr -d ' '; } : >"$RESULTS.tmp" for v in "${VARIANTS[@]}"; do run_variant "$v"; done { echo "# Full-harness benchmark — prompt variants" echo echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the" echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned" echo "at \`$(git -C "$ENGINE" rev-parse --short HEAD)\`. Both loops on **$MODEL**. Per-variant timeout" echo "${TIMEOUT}s. Tokens summed from the Claude Code session transcripts of each loop's clone." echo echo "## Static prompt size (chars: kickoff + role)" echo "| version | builder | adversary |" echo "|---|--:|--:|" for v in "${VARIANTS[@]}"; do echo "| $v | $(prompt_chars "$v" builder) | $(prompt_chars "$v" adversary) |"; done echo echo "## Per-variant" echo cat "$RESULTS.tmp" echo "## Summary" echo "| version | success | total tokens |" echo "|---|:--:|--:|" for v in "${VARIANTS[@]}"; do echo "| $v | ${SUM_OK[$v]:-?} | ${SUM_TOK[$v]:-?} |"; done echo echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._" echo "_Run dirs: \`$RUNROOT\`_" } >"$RESULTS" rm -f "$RESULTS.tmp" echo; echo "===== ALL DONE =====" for v in "${VARIANTS[@]}"; do echo "$v: success=${SUM_OK[$v]:-?} tokens=${SUM_TOK[$v]:-?}"; done echo "Results: $RESULTS" echo "Run dirs: $RUNROOT"