- bump engine submodule to e0425e6 (adds builder-adversary-lean: context hygiene + enforced per-gate review) - run-harness-bench.sh: accept variant names as CLI args to run a subset
233 lines
9.8 KiB
Bash
Executable File
233 lines
9.8 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# run-harness-bench.sh — FULL harness benchmark (real agents.py up loop, not headless single-pass).
#
# For each variant (by default builder-adversary, builder-adversary-min, builder-adversary-stateless
# and builder-adversary-lean) this stands up a real Builder/Adversary loop pair + watchdog over a
# shared work repo and lets them run autonomously through the multi-phase calculator
# (plans/calc/{lex,parse,eval}.md) to SEQUENCE-COMPLETE. Both loops on Sonnet. Then it clocks the
# tokens each loop used (summed from the Claude Code session transcripts) and re-runs the final
# Definition-of-Done itself.
#
# Long, autonomous, nondeterministic (N=1). Per-variant wall-clock timeout below (override with
# BENCH_TIMEOUT, in seconds).
# Usage: ./run-harness-bench.sh [variant ...]   # no arguments → run every default variant
|
|
set -u

# --- static configuration -------------------------------------------------------------------
BENCH_DIR="$(cd "$(dirname "$0")" && pwd)"
ENGINE="$BENCH_DIR/engine"
PLANS="$BENCH_DIR/plans/calc"
AGENTS_PY="$ENGINE/agents.py"
MODEL="claude-sonnet-4-6"
RESULTS="$BENCH_DIR/RESULTS-harness.md"
POLL=60
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)

# Per-variant wall-clock budget in seconds. Reject anything that is not a plain non-negative
# integer up front: the poll loop compares it with `[ … -lt … ]`, which errors on other input and
# would end every variant after 0 seconds.
TIMEOUT="${BENCH_TIMEOUT:-3000}" # seconds per variant
case "$TIMEOUT" in
  '' | *[!0-9]*)
    printf 'run-harness-bench: BENCH_TIMEOUT must be a whole number of seconds, got: %s\n' "$TIMEOUT" >&2
    exit 2
    ;;
esac

VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless builder-adversary-lean)
# run only the variants named on the command line, if any
if [ $# -gt 0 ]; then
  VARIANTS=("$@")
fi

# Root of all per-variant run dirs. Abort if it cannot be created: an empty RUNROOT would place
# the run dirs directly under "/".
RUNROOT="$(mktemp -d /tmp/ao-harness-XXXXXX)" || { # no dot → clean transcript-dir mapping
  printf 'run-harness-bench: mktemp failed, cannot create a run root under /tmp\n' >&2
  exit 1
}
|
|
|
|
# Pre-trust a work dir in ~/.claude.json so interactive claude (in tmux) skips the workspace-trust
# dialog (--dangerously-skip-permissions only skips it for redirected/headless output). Atomic merge:
# add only this dir's entry, preserve everything else (the file is shared global state).
#
# Arguments: $1 - absolute path of the work dir to mark as trusted
# Returns:   python3's exit status (non-zero if the config is unreadable/invalid or unwritable)
trust_dir() {
  python3 - "$1" <<'PY'
import json
import os
import sys
import tempfile

path = os.path.expanduser("~/.claude.json")
work_dir = sys.argv[1]

# A config that does not exist yet is treated as empty instead of aborting with a traceback.
try:
    with open(path) as fh:
        cfg = json.load(fh)
except FileNotFoundError:
    cfg = {}

entry = cfg.setdefault("projects", {}).setdefault(work_dir, {})
entry["hasTrustDialogAccepted"] = True
entry.setdefault("allowedTools", [])
entry["hasCompletedProjectOnboarding"] = True

# Write next to the target and rename over it, so readers never observe a half-written file.
fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
try:
    with os.fdopen(fd, "w") as fh:  # closed (and flushed) before the rename
        json.dump(cfg, fh, indent=2)
    os.replace(tmp, path)
except Exception:
    os.unlink(tmp)  # do not leave a stray temp file beside the shared config
    raise
PY
}
|
|
|
|
# Print a progress line to stdout, prefixed with the current UTC time as [HH:MM:SS].
# Arguments: the message words (joined the way "$*" joins them)
log() {
  local stamp
  stamp="$(date -u +%H:%M:%S)"
  printf '[%s] %s\n' "$stamp" "$*"
}
|
|
|
|
# Transcript token sum for an agent's working dir -> "in out cache_create cache_read".
#
# Reads every *.jsonl file under ~/.claude/projects/<mapped dir> and adds up the usage counters of
# the records whose "type" is "assistant". Prints "0 0 0 0" when no transcript is found.
#
# Arguments: $1 - the agent's working dir (a trailing '/' is ignored)
# Outputs:   four space-separated integers on stdout
collect_tokens() {
  python3 - "$1" <<'PY'
import glob
import json
import os
import sys

work_dir = sys.argv[1].rstrip('/')
project = work_dir.replace('/', '-').replace('.', '-')  # '/tmp/x' -> '-tmp-x'
transcript_dir = os.path.expanduser("~/.claude/projects/" + project)

tok_in = tok_out = cache_create = cache_read = 0
for path in glob.glob(transcript_dir + "/*.jsonl"):
    with open(path, errors="ignore") as fh:
        for line in fh:
            try:
                rec = json.loads(line)
            except Exception:
                continue  # blank or truncated line
            # Valid JSON that is not an object (list, string, number) carries no usage data.
            if not isinstance(rec, dict) or rec.get("type") != "assistant":
                continue
            msg = rec.get("message")
            usage = msg.get("usage") if isinstance(msg, dict) else None
            if not isinstance(usage, dict):
                continue
            # NOTE(review): if a transcript repeats one message's usage across several records,
            # this sum counts it each time — confirm against the transcript format.
            tok_in += usage.get("input_tokens", 0) or 0
            tok_out += usage.get("output_tokens", 0) or 0
            cache_create += usage.get("cache_creation_input_tokens", 0) or 0
            cache_read += usage.get("cache_read_input_tokens", 0) or 0

print(tok_in, tok_out, cache_create, cache_read)
PY
}
|
|
|
|
# Write the agents.py configuration for one variant to <run>/agents.toml.
#
# Globals:   MODEL, ENGINE, PLANS (read)
# Arguments: $1 - variant name (selects $ENGINE/examples/<variant>/prompts)
#            $2 - run dir; receives agents.toml and is the parent of work/, work-adv/, .ao-state/
#            $3 - session prefix for this (run, variant)
# Returns:   non-zero if the run dir or the file cannot be written
gen_config() { # <variant> <run> <prefix>
  local v="$1" run="$2" prefix="$3"
  mkdir -p "$run" || return
  # Unquoted heredoc: $vars expand, and each "\\" pair collapses to one backslash, so active_re
  # reaches the TOML file as "\\d+".
  cat > "$run/agents.toml" <<EOF
[watchdog]
signal_interval = 15
heavy_interval = 60
limit_probe_fallback = 300
limit_reset_slack = 45
stall_grace = 180

[defaults]
session_prefix = "$prefix"
log_dir = "$run/.ao-state"
backend = "claude"
model = "$MODEL"
watch = "heal"

[backend.claude]
bin = "claude"
flags = "--dangerously-skip-permissions"
remote_control = true
supports_resume = true
prompt_delivery = "arg"
process_name = "claude"
submit_key = "Enter"
stall_idle = 300
active_re = "esc to interrupt|Running tool|⠇|⠙|· \\\\d+"
limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)"
fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified"

[[agent]]
name = "builder"
kind = "loop"
role = "builder"
dir = "$run/work"
watch = "heal+stall"

[[agent]]
name = "adversary"
session = "${prefix}adv"
kind = "loop"
role = "adversary"
dir = "$run/work-adv"
watch = "heal+stall"

[loop]
state_file = "phase-idx"
resume_phase = true
auto_advance = true
done_marker = "## DONE"
kickoff_template = "$ENGINE/examples/$v/prompts/kickoff.md"
roles_dir = "$ENGINE/examples/$v/prompts"
handoff = { repo = "$run/work", claim_pings = "adversary", review_pings = "builder", inboxes = ["ADVERSARY-INBOX.md", "BUILDER-INBOX.md"], claim_pattern = "^claim", review_pattern = "^review", state_subdir = "machine-docs" }
phases = [
  { id = "lex", plan = "$PLANS/lex.md", status = "STATUS-lex.md" },
  { id = "parse", plan = "$PLANS/parse.md", status = "STATUS-parse.md" },
  { id = "eval", plan = "$PLANS/eval.md", status = "STATUS-eval.md" },
]
EOF
}
|
|
|
|
# Per-variant results keyed by variant name, filled in by run_variant and read back for the
# summary table. Associative arrays need bash 4 or newer.
declare -A SUM_TOK    # total tokens
declare -A SUM_OK     # YES / NO
declare -A SUM_PHASES # final phase-idx
|
|
|
|
# Run one prompt variant end to end: build the work repos, start the agent loops, wait for
# SEQUENCE-COMPLETE (or the timeout), tear down, re-check the Definition-of-Done, and record
# token usage.
#
# Globals:   RUNROOT, GIT_ID, AGENTS_PY, TIMEOUT, POLL, RESULTS (read)
#            SUM_OK, SUM_TOK, SUM_PHASES (written, keyed by variant)
# Arguments: $1 - variant name
# Outputs:   progress lines on stdout; one markdown section appended to "$RESULTS.tmp"
# Returns:   1 if the work repos could not be created (the variant is then not run), else 0
run_variant() {
  local v="$1"
  local run="$RUNROOT/$v"
  local rtag
  rtag=$(basename "$RUNROOT")
  rtag=${rtag##*-} # mktemp suffix → unique per invocation
  local prefix
  prefix="b${rtag}$(echo "$v" | cksum | cut -c1-2)-" # unique per (run, variant); avoids tmux collisions
  mkdir -p "$run"
  log "===== $v (run dir: $run, prefix: $prefix) ====="

  # shared bare 'origin' + two clones
  git "${GIT_ID[@]}" init -q --bare "$run/origin.git"
  git -C "$run/origin.git" symbolic-ref HEAD refs/heads/main # so clones check out 'main' (we push main, not master)
  git "${GIT_ID[@]}" init -q -b main "$run/seed"
  (
    cd "$run/seed" && mkdir -p machine-docs && echo "# calc work repo" > README.md \
      && : > machine-docs/.gitkeep \
      && git "${GIT_ID[@]}" add -A && git "${GIT_ID[@]}" commit -q -m "chore: seed work repo" \
      && git "${GIT_ID[@]}" remote add origin "$run/origin.git" && git "${GIT_ID[@]}" push -q -u origin main
  )
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
  git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"

  # Without both clones there is nothing for the loops to work in; fail fast instead of polling
  # for the whole timeout.
  if [ ! -d "$run/work/.git" ] || [ ! -d "$run/work-adv/.git" ]; then
    log "[$v] ERROR: work repo setup failed; skipping variant"
    SUM_OK[$v]=NO
    {
      echo "### $v"
      echo "- **success:** NO (work repo setup failed; variant not run)"
      echo
    } >>"$RESULTS.tmp"
    return 1
  fi

  local c
  for c in work work-adv; do
    ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench )
  done
  trust_dir "$run/work"; trust_dir "$run/work-adv" # let interactive claude skip the trust dialog

  gen_config "$v" "$run" "$prefix"

  log "[$v] agents.py up …"
  python3 "$AGENTS_PY" up --config "$run/agents.toml" >"$run/up.log" 2>&1 \
    || log "[$v] WARNING: agents.py up exited non-zero (see $run/up.log)"
  log "[$v] up done; status:"; python3 "$AGENTS_PY" status --config "$run/agents.toml" 2>&1 | sed 's/^/ /'

  # poll for SEQUENCE-COMPLETE or timeout
  local marker="$run/.ao-state/SEQUENCE-COMPLETE" t=0 complete="no"
  local idx commits
  while [ "$t" -lt "$TIMEOUT" ]; do
    if [ -f "$marker" ]; then complete="yes"; break; fi
    sleep "$POLL"; t=$((t+POLL))
    idx="$(cat "$run/.ao-state/state/phase-idx" 2>/dev/null || echo '?')"
    commits="$(git -C "$run/origin.git" rev-list --count main 2>/dev/null || echo '?')"
    log "[$v] t=${t}s phase-idx=$idx origin-commits=$commits"
  done
  # The loop leaves right after its last sleep without looking again; a marker written during
  # that final interval must still count as completed.
  if [ -f "$marker" ]; then complete="yes"; fi
  log "[$v] loop finished (sequence-complete=$complete after ${t}s); tearing down"
  python3 "$AGENTS_PY" down --config "$run/agents.toml" >"$run/down.log" 2>&1

  # final DoD check from the adversary's clone (pull latest first)
  ( cd "$run/work-adv" && git "${GIT_ID[@]}" pull -q --no-rebase origin main 2>/dev/null )
  # Prefer `python` as before, but fall back to `python3` (which the rest of this script already
  # requires) on hosts that ship no bare `python`.
  local py
  py="$(command -v python || command -v python3)"
  py="${py:-python}"
  local tests=no cli=no
  ( cd "$run/work-adv" && "$py" -m unittest -q ) >"$run/final-unittest.txt" 2>&1 && tests=yes
  local out
  out="$( cd "$run/work-adv" && "$py" calc.py '2+3*4' 2>/dev/null )"
  [ "$out" = "14" ] && cli=yes
  # Count distinct "<phase>/D<n>: PASS" gate verdicts. POSIX [[:space:]] instead of \s, matching
  # the veto pattern below; tr strips the padding some wc implementations add.
  local reviews
  reviews="$(grep -rhoiE '(lex|parse|eval)/D[0-9]+:?[[:space:]]*PASS' "$run/work-adv/machine-docs/" 2>/dev/null | sort -u | wc -l | tr -d ' ')"
  # a standing veto is the "## VETO" marker (per the prompts) — NOT the word "veto" (matches "No veto")
  local veto="no"
  grep -rqiE '##[[:space:]]*VETO' "$run/work-adv/machine-docs/" 2>/dev/null && veto=yes
  SUM_PHASES[$v]="$(cat "$run/.ao-state/state/phase-idx" 2>/dev/null || echo '?')"

  local success=NO
  [ "$complete" = yes ] && [ "$tests" = yes ] && [ "$cli" = yes ] && [ "$veto" = no ] && success=YES
  SUM_OK[$v]=$success

  # tokens (kept local; a collector that printed nothing counts as 0 rather than a blank field)
  local bi bo bcc bcr ai ao acc acr
  read -r bi bo bcc bcr <<<"$(collect_tokens "$run/work")"
  read -r ai ao acc acr <<<"$(collect_tokens "$run/work-adv")"
  bi=${bi:-0} bo=${bo:-0} bcc=${bcc:-0} bcr=${bcr:-0}
  ai=${ai:-0} ao=${ao:-0} acc=${acc:-0} acr=${acr:-0}
  local btok=$((bi+bo+bcc+bcr)) atok=$((ai+ao+acc+acr)) vtok=$(( bi+bo+bcc+bcr + ai+ao+acc+acr ))
  SUM_TOK[$v]=$vtok

  {
    echo "### $v"
    echo "- **success:** $success (sequence-complete=$complete, tests=$tests, cli('2+3*4'→'$out')=$cli, gates-passed=$reviews, veto=$veto, final phase-idx=${SUM_PHASES[$v]})"
    echo "- **builder loop:** in=$bi out=$bo cache_create=$bcc cache_read=$bcr → **${btok}** tok"
    echo "- **adversary loop:** in=$ai out=$ao cache_create=$acc cache_read=$acr → **${atok}** tok"
    echo "- **total:** **${vtok}** tokens"
    echo
  } >>"$RESULTS.tmp"
  log "[$v] DONE success=$success tokens=$vtok gates-passed=$reviews"
}
|
|
|
|
# Print the combined size, as counted by `wc -c`, of a variant's kickoff prompt plus one role prompt.
# Globals:   ENGINE (read)
# Arguments: $1 - variant name, $2 - role (file <role>.md in the variant's prompts dir)
prompt_chars() {
  local prompts="$ENGINE/examples/$1/prompts"
  cat "$prompts/kickoff.md" "$prompts/$2.md" | wc -c | tr -d ' '
}
|
|
|
|
# --- driver: validate, run every variant, then assemble the markdown report --------------------

# Variant names may come from the command line; a typo would otherwise run a full timeout against
# prompts that do not exist. Every variant must provide the kickoff prompt the config points at.
for v in "${VARIANTS[@]}"; do
  if [ ! -f "$ENGINE/examples/$v/prompts/kickoff.md" ]; then
    printf 'run-harness-bench: unknown variant "%s" (missing %s)\n' \
      "$v" "$ENGINE/examples/$v/prompts/kickoff.md" >&2
    exit 2
  fi
done

# Per-variant sections accumulate here and are spliced into the final report below.
: >"$RESULTS.tmp"
for v in "${VARIANTS[@]}"; do run_variant "$v"; done

{
  echo "# Full-harness benchmark — prompt variants"
  echo
  echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the"
  echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned"
  # "unknown" instead of an empty pin when the engine commit cannot be resolved
  echo "at \`$(git -C "$ENGINE" rev-parse --short HEAD 2>/dev/null || echo unknown)\`. Both loops on **$MODEL**. Per-variant timeout"
  echo "${TIMEOUT}s. Tokens summed from the Claude Code session transcripts of each loop's clone."
  echo
  echo "## Static prompt size (chars: kickoff + role)"
  echo "| version | builder | adversary |"
  echo "|---|--:|--:|"
  for v in "${VARIANTS[@]}"; do echo "| $v | $(prompt_chars "$v" builder) | $(prompt_chars "$v" adversary) |"; done
  echo
  echo "## Per-variant"
  echo
  cat "$RESULTS.tmp"
  echo "## Summary"
  echo "| version | success | total tokens |"
  echo "|---|:--:|--:|"
  for v in "${VARIANTS[@]}"; do echo "| $v | ${SUM_OK[$v]:-?} | ${SUM_TOK[$v]:-?} |"; done
  echo
  echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._"
  echo "_Run dirs: \`$RUNROOT\`_"
} >"$RESULTS"
rm -f "$RESULTS.tmp"

echo; echo "===== ALL DONE ====="
for v in "${VARIANTS[@]}"; do echo "$v: success=${SUM_OK[$v]:-?} tokens=${SUM_TOK[$v]:-?}"; done
echo "Results: $RESULTS"
echo "Run dirs: $RUNROOT"
|