feat: add stateless variant, pre-trust work dirs, loop over 3 variants

- bump engine submodule to 985d33d (adds builder-adversary-stateless example)
- run-harness-bench.sh: pre-trust each work dir in ~/.claude.json so interactive
  claude (tmux) skips the workspace-trust dialog (--dangerously-skip-permissions
  only skips it for redirected/headless output); benchmark all three variants
- (fixes from this session: bare repo default branch → main; unique session
  prefix per run)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-14 20:52:29 +00:00
parent 11eda4a8b1
commit a1b59e1bc5
2 changed files with 23 additions and 10 deletions

2
engine

Submodule engine updated: 737ef81066...985d33dd51

View File

@ -20,6 +20,22 @@ RESULTS="$BENCH_DIR/RESULTS-harness.md"
TIMEOUT="${BENCH_TIMEOUT:-3000}" # seconds per variant; override with the BENCH_TIMEOUT env var
# NOTE(review): presumably a polling interval in seconds — its use is not visible here; confirm.
POLL=60
# git -c options supplying a fixed bench identity; spliced in as: git "${GIT_ID[@]}" <subcommand>
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
# Engine example variants to benchmark: each name is passed to run_variant and gets one row in
# the prompt-size and summary tables.
VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless)
# Pre-trust a work dir in ~/.claude.json so interactive claude (in tmux) skips the workspace-trust
# dialog (--dangerously-skip-permissions only skips it for redirected/headless output).
# Merge, don't overwrite: only this dir's entry under "projects" is added/updated and everything
# else is preserved (the file is shared global state). The result is written to a temp file in the
# same directory and renamed over the original, so readers never observe a half-written file.
# NOTE: the read-modify-write itself is not locked; a concurrent writer can still lose an update.
# Arguments: $1 - work dir path, used verbatim as the key under "projects"
# Returns:   0 on success; 2 if $1 is missing or empty; python3's non-zero status otherwise
trust_dir() {
  if [[ -z "${1:-}" ]]; then
    echo "trust_dir: missing work dir argument" >&2
    return 2
  fi
  python3 - "$1" <<'PY'
import json, os, stat, sys, tempfile

path = os.path.expanduser("~/.claude.json")
work_dir = sys.argv[1]

# A missing config starts from an empty one instead of crashing; a file that exists but does not
# parse still raises, so it is never clobbered.
try:
    with open(path, encoding="utf-8") as fh:
        cfg = json.load(fh)
    mode = stat.S_IMODE(os.stat(path).st_mode)
except FileNotFoundError:
    cfg, mode = {}, None

entry = cfg.setdefault("projects", {}).setdefault(work_dir, {})
entry["hasTrustDialogAccepted"] = True
entry.setdefault("allowedTools", [])  # keep an existing allow-list untouched
entry["hasCompletedProjectOnboarding"] = True

fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
try:
    # Close (and therefore flush) the temp file before it is renamed into place.
    with os.fdopen(fd, "w") as fh:
        json.dump(cfg, fh, indent=2)
    if mode is not None:
        os.chmod(tmp, mode)  # mkstemp creates 0600; keep the existing file's permissions
    os.replace(tmp, path)
except BaseException:
    # Do not leave a stray temp file next to the config when anything above fails.
    try:
        os.unlink(tmp)
    except OSError:
        pass
    raise
PY
}
# Print the arguments on one line, prefixed with the current UTC time as [HH:MM:SS].
log() {
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
@ -125,6 +141,7 @@ run_variant() {
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done
trust_dir "$run/work"; trust_dir "$run/work-adv" # let interactive claude skip the trust dialog
gen_config "$v" "$run" "$prefix"
@ -179,11 +196,10 @@ run_variant() {
# Print the combined size of a variant's kickoff prompt plus one role prompt.
# The size comes from `wc -c` (a byte count), with wc's padding spaces stripped.
# Globals:   ENGINE (read) - engine checkout containing examples/<variant>/prompts/
# Arguments: $1 - variant (example directory name); $2 - role prompt name without the .md suffix
prompt_chars() {
  local prompts_dir="$ENGINE/examples/$1/prompts"
  cat "$prompts_dir/kickoff.md" "$prompts_dir/$2.md" \
    | wc -c \
    | tr -d ' '
}
: >"$RESULTS.tmp"
run_variant builder-adversary
run_variant builder-adversary-min
for v in "${VARIANTS[@]}"; do run_variant "$v"; done
{
echo "# Full-harness benchmark — original vs minimal prompts"
echo "# Full-harness benchmark — prompt variants"
echo
echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the"
echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned"
@ -193,8 +209,7 @@ run_variant builder-adversary-min
echo "## Static prompt size (chars: kickoff + role)"
echo "| version | builder | adversary |"
echo "|---|--:|--:|"
echo "| builder-adversary (orig) | $(prompt_chars builder-adversary builder) | $(prompt_chars builder-adversary adversary) |"
echo "| builder-adversary-min | $(prompt_chars builder-adversary-min builder) | $(prompt_chars builder-adversary-min adversary) |"
for v in "${VARIANTS[@]}"; do echo "| $v | $(prompt_chars "$v" builder) | $(prompt_chars "$v" adversary) |"; done
echo
echo "## Per-variant"
echo
@ -202,8 +217,7 @@ run_variant builder-adversary-min
echo "## Summary"
echo "| version | success | total tokens |"
echo "|---|:--:|--:|"
echo "| builder-adversary (orig) | ${SUM_OK[builder-adversary]:-?} | ${SUM_TOK[builder-adversary]:-?} |"
echo "| builder-adversary-min | ${SUM_OK[builder-adversary-min]:-?} | ${SUM_TOK[builder-adversary-min]:-?} |"
for v in "${VARIANTS[@]}"; do echo "| $v | ${SUM_OK[$v]:-?} | ${SUM_TOK[$v]:-?} |"; done
echo
echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._"
echo "_Run dirs: \`$RUNROOT\`_"
@ -211,7 +225,6 @@ run_variant builder-adversary-min
rm -f "$RESULTS.tmp"
echo; echo "===== ALL DONE ====="
echo "orig: success=${SUM_OK[builder-adversary]:-?} tokens=${SUM_TOK[builder-adversary]:-?}"
echo "min : success=${SUM_OK[builder-adversary-min]:-?} tokens=${SUM_TOK[builder-adversary-min]:-?}"
for v in "${VARIANTS[@]}"; do echo "$v: success=${SUM_OK[$v]:-?} tokens=${SUM_TOK[$v]:-?}"; done
echo "Results: $RESULTS"
echo "Run dirs: $RUNROOT"