feat: add stateless variant, pre-trust work dirs, loop over 3 variants
- bump engine submodule to 985d33d (adds the builder-adversary-stateless example)
- run-harness-bench.sh: pre-trust each work dir in ~/.claude.json so that interactive claude (in tmux) skips the workspace-trust dialog (--dangerously-skip-permissions only skips it for redirected/headless output); benchmark all three variants
- fixes from this session: bare repo default branch → main; unique session prefix per run

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2
engine
2
engine
Submodule engine updated: 737ef81066...985d33dd51
@ -20,6 +20,22 @@ RESULTS="$BENCH_DIR/RESULTS-harness.md"
|
||||
# Per-variant time budget; defaults to 3000 and can be overridden via the BENCH_TIMEOUT env var.
TIMEOUT="${BENCH_TIMEOUT:-3000}" # seconds per variant
|
||||
# NOTE(review): presumably the polling interval in seconds — its use is not visible here; confirm.
POLL=60
|
||||
# Fixed git identity, kept as an array of `-c` overrides so it can be expanded into git invocations
# (e.g. `git "${GIT_ID[@]}" clone ...`).
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
|
||||
# Example variants to benchmark; iterated to call run_variant and to emit one row per variant
# in the results tables.
VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless)
|
||||
|
||||
# Pre-trust a work dir in ~/.claude.json so interactive claude (in tmux) skips the workspace-trust
# dialog (--dangerously-skip-permissions only skips it for redirected/headless output).
#
# The file is shared global state, so only this dir's entry under "projects" is added/updated and
# everything else is preserved. The result is written to a temp file in the same directory and then
# renamed over the original, so a reader never observes a partially written config.
#
# Arguments: $1 - work dir path, used verbatim as the key under "projects"
# Returns:   python3's exit status (non-zero e.g. when the existing config is not valid JSON;
#            in that case the existing file is left untouched)
trust_dir() {
  python3 - "$1" <<'PY'
import json, os, sys, tempfile

path = os.path.expanduser("~/.claude.json")
work_dir = sys.argv[1]

# A missing config (fresh machine / CI) is treated as an empty one instead of crashing.
# Invalid JSON still raises: silently overwriting shared state would destroy user data.
try:
    with open(path, encoding="utf-8") as fh:
        cfg = json.load(fh)
except FileNotFoundError:
    cfg = {}

entry = cfg.setdefault("projects", {}).setdefault(work_dir, {})
entry["hasTrustDialogAccepted"] = True
entry.setdefault("allowedTools", [])  # keep an existing allow-list, only default it
entry["hasCompletedProjectOnboarding"] = True

# Temp file must live in the same directory so os.replace() is a same-filesystem rename.
fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
try:
    # Close (and therefore flush) the file before it is renamed into place.
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        json.dump(cfg, fh, indent=2)
    os.replace(tmp, path)
except BaseException:
    # Do not leave stray temp files behind in $HOME on failure.
    try:
        os.unlink(tmp)
    except OSError:
        pass
    raise
PY
}
|
||||
|
||||
# Print the given message on stdout, prefixed with the current UTC time as [HH:MM:SS].
# Arguments: $* - message words, joined into a single line
log() {
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
|
||||
|
||||
@ -125,6 +141,7 @@ run_variant() {
|
||||
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
|
||||
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
|
||||
for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done
|
||||
trust_dir "$run/work"; trust_dir "$run/work-adv" # let interactive claude skip the trust dialog
|
||||
|
||||
gen_config "$v" "$run" "$prefix"
|
||||
|
||||
@ -179,11 +196,10 @@ run_variant() {
|
||||
# Print the combined size (as counted by `wc -c`) of a variant's kickoff prompt plus one role prompt.
# Globals:   ENGINE (read) - root directory containing examples/
# Arguments: $1 - example/variant directory name under $ENGINE/examples
#            $2 - role prompt basename, without the .md extension
# Outputs:   the count on stdout, with spaces stripped
prompt_chars() {
  local prompts_dir="$ENGINE/examples/$1/prompts"
  cat "$prompts_dir/kickoff.md" "$prompts_dir/$2.md" \
    | wc -c \
    | tr -d ' '
}
|
||||
|
||||
: >"$RESULTS.tmp"
|
||||
run_variant builder-adversary
|
||||
run_variant builder-adversary-min
|
||||
for v in "${VARIANTS[@]}"; do run_variant "$v"; done
|
||||
|
||||
{
|
||||
echo "# Full-harness benchmark — original vs minimal prompts"
|
||||
echo "# Full-harness benchmark — prompt variants"
|
||||
echo
|
||||
echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the"
|
||||
echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned"
|
||||
@ -193,8 +209,7 @@ run_variant builder-adversary-min
|
||||
echo "## Static prompt size (chars: kickoff + role)"
|
||||
echo "| version | builder | adversary |"
|
||||
echo "|---|--:|--:|"
|
||||
echo "| builder-adversary (orig) | $(prompt_chars builder-adversary builder) | $(prompt_chars builder-adversary adversary) |"
|
||||
echo "| builder-adversary-min | $(prompt_chars builder-adversary-min builder) | $(prompt_chars builder-adversary-min adversary) |"
|
||||
for v in "${VARIANTS[@]}"; do echo "| $v | $(prompt_chars "$v" builder) | $(prompt_chars "$v" adversary) |"; done
|
||||
echo
|
||||
echo "## Per-variant"
|
||||
echo
|
||||
@ -202,8 +217,7 @@ run_variant builder-adversary-min
|
||||
echo "## Summary"
|
||||
echo "| version | success | total tokens |"
|
||||
echo "|---|:--:|--:|"
|
||||
echo "| builder-adversary (orig) | ${SUM_OK[builder-adversary]:-?} | ${SUM_TOK[builder-adversary]:-?} |"
|
||||
echo "| builder-adversary-min | ${SUM_OK[builder-adversary-min]:-?} | ${SUM_TOK[builder-adversary-min]:-?} |"
|
||||
for v in "${VARIANTS[@]}"; do echo "| $v | ${SUM_OK[$v]:-?} | ${SUM_TOK[$v]:-?} |"; done
|
||||
echo
|
||||
echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._"
|
||||
echo "_Run dirs: \`$RUNROOT\`_"
|
||||
@ -211,7 +225,6 @@ run_variant builder-adversary-min
|
||||
rm -f "$RESULTS.tmp"
|
||||
|
||||
echo; echo "===== ALL DONE ====="
|
||||
echo "orig: success=${SUM_OK[builder-adversary]:-?} tokens=${SUM_TOK[builder-adversary]:-?}"
|
||||
echo "min : success=${SUM_OK[builder-adversary-min]:-?} tokens=${SUM_TOK[builder-adversary-min]:-?}"
|
||||
for v in "${VARIANTS[@]}"; do echo "$v: success=${SUM_OK[$v]:-?} tokens=${SUM_TOK[$v]:-?}"; done
|
||||
echo "Results: $RESULTS"
|
||||
echo "Run dirs: $RUNROOT"
|
||||
|
||||
Reference in New Issue
Block a user