feat: add stateless variant, pre-trust work dirs, loop over 3 variants
- bump engine submodule to 985d33d (adds builder-adversary-stateless example)
- run-harness-bench.sh: pre-trust each work dir in ~/.claude.json so interactive claude (tmux) skips the workspace-trust dialog (--dangerously-skip-permissions only skips it for redirected/headless output); benchmark all three variants
- (fixes from this session: bare repo default branch → main; unique session prefix per run)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2
engine
2
engine
Submodule engine updated: 737ef81066...985d33dd51
@ -20,6 +20,22 @@ RESULTS="$BENCH_DIR/RESULTS-harness.md"
|
|||||||
TIMEOUT="${BENCH_TIMEOUT:-3000}" # seconds per variant
|
TIMEOUT="${BENCH_TIMEOUT:-3000}" # seconds per variant
|
||||||
POLL=60
|
POLL=60
|
||||||
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
|
GIT_ID=(-c user.email=bench@example.com -c user.name=bench)
|
||||||
|
VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless)
|
||||||
|
|
||||||
|
# Pre-trust a work dir in ~/.claude.json so interactive claude (in tmux) skips the
# workspace-trust dialog (--dangerously-skip-permissions only skips it for
# redirected/headless output).
#
# Arguments: $1 - work dir path; used verbatim as the key under "projects".
# Returns:   python3's exit status (non-zero if the config cannot be parsed or written).
#
# Merge semantics: only this dir's entry is added/updated; everything else in the file is
# preserved, because the file is shared global state. The write itself is atomic (temp file
# in the same directory + rename), but the read-modify-write cycle is NOT locked, so a
# concurrent writer can still lose an update.
trust_dir() {
python3 - "$1" <<'PY'
import json, os, sys, tempfile

# Resolve symlinks so a symlinked config is updated in place rather than being
# replaced by a regular file when the temp file is renamed over it.
path = os.path.realpath(os.path.expanduser("~/.claude.json"))
work_dir = sys.argv[1]

try:
    with open(path, encoding="utf-8") as fh:
        cfg = json.load(fh)
except FileNotFoundError:
    # No config yet: start from an empty one instead of crashing.
    # A file that exists but is not valid JSON still raises, so it is never clobbered.
    cfg = {}

entry = cfg.setdefault("projects", {}).setdefault(work_dir, {})
entry["hasTrustDialogAccepted"] = True
entry.setdefault("allowedTools", [])  # keep an existing allow-list untouched
entry["hasCompletedProjectOnboarding"] = True

# Temp file lives next to the target so os.replace stays on one filesystem.
fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
try:
    # Close (and therefore flush) the temp file before renaming it into place.
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        json.dump(cfg, fh, indent=2)
    os.replace(tmp, path)
except BaseException:
    # Do not leave a stray temp file in the home directory on failure.
    try:
        os.unlink(tmp)
    except FileNotFoundError:
        pass
    raise
PY
}
|
||||||
|
|
||||||
# Write the arguments, joined by spaces, to stdout behind a "[HH:MM:SS]" UTC timestamp.
log() {
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
|
# Write the arguments, joined by spaces, to stdout behind a "[HH:MM:SS]" UTC timestamp.
log() {
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
|
||||||
|
|
||||||
@ -125,6 +141,7 @@ run_variant() {
|
|||||||
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
|
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work"
|
||||||
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
|
git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv"
|
||||||
for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done
|
for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done
|
||||||
|
trust_dir "$run/work"; trust_dir "$run/work-adv" # let interactive claude skip the trust dialog
|
||||||
|
|
||||||
gen_config "$v" "$run" "$prefix"
|
gen_config "$v" "$run" "$prefix"
|
||||||
|
|
||||||
@ -179,11 +196,10 @@ run_variant() {
|
|||||||
# Print the combined size of a variant's kickoff prompt plus one role prompt.
# Arguments: $1 - example/variant directory name under $ENGINE/examples
#            $2 - role prompt basename (without .md)
# Outputs:   the byte count reported by `wc -c`, with spaces stripped, on stdout.
prompt_chars() {
  local prompts="$ENGINE/examples/$1/prompts"
  cat "$prompts/kickoff.md" "$prompts/$2.md" \
    | wc -c \
    | tr -d ' '
}
|
# Print the combined size of a variant's kickoff prompt plus one role prompt.
# Arguments: $1 - example/variant directory name under $ENGINE/examples
#            $2 - role prompt basename (without .md)
# Outputs:   the byte count reported by `wc -c`, with spaces stripped, on stdout.
prompt_chars() {
  local prompts="$ENGINE/examples/$1/prompts"
  cat "$prompts/kickoff.md" "$prompts/$2.md" \
    | wc -c \
    | tr -d ' '
}
|
||||||
|
|
||||||
: >"$RESULTS.tmp"
|
: >"$RESULTS.tmp"
|
||||||
run_variant builder-adversary
|
for v in "${VARIANTS[@]}"; do run_variant "$v"; done
|
||||||
run_variant builder-adversary-min
|
|
||||||
|
|
||||||
{
|
{
|
||||||
echo "# Full-harness benchmark — original vs minimal prompts"
|
echo "# Full-harness benchmark — prompt variants"
|
||||||
echo
|
echo
|
||||||
echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the"
|
echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the"
|
||||||
echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned"
|
echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned"
|
||||||
@ -193,8 +209,7 @@ run_variant builder-adversary-min
|
|||||||
echo "## Static prompt size (chars: kickoff + role)"
|
echo "## Static prompt size (chars: kickoff + role)"
|
||||||
echo "| version | builder | adversary |"
|
echo "| version | builder | adversary |"
|
||||||
echo "|---|--:|--:|"
|
echo "|---|--:|--:|"
|
||||||
echo "| builder-adversary (orig) | $(prompt_chars builder-adversary builder) | $(prompt_chars builder-adversary adversary) |"
|
for v in "${VARIANTS[@]}"; do echo "| $v | $(prompt_chars "$v" builder) | $(prompt_chars "$v" adversary) |"; done
|
||||||
echo "| builder-adversary-min | $(prompt_chars builder-adversary-min builder) | $(prompt_chars builder-adversary-min adversary) |"
|
|
||||||
echo
|
echo
|
||||||
echo "## Per-variant"
|
echo "## Per-variant"
|
||||||
echo
|
echo
|
||||||
@ -202,8 +217,7 @@ run_variant builder-adversary-min
|
|||||||
echo "## Summary"
|
echo "## Summary"
|
||||||
echo "| version | success | total tokens |"
|
echo "| version | success | total tokens |"
|
||||||
echo "|---|:--:|--:|"
|
echo "|---|:--:|--:|"
|
||||||
echo "| builder-adversary (orig) | ${SUM_OK[builder-adversary]:-?} | ${SUM_TOK[builder-adversary]:-?} |"
|
for v in "${VARIANTS[@]}"; do echo "| $v | ${SUM_OK[$v]:-?} | ${SUM_TOK[$v]:-?} |"; done
|
||||||
echo "| builder-adversary-min | ${SUM_OK[builder-adversary-min]:-?} | ${SUM_TOK[builder-adversary-min]:-?} |"
|
|
||||||
echo
|
echo
|
||||||
echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._"
|
echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._"
|
||||||
echo "_Run dirs: \`$RUNROOT\`_"
|
echo "_Run dirs: \`$RUNROOT\`_"
|
||||||
@ -211,7 +225,6 @@ run_variant builder-adversary-min
|
|||||||
rm -f "$RESULTS.tmp"
|
rm -f "$RESULTS.tmp"
|
||||||
|
|
||||||
echo; echo "===== ALL DONE ====="
|
echo; echo "===== ALL DONE ====="
|
||||||
echo "orig: success=${SUM_OK[builder-adversary]:-?} tokens=${SUM_TOK[builder-adversary]:-?}"
|
for v in "${VARIANTS[@]}"; do echo "$v: success=${SUM_OK[$v]:-?} tokens=${SUM_TOK[$v]:-?}"; done
|
||||||
echo "min : success=${SUM_OK[builder-adversary-min]:-?} tokens=${SUM_TOK[builder-adversary-min]:-?}"
|
|
||||||
echo "Results: $RESULTS"
|
echo "Results: $RESULTS"
|
||||||
echo "Run dirs: $RUNROOT"
|
echo "Run dirs: $RUNROOT"
|
||||||
|
|||||||
Reference in New Issue
Block a user