# Patch summary (free-form commentary; tools that apply patches skip text before the first "diff --git" header).
#
# What this patch does:
#   * engine: moves the submodule pointer from commit 737ef81 to 985d33d (mode 160000 gitlink).
#   * run-harness-bench.sh:
#       - adds a VARIANTS array (builder-adversary, builder-adversary-min, builder-adversary-stateless)
#         and replaces the hard-coded per-variant `run_variant` / `echo` lines with loops over it.
#         Visible output change: table rows and the final summary are now labelled with the raw
#         variant name (previously "builder-adversary (orig)", "orig:", "min :").
#       - adds trust_dir(), which pipes an inline python3 script that loads ~/.claude.json, sets
#         hasTrustDialogAccepted and hasCompletedProjectOnboarding (and defaults allowedTools to [])
#         on projects[<dir>], writes the result to a mkstemp() file in the same directory and
#         os.replace()s it over the original.
#       - run_variant() now calls trust_dir for "$run/work" and "$run/work-adv" right after the clones.
#
# NOTE(review): points to confirm before merging
#   * json.load(open(p)) has no fallback: a missing or non-JSON ~/.claude.json makes the python3
#     step raise. Whether that aborts the benchmark depends on shell options (e.g. `set -e`) that
#     are not visible in this patch.
#   * Both open() calls hand the file object straight to json.load / json.dump without `with` or an
#     explicit close(); the os.replace() therefore relies on the interpreter having flushed and
#     closed the temp file by then. Consider `with open(...)` to make that explicit.
#   * os.replace() makes the final write atomic, but the load -> modify -> replace sequence is not
#     locked, so a concurrent writer of ~/.claude.json could presumably still lose an update.
#   * tempfile.mkstemp() creates the file accessible only by the creating user, so the replaced
#     ~/.claude.json takes on those permissions; a failed json.dump would also leave the temp file
#     behind in the home directory.
#   * The projects key is the path string exactly as passed ("$run/work"); presumably the consumer
#     expects an absolute path -- verify how $run is built.
#   * The line breaks of this patch appear to have been collapsed (each hunk's body sits on the same
#     physical line as its header); restore them from the original commit before applying.
#
diff --git a/engine b/engine index 737ef81..985d33d 160000 --- a/engine +++ b/engine @@ -1 +1 @@ -Subproject commit 737ef810666a619a4555c20bac27ebcdc734d3b1 +Subproject commit 985d33dd519f2f52ae45778a932c983893a92167 diff --git a/run-harness-bench.sh b/run-harness-bench.sh index cf18bb2..1cf8d48 100755 --- a/run-harness-bench.sh +++ b/run-harness-bench.sh @@ -20,6 +20,22 @@ RESULTS="$BENCH_DIR/RESULTS-harness.md" TIMEOUT="${BENCH_TIMEOUT:-3000}" # seconds per variant POLL=60 GIT_ID=(-c user.email=bench@example.com -c user.name=bench) +VARIANTS=(builder-adversary builder-adversary-min builder-adversary-stateless) + +# pre-trust a work dir in ~/.claude.json so interactive claude (in tmux) skips the workspace-trust +# dialog (--dangerously-skip-permissions only skips it for redirected/headless output). Atomic merge: +# add only this dir's entry, preserve everything else (the file is shared global state). +trust_dir() { + python3 - "$1" <<'PY' +import json,os,sys,tempfile +p=os.path.expanduser("~/.claude.json"); d=sys.argv[1] +cfg=json.load(open(p)) +e=cfg.setdefault("projects",{}).setdefault(d,{}) +e["hasTrustDialogAccepted"]=True; e.setdefault("allowedTools",[]); e["hasCompletedProjectOnboarding"]=True +fd,tmp=tempfile.mkstemp(dir=os.path.dirname(p)); os.close(fd) +json.dump(cfg,open(tmp,"w"),indent=2); os.replace(tmp,p) +PY +} log() { echo "[$(date -u +%H:%M:%S)] $*"; } @@ -125,6 +141,7 @@ run_variant() { git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work" git "${GIT_ID[@]}" clone -q "$run/origin.git" "$run/work-adv" for c in work work-adv; do ( cd "$run/$c" && git config user.email bench@example.com && git config user.name bench ); done + trust_dir "$run/work"; trust_dir "$run/work-adv" # let interactive claude skip the trust dialog gen_config "$v" "$run" "$prefix" @@ -179,11 +196,10 @@ run_variant() { prompt_chars() { cat "$ENGINE/examples/$1/prompts/kickoff.md" "$ENGINE/examples/$1/prompts/$2.md" | wc -c | tr -d ' '; } : >"$RESULTS.tmp" -run_variant 
builder-adversary -run_variant builder-adversary-min +for v in "${VARIANTS[@]}"; do run_variant "$v"; done { - echo "# Full-harness benchmark — original vs minimal prompts" + echo "# Full-harness benchmark — prompt variants" echo echo "Real \`agents.py up\` Builder/Adversary loop pair + watchdog, run autonomously through the" echo "multi-phase calculator (\`plans/calc/{lex,parse,eval}.md\`) to SEQUENCE-COMPLETE. Engine pinned" @@ -193,8 +209,7 @@ run_variant builder-adversary-min echo "## Static prompt size (chars: kickoff + role)" echo "| version | builder | adversary |" echo "|---|--:|--:|" - echo "| builder-adversary (orig) | $(prompt_chars builder-adversary builder) | $(prompt_chars builder-adversary adversary) |" - echo "| builder-adversary-min | $(prompt_chars builder-adversary-min builder) | $(prompt_chars builder-adversary-min adversary) |" + for v in "${VARIANTS[@]}"; do echo "| $v | $(prompt_chars "$v" builder) | $(prompt_chars "$v" adversary) |"; done echo echo "## Per-variant" echo @@ -202,8 +217,7 @@ run_variant builder-adversary-min echo "## Summary" echo "| version | success | total tokens |" echo "|---|:--:|--:|" - echo "| builder-adversary (orig) | ${SUM_OK[builder-adversary]:-?} | ${SUM_TOK[builder-adversary]:-?} |" - echo "| builder-adversary-min | ${SUM_OK[builder-adversary-min]:-?} | ${SUM_TOK[builder-adversary-min]:-?} |" + for v in "${VARIANTS[@]}"; do echo "| $v | ${SUM_OK[$v]:-?} | ${SUM_TOK[$v]:-?} |"; done echo echo "_N=1 per variant; the autonomous loop is nondeterministic (number of review rounds varies)._" echo "_Run dirs: \`$RUNROOT\`_" @@ -211,7 +225,6 @@ run_variant builder-adversary-min rm -f "$RESULTS.tmp" echo; echo "===== ALL DONE =====" -echo "orig: success=${SUM_OK[builder-adversary]:-?} tokens=${SUM_TOK[builder-adversary]:-?}" -echo "min : success=${SUM_OK[builder-adversary-min]:-?} tokens=${SUM_TOK[builder-adversary-min]:-?}" +for v in "${VARIANTS[@]}"; do echo "$v: success=${SUM_OK[$v]:-?} tokens=${SUM_TOK[$v]:-?}"; done 
echo "Results: $RESULTS" echo "Run dirs: $RUNROOT"