#!/usr/bin/env bash
# agent-orchestrator-benchmark — head-to-head prompt comparison.
#
# Compares two example variants that ship in the vendored engine (engine/examples/):
#   • builder-adversary      (original prompts)
#   • builder-adversary-min  (minimal prompts)
# on the same task, INDEPENDENTLY (no shared context), both on Sonnet. Confirms each succeeds and
# clocks the tokens each uses.
#
# Methodology (controlled, reproducible — not the full self-paced watchdog loop):
#   For each version we assemble EXACTLY the prompt the harness would send a loop agent
#   (kickoff_template with {phase_id}/{plan}/{status}/{role} substituted, then the role prompt),
#   and drive one Builder pass then one Adversary pass as separate headless `claude -p` sessions
#   (fresh context each → "no shared context"). The Builder builds+commits in its own repo; the
#   Adversary cold-verifies from its OWN clone. We then independently re-run the DoD check and read
#   the Adversary's verdict. Tokens come from `claude -p --output-format json`.
#
# A short identical DRIVER NOTE is appended to BOTH versions' prompts so the agents finish in one
# headless session (no /loop, no waiting). Being identical, it doesn't bias the comparison.
#
# Usage: ./run-bench.sh   (writes RESULTS.md here; work dirs under a temp dir)
#
# Requires: bash >= 4 (associative arrays), git, python3, timeout, claude.

# Deliberately no `set -e`: several steps (the agents, the unittest run, the CLI probe) are
# EXPECTED to be able to fail and are recorded as results. Steps whose failure would make the
# whole run meaningless are checked explicitly with `|| die`.
set -u

# die MESSAGE... — print a diagnostic to stderr and abort the whole benchmark.
die() {
  printf 'run-bench: %s\n' "$*" >&2
  exit 1
}

# warn MESSAGE... — print a non-fatal diagnostic to stderr.
warn() {
  printf 'run-bench: warning: %s\n' "$*" >&2
}

# `declare -A` below needs bash 4+; fail with a clear message instead of a syntax-ish error.
(( BASH_VERSINFO[0] >= 4 )) || die "bash >= 4 is required (associative arrays)"

BENCH_DIR="$(cd "$(dirname "$0")" && pwd)" || die "cannot resolve the script directory"
ENGINE_EX="$BENCH_DIR/engine/examples"
PLAN_SRC="$BENCH_DIR/plans/roman.md"
MODEL="claude-sonnet-4-6"
RESULTS="$BENCH_DIR/RESULTS.md"
RESULTS_TMP="$RESULTS.tmp"
RUNROOT=""            # created in main, after the prerequisite checks
AGENT_TIMEOUT=600     # seconds allowed per headless agent session
BENCH_GIT_EMAIL="bench@example.com"
BENCH_GIT_NAME="bench"
GIT_ID=(-c "user.email=$BENCH_GIT_EMAIL" -c "user.name=$BENCH_GIT_NAME")
VERSIONS=(builder-adversary builder-adversary-min)

DRIVER_NOTE='[HEADLESS BENCH — single session, NO loop] Do the ENTIRE phase now, in THIS one session: implement the code and tests, run them, then `git add -A && git commit -m "..."`. Builder: also write machine-docs/STATUS-roman.md and make a claim commit (prefix `claim(`). Adversary: cold-verify and write machine-docs/REVIEW-roman.md with `roman: PASS` or `roman: FAIL`, then commit (prefix `review(`). Do NOT invoke /loop, do NOT ScheduleWakeup, do NOT wait — finish and exit. No git remote exists; commit locally only (do not push).'

# Per-version totals, keyed by version name; filled by run_version, read by write_results.
declare -A SUM_TOK SUM_COST SUM_OK

# check_prereqs — abort early if a required tool or input file is missing, so we never write a
# RESULTS.md full of zeros/NOs that merely reflects a broken environment.
check_prereqs() {
  local tool version part
  for tool in git python3 timeout claude; do
    command -v "$tool" >/dev/null 2>&1 || die "required command not found: $tool"
  done
  [[ -f "$PLAN_SRC" ]] || die "plan not found: $PLAN_SRC"
  for version in "${VERSIONS[@]}"; do
    for part in kickoff builder adversary; do
      [[ -f "$ENGINE_EX/$version/prompts/$part.md" ]] \
        || die "prompt not found: $ENGINE_EX/$version/prompts/$part.md"
    done
  done
}

# assemble_prompt VERSION ROLE PLANREL -> stdout
# Emits the kickoff template of VERSION with its {phase_id}/{plan}/{status}/{role} placeholders
# substituted, then the ROLE prompt, then the shared DRIVER_NOTE.
# PLANREL and ROLE are spliced into sed replacement text, so they must not contain `#`, `/`
# or `&`; the callers in this script only pass fixed literals.
assemble_prompt() {
  local version="$1" role="$2" planrel="$3"
  sed -e "s/{phase_id}/roman/g" \
      -e "s#{plan}#$planrel#g" \
      -e "s/{status}/STATUS-roman.md/g" \
      -e "s/{role}/$role/g" \
      "$ENGINE_EX/$version/prompts/kickoff.md"
  printf '\n'
  cat "$ENGINE_EX/$version/prompts/$role.md"
  printf '\n\n%s\n' "$DRIVER_NOTE"
}

# usage_of JSON_FILE -> "in out cache_create cache_read cost turns" on stdout
# Reads the JSON result file of one `claude -p --output-format json` run. Anything unreadable,
# missing, null or non-numeric is reported as 0, so the caller's shell arithmetic always
# receives numbers (a bare `None` would be an unbound variable under `set -u`).
usage_of() {
  python3 - "$1" <<'PY' || printf '0 0 0 0 0 0\n'
import json
import math
import sys


def as_int(value):
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


def as_cost(value):
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0
    return value if math.isfinite(value) else 0


try:
    with open(sys.argv[1]) as fh:
        doc = json.load(fh)
except Exception:
    doc = {}
if not isinstance(doc, dict):
    doc = {}
usage = doc.get("usage")
if not isinstance(usage, dict):
    usage = {}

print(
    as_int(usage.get("input_tokens")),
    as_int(usage.get("output_tokens")),
    as_int(usage.get("cache_creation_input_tokens")),
    as_int(usage.get("cache_read_input_tokens")),
    as_cost(doc.get("total_cost_usd")),
    as_int(doc.get("num_turns")),
)
PY
}

# set_repo_identity REPO_DIR
# Stores the bench identity in the repo's local config. The `-c user.*` flags in GIT_ID only
# apply to the single git invocation they are passed to, so without this the agents' own
# `git commit` calls depend on the host having a global git identity.
set_repo_identity() {
  local repo="$1"
  git -C "$repo" config user.email "$BENCH_GIT_EMAIL" \
    && git -C "$repo" config user.name "$BENCH_GIT_NAME" \
    || die "cannot configure git identity in $repo"
}

# run_agent VERSION ROLE WORKDIR OUTBASE
# Runs one headless agent session for ROLE inside WORKDIR. stdout (the JSON result) goes to
# OUTBASE.json and stderr to OUTBASE.err. A failing or timed-out agent is reported on stderr
# but is not fatal: it is a benchmark outcome, picked up by the later success check.
run_agent() {
  local version="$1" role="$2" workdir="$3" outbase="$4"
  local prompt rc=0
  prompt="$(assemble_prompt "$version" "$role" plans/roman.md)"
  (
    cd "$workdir" && timeout "$AGENT_TIMEOUT" claude -p "$prompt" \
      --output-format json --model "$MODEL" --dangerously-skip-permissions
  ) >"$outbase.json" 2>"$outbase.err" || rc=$?
  if (( rc == 124 )); then
    warn "[$version] $role timed out after ${AGENT_TIMEOUT}s (see $outbase.err)"
  elif (( rc != 0 )); then
    warn "[$version] $role exited with status $rc (see $outbase.err)"
  fi
  return 0
}

# run_version VERSION
# Drives one Builder pass and one Adversary pass for VERSION, re-runs the DoD check in the
# Adversary's clone, tallies token usage, records the totals in SUM_TOK/SUM_COST/SUM_OK and
# appends a per-version section to $RESULTS_TMP.
run_version() {
  local version="$1"
  local W="$RUNROOT/$version"
  local A="$RUNROOT/$version-adv"
  echo "===== $version ====="

  # --- Builder: its own work repo ---
  mkdir -p "$W/plans" || die "cannot create $W/plans"
  cp -- "$PLAN_SRC" "$W/plans/roman.md" || die "cannot copy plan into $W"
  git "${GIT_ID[@]}" -C "$W" init -q || die "git init failed in $W"
  set_repo_identity "$W"
  if ! { git "${GIT_ID[@]}" -C "$W" add -A \
         && git "${GIT_ID[@]}" -C "$W" commit -q -m "chore: seed plan"; }; then
    die "cannot create the seed commit in $W"
  fi
  echo "[$version] builder running…"
  run_agent "$version" builder "$W" "$RUNROOT/$version.builder"

  # --- Adversary: its OWN clone, cold ---
  git "${GIT_ID[@]}" clone -q "$W" "$A" || die "cannot clone $W to $A"
  set_repo_identity "$A"
  echo "[$version] adversary running…"
  run_agent "$version" adversary "$A" "$RUNROOT/$version.adv"

  # --- independent success check (re-run DoD ourselves, in the adversary's clone) ---
  local tests_ok="no" cli_ok="no" verdict="missing"
  local unittest_log="$RUNROOT/$version.unittest.txt"
  # Exit status 0 alone is not enough: before Python 3.12 unittest also exits 0 when it
  # discovered no tests at all, so reject a "Ran 0 tests" run explicitly.
  if ( cd "$A" && python3 -m unittest -q ) >"$unittest_log" 2>&1 \
     && ! grep -q '^Ran 0 tests' "$unittest_log"; then
    tests_ok="yes"
  fi
  local out
  # stderr is discarded on purpose: only the exact stdout value decides cli_ok.
  out="$( cd "$A" && python3 roman.py 1994 2>/dev/null )"
  if [[ "$out" == "MCMXCIV" ]]; then
    cli_ok="yes"
  fi
  local review="$A/machine-docs/REVIEW-roman.md"
  if [[ -f "$review" ]]; then
    if grep -qiE 'roman:?[[:space:]]*PASS' "$review" && ! grep -qi 'VETO' "$review"; then
      verdict="PASS"
    else
      verdict="FAIL/none"
    fi
  fi
  local success="NO"
  if [[ "$tests_ok" == yes && "$cli_ok" == yes && "$verdict" == PASS ]]; then
    success="YES"
  fi

  # --- tally tokens ---
  local bi bo bcc bcr bcost bturns
  local ai ao acc acr acost aturns
  read -r bi bo bcc bcr bcost bturns <<<"$(usage_of "$RUNROOT/$version.builder.json")"
  read -r ai ao acc acr acost aturns <<<"$(usage_of "$RUNROOT/$version.adv.json")"
  local btok=$(( ${bi:-0} + ${bo:-0} + ${bcc:-0} + ${bcr:-0} ))
  local atok=$(( ${ai:-0} + ${ao:-0} + ${acc:-0} + ${acr:-0} ))
  local vtok=$(( btok + atok ))
  local vcost
  # Costs are passed as arguments, never spliced into the Python source text.
  vcost="$(python3 -c 'import sys; print(round(float(sys.argv[1]) + float(sys.argv[2]), 4))' \
    "${bcost:-0}" "${acost:-0}")" || vcost=0

  SUM_TOK[$version]=$vtok
  SUM_COST[$version]=$vcost
  SUM_OK[$version]=$success

  {
    echo "### $version"
    echo "- **success:** $success  (tests=$tests_ok, cli=$cli_ok, adversary-verdict=$verdict)"
    echo "- **builder:**   in=$bi out=$bo cache_create=$bcc cache_read=$bcr → ${btok} tok, \$${bcost}, turns=$bturns"
    echo "- **adversary:** in=$ai out=$ao cache_create=$acc cache_read=$acr → ${atok} tok, \$${acost}, turns=$aturns"
    echo "- **total:** ${vtok} tokens, \$${vcost}"
    echo
  } >>"$RESULTS_TMP"
}

# prompt_chars VERSION ROLE -> stdout
# Static prompt size of kickoff + role prompt for VERSION. Note that `wc -c` counts BYTES, so
# non-ASCII characters in the prompts count more than once.
prompt_chars() {
  cat "$ENGINE_EX/$1/prompts/kickoff.md" "$ENGINE_EX/$1/prompts/$2.md" | wc -c | tr -d ' '
}

# write_results — render RESULTS.md from the per-version sections in $RESULTS_TMP and the
# SUM_* totals.
write_results() {
  local engine_rev
  engine_rev="$(git -C "$BENCH_DIR/engine" rev-parse --short HEAD 2>/dev/null)" \
    || engine_rev="unknown"
  {
    echo "# Benchmark results — original vs minimal prompts"
    echo
    echo "Engine pinned at: \`$engine_rev\`. Task:"
    echo "\`plans/roman.md\` (integer → Roman numeral). Model: **$MODEL** for Builder and Adversary in"
    echo "both versions. Runs are independent (separate headless \`claude -p\` sessions, no shared"
    echo "context). Methodology + caveats: see \`run-bench.sh\` header and the note below."
    echo
    echo "## Static prompt size (chars: kickoff + role, what gets sent each kickoff)"
    echo
    echo "| version | builder prompt | adversary prompt |"
    echo "|---|--:|--:|"
    echo "| builder-adversary (orig) | $(prompt_chars builder-adversary builder) | $(prompt_chars builder-adversary adversary) |"
    echo "| builder-adversary-min | $(prompt_chars builder-adversary-min builder) | $(prompt_chars builder-adversary-min adversary) |"
    echo
    echo "## Per-run tokens & cost"
    echo
    cat "$RESULTS_TMP"
    echo "## Summary"
    echo
    echo "| version | success | total tokens | total cost |"
    echo "|---|:--:|--:|--:|"
    echo "| builder-adversary (orig) | ${SUM_OK[builder-adversary]} | ${SUM_TOK[builder-adversary]} | \$${SUM_COST[builder-adversary]} |"
    echo "| builder-adversary-min | ${SUM_OK[builder-adversary-min]} | ${SUM_TOK[builder-adversary-min]} | \$${SUM_COST[builder-adversary-min]} |"
    echo
    echo "> Note: each \`claude -p\` call carries a fixed ~24k-token cached Claude Code system-prompt +"
    echo "> tool-schema overhead, and most tokens come from the agentic work itself (reading the plan,"
    echo "> writing/running code, tool results). The role/kickoff prompt is a small slice — so the"
    echo "> headline token totals are close; the minimisation shows up in the static prompt size above"
    echo "> and the (smaller) input/cache-creation portion. This bench is a single controlled pass per"
    echo "> version (N=1; expect run-to-run variance); it exercises task effectiveness + prompt cost,"
    echo "> NOT the live watchdog loop / handoff machinery (that needs a full \`agents.py up\` run)."
    echo
    echo "_Work dirs for this run: \`$RUNROOT\`_"
  } >"$RESULTS"
}

main() {
  check_prereqs

  RUNROOT="$(mktemp -d /tmp/ao-benchmark.XXXXXX)" || die "mktemp failed"
  # Remove the scratch results file on every exit path, including die and Ctrl-C.
  trap 'rm -f -- "$RESULTS_TMP"' EXIT
  : >"$RESULTS_TMP" || die "cannot write $RESULTS_TMP"

  run_version builder-adversary
  run_version builder-adversary-min

  # ---- write RESULTS.md ----
  write_results
  rm -f -- "$RESULTS_TMP"

  echo
  echo "===== DONE ====="
  echo "orig: success=${SUM_OK[builder-adversary]} tokens=${SUM_TOK[builder-adversary]} cost=\$${SUM_COST[builder-adversary]}"
  echo "min : success=${SUM_OK[builder-adversary-min]} tokens=${SUM_TOK[builder-adversary-min]} cost=\$${SUM_COST[builder-adversary-min]}"
  echo "Results: $RESULTS"
  echo "Run dirs: $RUNROOT"
}

main "$@"