run-solo-bench.sh runs the builder-solo variant (single builder, self-verify, no adversary) 5× on the same calculator and appends rows to the shared campaign data file (adversary col = 0). It is a separate script so that the live campaign runner is untouched. analyze.py limit-detection now also covers the solo run layout. Engine example builder-solo committed at a0f7652; the benchmark engine is to be re-pinned to it before running solo (after the main campaign completes).
129 lines
6.5 KiB
Python
Executable File
129 lines
6.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""Analyse the campaign data file into RESULTS-campaign.md.
|
||
|
||
Reads RESULTS-campaign.md.data (TSV: variant rep success builder adversary total
|
||
duration_s commits loc) and writes a full report: per-variant token distributions,
|
||
median commits/LOC/duration, the normalised EFFICIENCY RATIOS (tokens/LOC,
|
||
tokens/sec, tokens/commit) with min–median–max ranges, correlations, and the full
|
||
raw per-run table. Stats are over SUCCESSFUL runs.
|
||
|
||
Usage: python3 analyze.py [data-file] [out-file]
|
||
"""
|
||
import sys, statistics as st, glob, os
|
||
|
||
# Input TSV: the first command-line argument when given, else the default data file.
DATA = "RESULTS-campaign.md.data" if len(sys.argv) < 2 else sys.argv[1]
# Output report: the second command-line argument when given, else the default report.
OUT = "RESULTS-campaign.md" if len(sys.argv) < 3 else sys.argv[2]
|
||
# Load the campaign TSV into ``rows``: one dict per run, with the three
# normalised efficiency ratios (tokens per LOC / second / commit) precomputed.
rows = []
# The context manager guarantees the data file is closed once parsing ends
# (or fails); iterating a bare open() left the handle to the garbage collector.
with open(DATA) as data_file:
    for line in data_file:
        p = line.rstrip("\n").split("\t")
        if len(p) != 9:
            # Not a 9-column run record (blank or malformed line): skip it.
            continue
        v, rep, ok, b, a, tot, dur, commits, loc = p
        r = dict(v=v, rep=int(rep), ok=ok, builder=int(b), adversary=int(a),
                 total=int(tot), dur=int(dur), commits=int(commits), loc=int(loc))
        # A zero denominator yields 0.0 rather than raising ZeroDivisionError.
        r["tok_per_loc"] = r["total"] / r["loc"] if r["loc"] else 0.0
        r["tok_per_sec"] = r["total"] / r["dur"] if r["dur"] else 0.0
        r["tok_per_commit"] = r["total"] / r["commits"] if r["commits"] else 0.0
        rows.append(r)
|
||
# Usage-limit detection. A run whose watchdog log records a usage-limit hit sat
# idle during the pause, so its duration (and therefore tokens/sec) is inflated
# while its token total is unaffected. The logs live under the run roots in /tmp
# (repos are kept); for each layout take the last match in sorted order, i.e.
# the newest root provided the directory names sort chronologically.
_camp = sorted(glob.glob("/tmp/ao-campaign-*"))
_solo = sorted(glob.glob("/tmp/ao-solo-*"))
# An empty string means "no such run root on this machine".
_camp_root = "" if not _camp else _camp[-1]
_solo_root = "" if not _solo else _solo[-1]
def _limit_hit(v, rep):
    """Return True if a watchdog log for run ``rep`` of variant ``v`` contains "limit hit".

    Logs are searched under the campaign run root
    (``<root>/<v>/r<rep>/.ao-state/*watchdog*.log``) and, for the
    ``builder-solo`` variant only, also under the solo run root, whose layout
    has no variant subdirectory. Logs that cannot be opened or read are
    skipped. Returns False when no run root exists or no log matches.
    """
    pats = []
    if _camp_root:
        pats.append(f"{_camp_root}/{v}/r{rep}/.ao-state/*watchdog*.log")  # campaign layout
    if v == "builder-solo" and _solo_root:
        pats.append(f"{_solo_root}/r{rep}/.ao-state/*watchdog*.log")  # solo layout (no variant subdir)
    for pat in pats:
        for wl in glob.glob(pat):
            try:
                # Close each log as soon as it has been scanned instead of
                # leaving one open handle per file to the garbage collector.
                with open(wl, errors="ignore") as log:
                    if "limit hit" in log.read():
                        return True
            except OSError:
                # Best effort: an unreadable log simply does not count as a hit.
                continue
    return False
# Tag every run with "LIMIT" when its watchdog log shows a usage-limit hit,
# or with an empty string otherwise.
for r in rows:
    if _limit_hit(r["v"], r["rep"]):
        r["limit"] = "LIMIT"
    else:
        r["limit"] = ""

# Distinct variant names, in the order they first appear in the data file.
variants = list(dict.fromkeys(run["v"] for run in rows))
# Only runs marked successful ("YES") feed the statistics.
ok_rows = [run for run in rows if run["ok"] == "YES"]
|
||
def f(n):
    """Render ``n`` with thousands separators, e.g. 1234567 -> '1,234,567'."""
    return format(n, ",")


def fr(x):
    """Render ``x`` rounded to zero decimals, with thousands separators."""
    return format(x, ",.0f")


def frr(x):
    """Render ``x`` with exactly two decimals and no thousands separators."""
    return format(x, ".2f")


def mmm(xs, fn=fr):
    """Return 'min / median / max' of ``xs``, each part rendered by ``fn``.

    An empty ``xs`` yields a row of em-dash placeholders. When every value is
    integral the median is truncated to an int before formatting; otherwise
    the float median is passed to ``fn`` unchanged.
    """
    if not xs:
        return "— / — / —"
    low = fn(min(xs))
    mid = st.median(xs)
    if all(float(value).is_integer() for value in xs):
        mid = int(mid)
    middle = fn(mid)
    high = fn(max(xs))
    return f"{low} / {middle} / {high}"
def pearson(a, b):
    """Pearson correlation coefficient of the paired samples ``a`` and ``b``.

    Returns NaN when ``a`` holds fewer than two points, or when the
    denominator is zero (one of the samples has no variance).
    """
    if len(a) < 2:
        return float("nan")
    mean_a = st.mean(a)
    mean_b = st.mean(b)
    spread_a = sum((x - mean_a) ** 2 for x in a)
    spread_b = sum((y - mean_b) ** 2 for y in b)
    denom = (spread_a * spread_b) ** 0.5
    if not denom:
        return float("nan")
    covariance = sum((x - mean_a) * (y - mean_b) for x, y in zip(a, b))
    return covariance / denom
|
||
# Write the Markdown report. The text contains non-ASCII characters (em dashes),
# so the encoding is pinned to UTF-8 instead of depending on the locale default.
with open(OUT, "w", encoding="utf-8") as o:
    o.write("# Full-harness benchmark — campaign analysis\n\n")
    o.write("Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
            "to SEQUENCE-COMPLETE. Both loops on Sonnet. Tokens summed from each loop's session "
            "transcript; commits = work-repo commit count; LOC = non-blank `calc/*.py` lines (code + "
            f"tests). {len(ok_rows)} successful runs of {len(rows)} total.\n\n")

    # --- Per-variant token distribution (successful runs only) ---
    o.write("## Per-variant total tokens (successful runs)\n\n")
    o.write("| variant | runs(ok) | median | mean | min | max | spread |\n|---|:--:|--:|--:|--:|--:|--:|\n")
    for v in variants:
        ts = [r["total"] for r in ok_rows if r["v"] == v]
        n = sum(1 for r in rows if r["v"] == v)  # all runs of this variant, failed ones included
        if not ts:
            o.write(f"| {v} | 0/{n} | — | — | — | — | — |\n")
            continue
        # Spread = max/min ratio; undefined when the smallest total is 0.
        sp = f"{max(ts)/min(ts):.2f}x" if min(ts) else "—"
        o.write(f"| {v} | {len(ts)}/{n} | {f(int(st.median(ts)))} | {f(int(st.mean(ts)))} | "
                f"{f(min(ts))} | {f(max(ts))} | {sp} |\n")

    # --- Normalised efficiency ratios ---
    o.write("\n## Efficiency ratios — min / median / max (successful runs)\n\n")
    o.write("tokens/sec excludes runs flagged `LIMIT` (a usage-limit pause inflates duration without "
            "adding tokens, so it would understate the true rate); tokens/LOC and tokens/commit are "
            "unaffected and include all successful runs.\n\n")
    o.write("| variant | tokens / LOC | tokens / sec | tokens / commit |\n|---|--:|--:|--:|\n")

    def nolimit(sub):
        """Return the tokens/sec values of the runs in ``sub`` not flagged LIMIT."""
        return [r["tok_per_sec"] for r in sub if not r["limit"]]

    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {mmm([r['tok_per_loc'] for r in sub])} | "
                f"{mmm(nolimit(sub))} | "
                f"{mmm([r['tok_per_commit'] for r in sub])} |\n")
    if ok_rows:
        # Pooled row across every variant.
        o.write(f"| **all** | {mmm([r['tok_per_loc'] for r in ok_rows])} | "
                f"{mmm(nolimit(ok_rows))} | "
                f"{mmm([r['tok_per_commit'] for r in ok_rows])} |\n")

    # --- Per-variant medians ---
    o.write("\n## Per-variant medians (commits / LOC / duration)\n\n")
    o.write("| variant | median commits | median LOC | median dur(s) |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {int(st.median([r['commits'] for r in sub]))} | "
                f"{int(st.median([r['loc'] for r in sub]))} | {int(st.median([r['dur'] for r in sub]))} |\n")

    # --- Pooled correlations of total tokens against each size/time measure ---
    o.write(f"\n## Correlations with total tokens (pooled, n={len(ok_rows)})\n\n")
    o.write("| tokens vs | Pearson r |\n|---|--:|\n")
    for k, lab in [("dur", "duration"), ("commits", "commits"), ("loc", "LOC")]:
        o.write(f"| {lab} | {pearson([r['total'] for r in ok_rows], [r[k] for r in ok_rows]):+.2f} |\n")

    # --- Raw per-run table: every run, successful or not ---
    o.write("\n## All runs (raw)\n\n")
    o.write("| variant | rep | ok | limit | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n")
    o.write("|---|:--:|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n")
    for r in rows:
        o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {r['limit']} | {f(r['total'])} | {r['dur']} | "
                f"{r['commits']} | {r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | "
                f"{fr(r['tok_per_commit'])} |\n")
    o.write("\n_Stats over successful runs. `LIMIT` = the run hit a usage-limit pause (duration/tok-sec "
            "distorted, token total fine). Repos kept under the run root for analysis._\n")

# One-line confirmation on stdout once the report file has been closed.
print(f"wrote {OUT} ({len(rows)} runs, {len(ok_rows)} ok)")