Files
agent-orchestrator-benchmark/analyze.py

124 lines
6.3 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Analyse the campaign data file into RESULTS-campaign.md.
Reads RESULTS-campaign.md.data (TSV: variant rep success builder adversary total
duration_s commits loc) and writes a full report: per-variant token distributions,
median commits/LOC/duration, the normalised EFFICIENCY RATIOS (tokens/LOC,
tokens/sec, tokens/commit) with min / median / max ranges, correlations, and the full
raw per-run table. Stats are over SUCCESSFUL runs.
Usage: python3 analyze.py [data-file] [out-file]
"""
import sys, statistics as st, glob, os
# Input and output paths: optional positional CLI arguments, otherwise the campaign defaults.
DATA = sys.argv[1] if len(sys.argv) > 1 else "RESULTS-campaign.md.data"
OUT = sys.argv[2] if len(sys.argv) > 2 else "RESULTS-campaign.md"

# Parse the TSV data file into one dict per run. Columns, in order:
# variant, rep, success, builder tokens, adversary tokens, total tokens,
# duration (s), commits, LOC.
rows = []
with open(DATA) as data_file:  # context manager closes the handle even if a row fails to parse
    for line in data_file:
        p = line.rstrip("\n").split("\t")
        if len(p) != 9:
            # Lines without exactly 9 tab-separated fields are skipped.
            continue
        v, rep, ok, b, a, tot, dur, commits, loc = p
        r = dict(v=v, rep=int(rep), ok=ok, builder=int(b), adversary=int(a),
                 total=int(tot), dur=int(dur), commits=int(commits), loc=int(loc))
        # Normalised efficiency ratios; a zero denominator yields 0.0 instead of raising.
        r["tok_per_loc"] = r["total"] / r["loc"] if r["loc"] else 0.0
        r["tok_per_sec"] = r["total"] / r["dur"] if r["dur"] else 0.0
        r["tok_per_commit"] = r["total"] / r["commits"] if r["commits"] else 0.0
        rows.append(r)
# Flag runs whose watchdog log shows a usage-limit hit — their duration (and thus tok/sec) is
# inflated by the idle pause, even though the token total is unaffected.
def _limit_hit(v, rep, root="/tmp"):
    """Return True if any watchdog log of run (v, rep) contains the text "limit hit".

    Every campaign run root matching ``<root>/ao-campaign-*`` is searched (a
    variant/rep may live in any of them). The ``builder-solo`` variant has its
    own layout and is additionally searched under ``<root>/ao-solo-*``.

    Args:
        v: variant name, used as a path component of the campaign run root.
        rep: repetition number; the run directory is ``r<rep>``.
        root: directory that holds the run roots. Defaults to ``/tmp``.

    Returns:
        True as soon as one matching log contains "limit hit", else False.
        Logs that cannot be opened or read are ignored (best effort).
    """
    pats = [f"{root}/ao-campaign-*/{v}/r{rep}/.ao-state/*watchdog*.log"]
    if v == "builder-solo":
        pats.append(f"{root}/ao-solo-*/r{rep}/.ao-state/*watchdog*.log")
    for pat in pats:
        for wl in glob.glob(pat):
            try:
                # `with` closes each log promptly instead of leaking the handle.
                with open(wl, errors="ignore") as log:
                    if "limit hit" in log.read():
                        return True
            except OSError:
                # Unreadable log: treat as "no evidence" and keep looking.
                continue
    return False
# Tag every run: "LIMIT" when its watchdog log recorded a usage-limit hit, "" otherwise.
for run in rows:
    hit = _limit_hit(run["v"], run["rep"])
    run["limit"] = "LIMIT" if hit else ""

# Distinct variant names, in order of first appearance in the data file.
variants = list(dict.fromkeys(run["v"] for run in rows))

# Only runs whose success column is exactly "YES" feed the statistics.
ok_rows = [run for run in rows if run["ok"] == "YES"]
def f(n):
    """Format n with comma thousands separators (e.g. 1234567 -> "1,234,567")."""
    return format(n, ",")
def fr(x):
    """Format x with zero decimal places and comma thousands separators."""
    return format(x, ",.0f")
def frr(x):
    """Format x with exactly two decimal places and no thousands separators."""
    return format(x, ".2f")
def mmm(xs, fn=fr):
    """Render "min / median / max" of xs, formatting each value with fn.

    An empty xs renders as three em-dash placeholders. When every value in xs
    is whole-numbered, the median is truncated to int before formatting;
    otherwise the median is passed to fn unchanged.
    """
    if not xs:
        return "— / — / —"
    lowest = fn(min(xs))
    all_whole = all(float(x).is_integer() for x in xs)
    middle = st.median(xs)
    if all_whole:
        middle = int(middle)
    middle = fn(middle)
    highest = fn(max(xs))
    return f"{lowest} / {middle} / {highest}"
def pearson(a, b):
    """Return the Pearson correlation coefficient of sequences a and b.

    Yields NaN when a has fewer than two elements, or when the product of the
    two sums of squared deviations is zero (at least one input is constant).
    """
    nan = float("nan")
    if len(a) < 2:
        return nan
    mean_a = st.mean(a)
    mean_b = st.mean(b)
    ss_a = sum((x - mean_a) ** 2 for x in a)
    ss_b = sum((y - mean_b) ** 2 for y in b)
    den = (ss_a * ss_b) ** 0.5
    if not den:
        return nan
    cov = sum((x - mean_a) * (y - mean_b) for x, y in zip(a, b))
    return cov / den
# Render the Markdown report. The text contains non-ASCII characters (em dashes), so the
# encoding is pinned to UTF-8 rather than depending on the platform's locale default.
with open(OUT, "w", encoding="utf-8") as o:
    # Title and one-paragraph description of the benchmark set-up.
    o.write("# Full-harness benchmark — campaign analysis\n\n")
    o.write("Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
            "to SEQUENCE-COMPLETE. Both loops on Sonnet. Tokens summed from each loop's session "
            "transcript; commits = work-repo commit count; LOC = non-blank `calc/*.py` lines (code + "
            f"tests). {len(ok_rows)} successful runs of {len(rows)} total.\n\n")

    # Section: distribution of total tokens per variant, successful runs only.
    o.write("## Per-variant total tokens (successful runs)\n\n")
    o.write("| variant | runs(ok) | median | mean | min | max | spread |\n|---|:--:|--:|--:|--:|--:|--:|\n")
    for v in variants:
        ts = [r["total"] for r in ok_rows if r["v"] == v]
        n = sum(1 for r in rows if r["v"] == v)  # all runs of this variant, successful or not
        if not ts:
            o.write(f"| {v} | 0/{n} | — | — | — | — | — |\n")
            continue
        # spread = max/min ratio; left blank when the minimum is 0 to avoid dividing by zero.
        sp = f"{max(ts)/min(ts):.2f}x" if min(ts) else ""
        o.write(f"| {v} | {len(ts)}/{n} | {f(int(st.median(ts)))} | {f(int(st.mean(ts)))} | "
                f"{f(min(ts))} | {f(max(ts))} | {sp} |\n")

    # Section: normalised efficiency ratios as min / median / max.
    o.write("\n## Efficiency ratios — min / median / max (successful runs)\n\n")
    o.write("tokens/sec excludes runs flagged `LIMIT` (a usage-limit pause inflates duration without "
            "adding tokens, so it would understate the true rate); tokens/LOC and tokens/commit are "
            "unaffected and include all successful runs.\n\n")
    o.write("| variant | tokens / LOC | tokens / sec | tokens / commit |\n|---|--:|--:|--:|\n")

    def nolimit(sub):
        """Return tokens/sec of the runs in sub that are not flagged LIMIT."""
        return [r["tok_per_sec"] for r in sub if not r["limit"]]

    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {mmm([r['tok_per_loc'] for r in sub])} | "
                f"{mmm(nolimit(sub))} | "
                f"{mmm([r['tok_per_commit'] for r in sub])} |\n")
    if ok_rows:
        # Pooled row across every variant.
        o.write(f"| **all** | {mmm([r['tok_per_loc'] for r in ok_rows])} | "
                f"{mmm(nolimit(ok_rows))} | "
                f"{mmm([r['tok_per_commit'] for r in ok_rows])} |\n")

    # Section: per-variant medians of the raw size/time measures (truncated to int).
    o.write("\n## Per-variant medians (commits / LOC / duration)\n\n")
    o.write("| variant | median commits | median LOC | median dur(s) |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {int(st.median([r['commits'] for r in sub]))} | "
                f"{int(st.median([r['loc'] for r in sub]))} | {int(st.median([r['dur'] for r in sub]))} |\n")

    # Section: Pearson correlation of total tokens against each measure, pooled over variants.
    o.write(f"\n## Correlations with total tokens (pooled, n={len(ok_rows)})\n\n")
    o.write("| tokens vs | Pearson r |\n|---|--:|\n")
    for k, lab in [("dur", "duration"), ("commits", "commits"), ("loc", "LOC")]:
        o.write(f"| {lab} | {pearson([r['total'] for r in ok_rows], [r[k] for r in ok_rows]):+.2f} |\n")

    # Section: every run, including unsuccessful ones, in data-file order.
    o.write("\n## All runs (raw)\n\n")
    o.write("| variant | rep | ok | limit | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n")
    o.write("|---|:--:|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n")
    for r in rows:
        o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {r['limit']} | {f(r['total'])} | {r['dur']} | "
                f"{r['commits']} | {r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | "
                f"{fr(r['tok_per_commit'])} |\n")
    o.write("\n_Stats over successful runs. `LIMIT` = the run hit a usage-limit pause (duration/tok-sec "
            "distorted, token total fine). Repos kept under the run root for analysis._\n")

# One-line summary on stdout once the report file has been written and closed.
print(f"wrote {OUT} ({len(rows)} runs, {len(ok_rows)} ok)")