Files
agent-orchestrator-benchmark/analyze.py
mfowler 33eeb3ce6b feat: analyze.py — efficiency ratios (tokens/LOC, tokens/sec, tokens/commit)
Standalone analysis over RESULTS-campaign.md.data (safe: independent of the live
runner). Adds the normalised efficiency ratios per run with min/median/max per
variant, alongside the token distributions, commit/LOC medians, correlations,
and full raw table. Run: python3 analyze.py  (regenerates RESULTS-campaign.md).

Orig baseline (5 runs): tokens/LOC ~25k–34k, tokens/sec ~11.3k–14.0k.
2026-06-15 00:15:46 +00:00

100 lines
5.0 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Analyse the campaign data file into RESULTS-campaign.md.
Reads RESULTS-campaign.md.data (TSV: variant rep success builder adversary total
duration_s commits loc) and writes a full report: per-variant token distributions,
median commits/LOC/duration, the normalised EFFICIENCY RATIOS (tokens/LOC,
tokens/sec, tokens/commit) with min/median/max ranges, correlations, and the full
raw per-run table. Stats are over SUCCESSFUL runs.
Usage: python3 analyze.py [data-file] [out-file]
"""
import sys, statistics as st
# Input TSV and output Markdown paths. Each can be overridden positionally on
# the command line: argv[1] is the data file, argv[2] is the report file.
DATA = sys.argv[1] if len(sys.argv) > 1 else "RESULTS-campaign.md.data"
OUT = sys.argv[2] if len(sys.argv) > 2 else "RESULTS-campaign.md"

# One dict per run, in file order. Lines that do not split into exactly nine
# tab-separated fields (blank lines, partial writes, ...) are skipped.
rows = []
# Use a context manager so the handle is closed deterministically, and decode
# as UTF-8 explicitly rather than depending on the platform locale.
with open(DATA, encoding="utf-8") as data_file:
    for line in data_file:
        p = line.rstrip("\n").split("\t")
        if len(p) != 9:
            continue
        v, rep, ok, b, a, tot, dur, commits, loc = p
        r = dict(v=v, rep=int(rep), ok=ok, builder=int(b), adversary=int(a),
                 total=int(tot), dur=int(dur), commits=int(commits), loc=int(loc))
        # Efficiency ratios; a zero denominator yields 0.0 instead of raising.
        r["tok_per_loc"] = r["total"] / r["loc"] if r["loc"] else 0.0
        r["tok_per_sec"] = r["total"] / r["dur"] if r["dur"] else 0.0
        r["tok_per_commit"] = r["total"] / r["commits"] if r["commits"] else 0.0
        rows.append(r)

# Distinct variant names in first-seen order (dicts preserve insertion order).
variants = list(dict.fromkeys(r["v"] for r in rows))

# Only runs whose success column is exactly "YES" feed the statistics.
ok_rows = [r for r in rows if r["ok"] == "YES"]
def f(n):
    """Format *n* with comma thousands separators (e.g. 1234567 -> '1,234,567')."""
    return format(n, ",")
def fr(x):
    """Format *x* rounded to zero decimals, with comma thousands separators."""
    return format(x, ",.0f")
def frr(x):
    """Format *x* with exactly two decimals and no thousands separators."""
    return format(x, ".2f")
def mmm(xs, fn=fr):
    """Return 'min / median / max' of *xs*, each part rendered with *fn*.

    An empty *xs* gives a row of dash placeholders. When every value in *xs*
    is integer-valued, the median is truncated to an int before formatting;
    otherwise the median is passed to *fn* unchanged.
    """
    if not xs:
        return "— / — / —"
    low = fn(min(xs))
    if all(float(x).is_integer() for x in xs):
        mid = fn(int(st.median(xs)))
    else:
        mid = fn(st.median(xs))
    high = fn(max(xs))
    return f"{low} / {mid} / {high}"
def pearson(a, b):
    """Return the Pearson correlation coefficient of sequences *a* and *b*.

    The result is NaN when *a* has fewer than two elements or when either
    sequence has zero variance (the denominator would be zero). Pairs are
    formed with zip(), so any surplus elements of the longer sequence do not
    contribute to the covariance term.
    """
    nan = float("nan")
    if len(a) < 2:
        return nan
    mean_a = st.mean(a)
    mean_b = st.mean(b)
    ss_a = sum((x - mean_a) ** 2 for x in a)
    ss_b = sum((y - mean_b) ** 2 for y in b)
    den = (ss_a * ss_b) ** 0.5
    if not den:
        return nan
    cov = sum((x - mean_a) * (y - mean_b) for x, y in zip(a, b))
    return cov / den
# Render the Markdown report. The text contains non-ASCII characters (em
# dashes), so the file is written as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(OUT, "w", encoding="utf-8") as o:
    # Title and one-paragraph description, including the success count.
    o.write("# Full-harness benchmark — campaign analysis\n\n")
    o.write("Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
            "to SEQUENCE-COMPLETE. Both loops on Sonnet. Tokens summed from each loop's session "
            "transcript; commits = work-repo commit count; LOC = non-blank `calc/*.py` lines (code + "
            f"tests). {len(ok_rows)} successful runs of {len(rows)} total.\n\n")

    # Token distribution per variant, over successful runs only.
    o.write("## Per-variant total tokens (successful runs)\n\n")
    o.write("| variant | runs(ok) | median | mean | min | max | spread |\n|---|:--:|--:|--:|--:|--:|--:|\n")
    for v in variants:
        ts = [r["total"] for r in ok_rows if r["v"] == v]
        n = sum(1 for r in rows if r["v"] == v)  # all runs of this variant, ok or not
        if not ts:
            o.write(f"| {v} | 0/{n} | — | — | — | — | — |\n")
            continue
        # Spread is the max/min ratio; left blank when the minimum is 0.
        sp = f"{max(ts)/min(ts):.2f}x" if min(ts) else ""
        o.write(f"| {v} | {len(ts)}/{n} | {f(int(st.median(ts)))} | {f(int(st.mean(ts)))} | "
                f"{f(min(ts))} | {f(max(ts))} | {sp} |\n")

    # Normalised efficiency ratios per variant, then pooled over all ok runs.
    o.write("\n## Efficiency ratios — min / median / max (successful runs)\n\n")
    o.write("| variant | tokens / LOC | tokens / sec | tokens / commit |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {mmm([r['tok_per_loc'] for r in sub])} | "
                f"{mmm([r['tok_per_sec'] for r in sub])} | "
                f"{mmm([r['tok_per_commit'] for r in sub])} |\n")
    if ok_rows:
        o.write(f"| **all** | {mmm([r['tok_per_loc'] for r in ok_rows])} | "
                f"{mmm([r['tok_per_sec'] for r in ok_rows])} | "
                f"{mmm([r['tok_per_commit'] for r in ok_rows])} |\n")

    # Median commits / LOC / duration per variant (medians truncated to int).
    o.write("\n## Per-variant medians (commits / LOC / duration)\n\n")
    o.write("| variant | median commits | median LOC | median dur(s) |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {int(st.median([r['commits'] for r in sub]))} | "
                f"{int(st.median([r['loc'] for r in sub]))} | {int(st.median([r['dur'] for r in sub]))} |\n")

    # Pooled Pearson correlation of total tokens against each size measure.
    o.write(f"\n## Correlations with total tokens (pooled, n={len(ok_rows)})\n\n")
    o.write("| tokens vs | Pearson r |\n|---|--:|\n")
    for k, lab in [("dur", "duration"), ("commits", "commits"), ("loc", "LOC")]:
        o.write(f"| {lab} | {pearson([r['total'] for r in ok_rows], [r[k] for r in ok_rows]):+.2f} |\n")

    # Raw table: every parsed run, including unsuccessful ones.
    o.write("\n## All runs (raw)\n\n")
    o.write("| variant | rep | ok | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n")
    o.write("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n")
    for r in rows:
        o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {f(r['total'])} | {r['dur']} | {r['commits']} | "
                f"{r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | {fr(r['tok_per_commit'])} |\n")
    o.write("\n_Stats over successful runs. Repos kept under the run root for analysis._\n")

# One-line summary on stdout once the report has been written and closed.
print(f"wrote {OUT} ({len(rows)} runs, {len(ok_rows)} ok)")