A run that hits a usage-limit pause has inflated duration (idle wait) but an accurate token total. analyze.py now scans each run's watchdog log for 'limit hit', flags it LIMIT in the raw table, and excludes it from the tokens/sec stat (token total, tok/LOC, tok/commit unaffected). Caught because campaign run r2 hit the limit ~00:40 and recovered at the 00:50 reset — watchdog handled it.
121 lines
6.1 KiB
Python
Executable File
121 lines
6.1 KiB
Python
Executable File
#!/usr/bin/env python3
"""Analyse the campaign data file into RESULTS-campaign.md.

Reads RESULTS-campaign.md.data (TSV: variant rep success builder adversary total
duration_s commits loc) and writes a full report: per-variant token distributions,
median commits/LOC/duration, the normalised EFFICIENCY RATIOS (tokens/LOC,
tokens/sec, tokens/commit) with min–median–max ranges, correlations, and the full
raw per-run table. Stats are over SUCCESSFUL runs; runs whose watchdog log records a
usage-limit hit are flagged LIMIT and left out of the tokens/sec ratio only.

Usage: python3 analyze.py [data-file] [out-file]
"""
|
||
import sys, statistics as st, glob, os
|
||
|
||
# Optional positional CLI arguments: [data-file] [out-file]. Each one falls back to
# its default file name in the current directory when it is not supplied.
_cli = sys.argv[1:]
DATA = _cli[0] if _cli else "RESULTS-campaign.md.data"
OUT = _cli[1] if len(_cli) >= 2 else "RESULTS-campaign.md"
|
||
|
||
# Parse the campaign TSV into one dict per run. Each data line carries nine
# tab-separated fields: variant, rep, success, builder tokens, adversary tokens,
# total tokens, duration (s), commit count, LOC.
rows = []
# The context manager closes the data file once parsing is done (the bare open()
# used previously left the handle to the garbage collector).
with open(DATA) as data_file:
    for line in data_file:
        p = line.rstrip("\n").split("\t")
        if len(p) != 9:
            # Lines without exactly nine fields (blank lines, notes) are skipped.
            continue
        v, rep, ok, b, a, tot, dur, commits, loc = p
        r = dict(v=v, rep=int(rep), ok=ok, builder=int(b), adversary=int(a),
                 total=int(tot), dur=int(dur), commits=int(commits), loc=int(loc))
        # Normalised efficiency ratios; a zero denominator yields 0.0 rather than
        # raising ZeroDivisionError.
        r["tok_per_loc"] = r["total"] / r["loc"] if r["loc"] else 0.0
        r["tok_per_sec"] = r["total"] / r["dur"] if r["dur"] else 0.0
        r["tok_per_commit"] = r["total"] / r["commits"] if r["commits"] else 0.0
        rows.append(r)
|
||
|
||
# Flag runs whose watchdog log shows a usage-limit hit: their duration (and thus tok/sec)
# is inflated by the idle pause, even though the token total is unaffected. Logs are looked
# up in the newest campaign run-root (repos are kept).
# NOTE(review): "newest" here is the last directory name in lexicographic order, which
# presumably relies on a sortable suffix in the run-root name; confirm against the launcher.
_roots = sorted(glob.glob("/tmp/ao-campaign-*"))
_root = _roots[-1] if _roots else ""


def _limit_hit(v, rep):
    """Return True when a watchdog log of run ``rep`` of variant ``v`` contains 'limit hit'.

    Logs are matched by ``<_root>/<v>/r<rep>/.ao-state/*watchdog*.log``. The result is
    False when no campaign run-root exists, when the run has no watchdog log, or when
    none of its logs mention a limit hit. Logs that cannot be read are skipped.
    """
    if not _root:
        # Without a run-root the pattern would be anchored at "/" and could match
        # unrelated directories, so report "no hit" straight away.
        return False
    # Escape the directory part so glob metacharacters in a variant name are literal;
    # only the log file name itself is a wildcard pattern.
    state_dir = glob.escape(f"{_root}/{v}/r{rep}/.ao-state")
    for wl in glob.glob(state_dir + "/*watchdog*.log"):
        try:
            # Stream the log and close it promptly. The marker contains no newline,
            # so a per-line search finds exactly what a whole-file search would.
            with open(wl, errors="ignore") as log:
                if any("limit hit" in line for line in log):
                    return True
        except OSError:
            # Best effort: an unreadable log is treated as "no limit hit".
            continue
    return False
|
||
# Tag every run with "LIMIT" when its watchdog log recorded a usage-limit hit,
# and with an empty string otherwise.
for run in rows:
    paused = _limit_hit(run["v"], run["rep"])
    run["limit"] = "LIMIT" if paused else ""
|
||
|
||
# Distinct variant names, kept in the order they first appear in the data file
# (dict keys preserve insertion order).
variants = list(dict.fromkeys(run["v"] for run in rows))
# Only runs whose success column reads exactly "YES" feed the statistics.
ok_rows = [run for run in rows if run["ok"] == "YES"]
|
||
|
||
def f(n):
    """Format ``n`` with comma thousands separators (e.g. 1234567 -> '1,234,567')."""
    return format(n, ",")
|
||
def fr(x):
    """Format ``x`` rounded to zero decimals, with comma thousands separators."""
    return format(x, ",.0f")
|
||
def frr(x):
    """Format ``x`` with exactly two decimals and no thousands separator."""
    return format(x, ".2f")
|
||
def mmm(xs, fn=fr):
    """Render ``xs`` as a 'min / median / max' table cell.

    Each of the three values is passed through ``fn`` to be formatted. An empty
    ``xs`` yields a cell of three dashes. When every sample is integer-valued the
    median is truncated to an int before formatting; otherwise it is passed on
    unchanged.
    """
    if not xs:
        return "— / — / —"
    middle = st.median(xs)
    if all(float(sample).is_integer() for sample in xs):
        # All-integer samples: int() drops the .5 that an even-sized sample can produce.
        middle = int(middle)
    return " / ".join(fn(part) for part in (min(xs), middle, max(xs)))
|
||
def pearson(a, b):
    """Return the Pearson correlation coefficient of the paired samples ``a`` and ``b``.

    Returns NaN when ``a`` has fewer than two values, or when the denominator is
    zero (at least one of the series has no variance).
    """
    undefined = float("nan")
    if len(a) < 2:
        return undefined
    mean_a = st.mean(a)
    mean_b = st.mean(b)
    # Sums of squared deviations from the mean, one per series.
    spread_a = sum((x - mean_a) ** 2 for x in a)
    spread_b = sum((y - mean_b) ** 2 for y in b)
    denom = (spread_a * spread_b) ** 0.5
    if not denom:
        return undefined
    # Sum of products of paired deviations; zip stops at the shorter series.
    covar = sum((x - mean_a) * (y - mean_b) for x, y in zip(a, b))
    return covar / denom
|
||
|
||
# Write the Markdown report. The text contains non-ASCII characters (em dashes), so the
# encoding is pinned to UTF-8 instead of depending on the locale default, which could
# raise UnicodeEncodeError on a non-UTF-8 system.
with open(OUT, "w", encoding="utf-8") as o:
    o.write("# Full-harness benchmark — campaign analysis\n\n")
    o.write("Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
            "to SEQUENCE-COMPLETE. Both loops on Sonnet. Tokens summed from each loop's session "
            "transcript; commits = work-repo commit count; LOC = non-blank `calc/*.py` lines (code + "
            f"tests). {len(ok_rows)} successful runs of {len(rows)} total.\n\n")

    # Section 1: total-token distribution per variant, over successful runs only.
    o.write("## Per-variant total tokens (successful runs)\n\n")
    o.write("| variant | runs(ok) | median | mean | min | max | spread |\n|---|:--:|--:|--:|--:|--:|--:|\n")
    for v in variants:
        ts = [r["total"] for r in ok_rows if r["v"] == v]
        n = sum(1 for r in rows if r["v"] == v)  # all runs of this variant, ok or not
        if not ts:
            o.write(f"| {v} | 0/{n} | — | — | — | — | — |\n")
            continue
        # Spread is the max/min ratio; a zero minimum makes it undefined.
        sp = f"{max(ts)/min(ts):.2f}x" if min(ts) else "—"
        o.write(f"| {v} | {len(ts)}/{n} | {f(int(st.median(ts)))} | {f(int(st.mean(ts)))} | "
                f"{f(min(ts))} | {f(max(ts))} | {sp} |\n")

    # Section 2: normalised efficiency ratios as min / median / max.
    o.write("\n## Efficiency ratios — min / median / max (successful runs)\n\n")
    o.write("tokens/sec excludes runs flagged `LIMIT` (a usage-limit pause inflates duration without "
            "adding tokens, so it would understate the true rate); tokens/LOC and tokens/commit are "
            "unaffected and include all successful runs.\n\n")
    o.write("| variant | tokens / LOC | tokens / sec | tokens / commit |\n|---|--:|--:|--:|\n")

    def nolimit(sub):
        """Return the tokens/sec values of the runs in ``sub`` that are not flagged LIMIT."""
        return [r["tok_per_sec"] for r in sub if not r["limit"]]

    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {mmm([r['tok_per_loc'] for r in sub])} | "
                f"{mmm(nolimit(sub))} | "
                f"{mmm([r['tok_per_commit'] for r in sub])} |\n")
    if ok_rows:
        # Pooled row across every variant.
        o.write(f"| **all** | {mmm([r['tok_per_loc'] for r in ok_rows])} | "
                f"{mmm(nolimit(ok_rows))} | "
                f"{mmm([r['tok_per_commit'] for r in ok_rows])} |\n")

    # Section 3: per-variant medians, truncated to int.
    # NOTE(review): the duration median here and the duration correlation below still
    # include LIMIT-flagged runs; only tokens/sec filters them out. Confirm that is intended.
    o.write("\n## Per-variant medians (commits / LOC / duration)\n\n")
    o.write("| variant | median commits | median LOC | median dur(s) |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = [r for r in ok_rows if r["v"] == v]
        if not sub:
            o.write(f"| {v} | — | — | — |\n")
            continue
        o.write(f"| {v} | {int(st.median([r['commits'] for r in sub]))} | "
                f"{int(st.median([r['loc'] for r in sub]))} | {int(st.median([r['dur'] for r in sub]))} |\n")

    # Section 4: Pearson correlation of total tokens with each size measure, pooled
    # over all successful runs.
    o.write(f"\n## Correlations with total tokens (pooled, n={len(ok_rows)})\n\n")
    o.write("| tokens vs | Pearson r |\n|---|--:|\n")
    for k, lab in [("dur", "duration"), ("commits", "commits"), ("loc", "LOC")]:
        o.write(f"| {lab} | {pearson([r['total'] for r in ok_rows], [r[k] for r in ok_rows]):+.2f} |\n")

    # Section 5: the raw table lists every parsed run, including unsuccessful ones.
    o.write("\n## All runs (raw)\n\n")
    o.write("| variant | rep | ok | limit | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n")
    o.write("|---|:--:|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n")
    for r in rows:
        o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {r['limit']} | {f(r['total'])} | {r['dur']} | "
                f"{r['commits']} | {r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | "
                f"{fr(r['tok_per_commit'])} |\n")
    o.write("\n_Stats over successful runs. `LIMIT` = the run hit a usage-limit pause (duration/tok-sec "
            "distorted, token total fine). Repos kept under the run root for analysis._\n")

# One-line summary on stdout once the report file has been closed.
print(f"wrote {OUT} ({len(rows)} runs, {len(ok_rows)} ok)")
|