fix: flag usage-limit-affected runs; correct tok/sec

A run that hits a usage-limit pause has inflated duration (idle wait) but an
accurate token total. analyze.py now scans each run's watchdog log for 'limit
hit', flags it LIMIT in the raw table, and excludes it from the tokens/sec stat
(token total, tok/LOC, tok/commit unaffected). Caught because campaign run r2
hit the limit ~00:40 and recovered at the 00:50 reset — watchdog handled it.
This commit is contained in:
2026-06-15 01:29:54 +00:00
parent 33eeb3ce6b
commit 25a77f5d3c

View File

@ -9,7 +9,7 @@ raw per-run table. Stats are over SUCCESSFUL runs.
Usage: python3 analyze.py [data-file] [out-file]
"""
import sys, statistics as st
import sys, statistics as st, glob, os
DATA = sys.argv[1] if len(sys.argv) > 1 else "RESULTS-campaign.md.data"
OUT = sys.argv[2] if len(sys.argv) > 2 else "RESULTS-campaign.md"
@ -27,6 +27,21 @@ for line in open(DATA):
r["tok_per_commit"] = r["total"] / r["commits"] if r["commits"] else 0.0
rows.append(r)
# flag runs whose watchdog log shows a usage-limit hit — their duration (and thus tok/sec) is
# inflated by the idle pause, even though the token total is unaffected. Look in the newest campaign
# run-root (repos are kept).
_roots = sorted(glob.glob("/tmp/ao-campaign-*"))
# NOTE(review): sorted() is lexicographic, so picking the last entry as "newest" assumes the
# run-root names sort chronologically (e.g. a timestamp suffix) — confirm against the runner.
_root = _roots[-1] if _roots else ""


def _limit_hit(v, rep):
    """Return True if any watchdog log of variant `v`, repetition `rep` contains 'limit hit'.

    Logs are searched under ``{_root}/{v}/r{rep}/.ao-state/*watchdog*.log``. Detection is best
    effort: a log that cannot be read is skipped, and when no campaign run root was found
    nothing is flagged.
    """
    if not _root:
        # Without a run root the pattern below would be anchored at "/" and could only match
        # unrelated paths, so answer "not flagged" explicitly.
        return False
    for wl in glob.glob(f"{_root}/{v}/r{rep}/.ao-state/*watchdog*.log"):
        try:
            # Context manager closes the handle promptly instead of leaking one per log.
            with open(wl, errors="ignore") as fh:
                if "limit hit" in fh.read():
                    return True
        except OSError:
            # Unreadable log: keep checking the remaining ones.
            continue
    return False


# "LIMIT" marks a run whose duration includes a usage-limit pause; "" (falsy) means unaffected.
for r in rows:
    r["limit"] = "LIMIT" if _limit_hit(r["v"], r["rep"]) else ""
variants = []
for r in rows:
if r["v"] not in variants:
@ -63,16 +78,20 @@ with open(OUT, "w") as o:
f"{f(min(ts))} | {f(max(ts))} | {sp} |\n")
o.write("\n## Efficiency ratios — min / median / max (successful runs)\n\n")
o.write("tokens/sec excludes runs flagged `LIMIT` (a usage-limit pause inflates duration without "
"adding tokens, so it would understate the true rate); tokens/LOC and tokens/commit are "
"unaffected and include all successful runs.\n\n")
o.write("| variant | tokens / LOC | tokens / sec | tokens / commit |\n|---|--:|--:|--:|\n")
nolimit = lambda sub: [r["tok_per_sec"] for r in sub if not r["limit"]]
for v in variants:
sub = [r for r in ok_rows if r["v"] == v]
if not sub: o.write(f"| {v} | — | — | — |\n"); continue
o.write(f"| {v} | {mmm([r['tok_per_loc'] for r in sub])} | "
f"{mmm([r['tok_per_sec'] for r in sub])} | "
f"{mmm(nolimit(sub))} | "
f"{mmm([r['tok_per_commit'] for r in sub])} |\n")
if ok_rows:
o.write(f"| **all** | {mmm([r['tok_per_loc'] for r in ok_rows])} | "
f"{mmm([r['tok_per_sec'] for r in ok_rows])} | "
f"{mmm(nolimit(ok_rows))} | "
f"{mmm([r['tok_per_commit'] for r in ok_rows])} |\n")
o.write("\n## Per-variant medians (commits / LOC / duration)\n\n")
@ -89,11 +108,13 @@ with open(OUT, "w") as o:
o.write(f"| {lab} | {pearson([r['total'] for r in ok_rows], [r[k] for r in ok_rows]):+.2f} |\n")
o.write("\n## All runs (raw)\n\n")
o.write("| variant | rep | ok | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n")
o.write("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n")
o.write("| variant | rep | ok | limit | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n")
o.write("|---|:--:|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n")
for r in rows:
o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {f(r['total'])} | {r['dur']} | {r['commits']} | "
f"{r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | {fr(r['tok_per_commit'])} |\n")
o.write("\n_Stats over successful runs. Repos kept under the run root for analysis._\n")
o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {r['limit']} | {f(r['total'])} | {r['dur']} | "
f"{r['commits']} | {r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | "
f"{fr(r['tok_per_commit'])} |\n")
o.write("\n_Stats over successful runs. `LIMIT` = the run hit a usage-limit pause (duration/tok-sec "
"distorted, token total fine). Repos kept under the run root for analysis._\n")
print(f"wrote {OUT} ({len(rows)} runs, {len(ok_rows)} ok)")