diff --git a/analyze.py b/analyze.py index 958521c..212a650 100755 --- a/analyze.py +++ b/analyze.py @@ -9,7 +9,7 @@ raw per-run table. Stats are over SUCCESSFUL runs. Usage: python3 analyze.py [data-file] [out-file] """ -import sys, statistics as st +import sys, statistics as st, glob, os DATA = sys.argv[1] if len(sys.argv) > 1 else "RESULTS-campaign.md.data" OUT = sys.argv[2] if len(sys.argv) > 2 else "RESULTS-campaign.md" @@ -27,6 +27,21 @@ for line in open(DATA): r["tok_per_commit"] = r["total"] / r["commits"] if r["commits"] else 0.0 rows.append(r) +# flag runs whose watchdog log shows a usage-limit hit — their duration (and thus tok/sec) is +# inflated by the idle pause, even though the token total is unaffected. Look in the newest campaign +# run-root (repos are kept). +_roots = sorted(glob.glob("/tmp/ao-campaign-*")) +_root = _roots[-1] if _roots else "" +def _limit_hit(v, rep): + for wl in glob.glob(f"{_root}/{v}/r{rep}/.ao-state/*watchdog*.log"): + try: + if "limit hit" in open(wl, errors="ignore").read(): return True + except OSError: + pass + return False +for r in rows: + r["limit"] = "LIMIT" if _limit_hit(r["v"], r["rep"]) else "" + variants = [] for r in rows: if r["v"] not in variants: @@ -63,16 +78,20 @@ with open(OUT, "w") as o: f"{f(min(ts))} | {f(max(ts))} | {sp} |\n") o.write("\n## Efficiency ratios — min / median / max (successful runs)\n\n") + o.write("tokens/sec excludes runs flagged `LIMIT` (a usage-limit pause inflates duration without " + "adding tokens, so it would understate the true rate); tokens/LOC and tokens/commit are " + "unaffected and include all successful runs.\n\n") o.write("| variant | tokens / LOC | tokens / sec | tokens / commit |\n|---|--:|--:|--:|\n") + nolimit = lambda sub: [r["tok_per_sec"] for r in sub if not r["limit"]] for v in variants: sub = [r for r in ok_rows if r["v"] == v] if not sub: o.write(f"| {v} | — | — | — |\n"); continue o.write(f"| {v} | {mmm([r['tok_per_loc'] for r in sub])} | " - f"{mmm([r['tok_per_sec'] for r in sub])} | " + f"{mmm(nolimit(sub))} | " f"{mmm([r['tok_per_commit'] for r in sub])} |\n") if ok_rows: o.write(f"| **all** | {mmm([r['tok_per_loc'] for r in ok_rows])} | " - f"{mmm([r['tok_per_sec'] for r in ok_rows])} | " + f"{mmm(nolimit(ok_rows))} | " f"{mmm([r['tok_per_commit'] for r in ok_rows])} |\n") o.write("\n## Per-variant medians (commits / LOC / duration)\n\n") @@ -89,11 +108,13 @@ with open(OUT, "w") as o: o.write(f"| {lab} | {pearson([r['total'] for r in ok_rows], [r[k] for r in ok_rows]):+.2f} |\n") o.write("\n## All runs (raw)\n\n") - o.write("| variant | rep | ok | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n") - o.write("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n") + o.write("| variant | rep | ok | limit | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n") + o.write("|---|:--:|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n") for r in rows: - o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {f(r['total'])} | {r['dur']} | {r['commits']} | " - f"{r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | {fr(r['tok_per_commit'])} |\n") - o.write("\n_Stats over successful runs. Repos kept under the run root for analysis._\n") + o.write(f"| {r['v']} | {r['rep']} | {r['ok']} | {r['limit']} | {f(r['total'])} | {r['dur']} | " + f"{r['commits']} | {r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | " + f"{fr(r['tok_per_commit'])} |\n") + o.write("\n_Stats over successful runs. `LIMIT` = the run hit a usage-limit pause (duration/tok-sec " + "distorted, token total fine). Repos kept under the run root for analysis._\n") print(f"wrote {OUT} ({len(rows)} runs, {len(ok_rows)} ok)")