#!/usr/bin/env python3
"""Analyse the campaign data file into RESULTS-campaign.md.

Reads RESULTS-campaign.md.data (TSV: variant rep success builder adversary total
duration_s commits loc) and writes a full report: per-variant token distributions,
median commits/LOC/duration, the normalised EFFICIENCY RATIOS (tokens/LOC,
tokens/sec, tokens/commit) with min–median–max ranges, correlations, and the full
raw per-run table. Stats are over SUCCESSFUL runs.

Usage: python3 analyze.py [data-file] [out-file]
"""
# Provenance: recovered from a whitespace-collapsed `git format-patch` mail.
#   Commit:  33eeb3ce6b3069785907f53c4b831972c1d7928f
#   Author:  mfowler, Mon, 15 Jun 2026 00:15:46 +0000
#   Subject: feat: analyze.py — efficiency ratios (tokens/LOC, tokens/sec,
#            tokens/commit)
# Commit notes: standalone analysis over RESULTS-campaign.md.data (safe:
# independent of the live runner). Adds the normalised efficiency ratios per run
# with min/median/max per variant, alongside the token distributions, commit/LOC
# medians, correlations, and full raw table.
# Run: python3 analyze.py (regenerates RESULTS-campaign.md).
# Orig baseline (5 runs): tokens/LOC ~25k–34k, tokens/sec ~11.3k–14.0k.

import math
import statistics as st
import sys

# Default input / output paths, used when the matching CLI argument is absent.
DATA = "RESULTS-campaign.md.data"
OUT = "RESULTS-campaign.md"

# Number of tab-separated columns in a valid data row.
FIELD_COUNT = 9

# (derived ratio key, denominator key) pairs computed for every run.
RATIOS = (
    ("tok_per_loc", "loc"),
    ("tok_per_sec", "dur"),
    ("tok_per_commit", "commits"),
)


def f(n):
    """Format *n* with thousands separators (``1234567`` -> ``1,234,567``)."""
    return f"{n:,}"


def fr(x):
    """Format *x* rounded to a whole number, with thousands separators."""
    return f"{x:,.0f}"


def frr(x):
    """Format *x* with exactly two decimal places."""
    return f"{x:.2f}"


def mmm(xs, fn=fr):
    """Return ``"min / median / max"`` of *xs*, each value formatted by *fn*.

    An empty *xs* yields three em-dash placeholders. When every value is
    integral the median is truncated to ``int`` before formatting.
    """
    if not xs:
        return "— / — / —"
    med = st.median(xs)
    if all(float(x).is_integer() for x in xs):
        med = int(med)
    return f"{fn(min(xs))} / {fn(med)} / {fn(max(xs))}"


def pearson(a, b):
    """Return the Pearson correlation coefficient of sequences *a* and *b*.

    Returns NaN when fewer than two points are given or when either series
    has zero variance (the coefficient is undefined in both cases).
    """
    if len(a) < 2:
        return float("nan")
    ma, mb = st.mean(a), st.mean(b)
    den = (sum((x - ma) ** 2 for x in a) * sum((y - mb) ** 2 for y in b)) ** 0.5
    if not den:
        return float("nan")
    return sum((x - ma) * (y - mb) for x, y in zip(a, b)) / den


def parse_row(line):
    """Parse one TSV line into a run dict, adding the three efficiency ratios.

    Returns ``None`` when the line does not have exactly ``FIELD_COUNT``
    columns. Raises ``ValueError`` when a numeric column is not an integer.
    A ratio whose denominator is zero is recorded as ``0.0``.
    """
    parts = line.rstrip("\n").split("\t")
    if len(parts) != FIELD_COUNT:
        return None
    v, rep, ok, b, a, tot, dur, commits, loc = parts
    r = dict(v=v, rep=int(rep), ok=ok, builder=int(b), adversary=int(a),
             total=int(tot), dur=int(dur), commits=int(commits), loc=int(loc))
    for key, denom in RATIOS:
        r[key] = r["total"] / r[denom] if r[denom] else 0.0
    return r


def load_rows(path):
    """Read every parseable run from the TSV file at *path*.

    Lines with the wrong column count are skipped silently; lines with the
    right column count but non-numeric fields (e.g. a header row) are skipped
    with a warning on stderr instead of aborting the whole analysis.
    """
    rows = []
    with open(path, encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            try:
                r = parse_row(line)
            except ValueError as exc:
                print(f"{path}:{lineno}: skipping non-numeric row ({exc})",
                      file=sys.stderr)
                continue
            if r is not None:
                rows.append(r)
    return rows


def _fmt_corr(x):
    """Format a correlation with sign and two decimals; em-dash when undefined."""
    return "—" if math.isnan(x) else f"{x:+.2f}"


def _ratio_cells(sub):
    """Return the three min/median/max ratio cells for the runs in *sub*."""
    return " | ".join(mmm([r[key] for r in sub]) for key, _ in RATIOS)


def render_report(rows):
    """Render the full Markdown report for *rows* and return it as a string.

    *rows* are dicts as produced by ``parse_row``. Summary statistics use only
    runs whose ``ok`` field equals ``"YES"``; the raw table lists every run.
    Variants appear in first-seen order.
    """
    ok_rows = [r for r in rows if r["ok"] == "YES"]
    variants = list(dict.fromkeys(r["v"] for r in rows))
    runs_per_variant = dict.fromkeys(variants, 0)
    ok_by_variant = {v: [] for v in variants}
    for r in rows:
        runs_per_variant[r["v"]] += 1
        if r["ok"] == "YES":
            ok_by_variant[r["v"]].append(r)

    out = []
    w = out.append

    w("# Full-harness benchmark — campaign analysis\n\n")
    w("Real `agents.py up` Builder/Adversary loop pair + watchdog through the 3-phase calculator "
      "to SEQUENCE-COMPLETE. Both loops on Sonnet. Tokens summed from each loop's session "
      "transcript; commits = work-repo commit count; LOC = non-blank `calc/*.py` lines (code + "
      f"tests). {len(ok_rows)} successful runs of {len(rows)} total.\n\n")

    w("## Per-variant total tokens (successful runs)\n\n")
    w("| variant | runs(ok) | median | mean | min | max | spread |\n|---|:--:|--:|--:|--:|--:|--:|\n")
    for v in variants:
        ts = [r["total"] for r in ok_by_variant[v]]
        n = runs_per_variant[v]
        if not ts:
            w(f"| {v} | 0/{n} | — | — | — | — | — |\n")
            continue
        # Spread is max/min; undefined when the smallest run used 0 tokens.
        sp = f"{max(ts)/min(ts):.2f}x" if min(ts) else "—"
        w(f"| {v} | {len(ts)}/{n} | {f(int(st.median(ts)))} | {f(int(st.mean(ts)))} | "
          f"{f(min(ts))} | {f(max(ts))} | {sp} |\n")

    w("\n## Efficiency ratios — min / median / max (successful runs)\n\n")
    w("| variant | tokens / LOC | tokens / sec | tokens / commit |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = ok_by_variant[v]
        if not sub:
            w(f"| {v} | — | — | — |\n")
            continue
        w(f"| {v} | {_ratio_cells(sub)} |\n")
    if ok_rows:
        w(f"| **all** | {_ratio_cells(ok_rows)} |\n")

    w("\n## Per-variant medians (commits / LOC / duration)\n\n")
    w("| variant | median commits | median LOC | median dur(s) |\n|---|--:|--:|--:|\n")
    for v in variants:
        sub = ok_by_variant[v]
        if not sub:
            w(f"| {v} | — | — | — |\n")
            continue
        w(f"| {v} | {int(st.median([r['commits'] for r in sub]))} | "
          f"{int(st.median([r['loc'] for r in sub]))} | {int(st.median([r['dur'] for r in sub]))} |\n")

    w(f"\n## Correlations with total tokens (pooled, n={len(ok_rows)})\n\n")
    w("| tokens vs | Pearson r |\n|---|--:|\n")
    totals = [r["total"] for r in ok_rows]
    for k, lab in [("dur", "duration"), ("commits", "commits"), ("loc", "LOC")]:
        w(f"| {lab} | {_fmt_corr(pearson(totals, [r[k] for r in ok_rows]))} |\n")

    w("\n## All runs (raw)\n\n")
    w("| variant | rep | ok | total | dur(s) | commits | LOC | tok/LOC | tok/sec | tok/commit |\n")
    w("|---|:--:|:--:|--:|--:|--:|--:|--:|--:|--:|\n")
    for r in rows:
        w(f"| {r['v']} | {r['rep']} | {r['ok']} | {f(r['total'])} | {r['dur']} | {r['commits']} | "
          f"{r['loc']} | {fr(r['tok_per_loc'])} | {fr(r['tok_per_sec'])} | {fr(r['tok_per_commit'])} |\n")
    w("\n_Stats over successful runs. Repos kept under the run root for analysis._\n")

    return "".join(out)


def main(argv=None):
    """Command-line entry point: ``analyze.py [data-file] [out-file]``.

    *argv* is the argument list without the program name; it defaults to
    ``sys.argv[1:]``. Returns 0 after writing the report.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    data_path = args[0] if len(args) > 0 else DATA
    out_path = args[1] if len(args) > 1 else OUT

    rows = load_rows(data_path)
    # The report contains non-ASCII punctuation, so pin the encoding rather
    # than depend on the locale default.
    with open(out_path, "w", encoding="utf-8") as o:
        o.write(render_report(rows))

    n_ok = sum(1 for r in rows if r["ok"] == "YES")
    print(f"wrote {out_path} ({len(rows)} runs, {n_ok} ok)")
    return 0


if __name__ == "__main__":
    main()