diff --git a/README.md b/README.md index 4600d02..39a8b1d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ agent-log.py render claude JSONL transcripts into clean, greppable logs agents.example.toml a self-contained 2-agent example project prompts/ generic role + kickoff templates (builder / adversary / kickoff) smoke.sh bring the example up + tear it down in an isolated sandbox, then clean up +tests/ the test suite — unit tests + isolated live backend smokes + a runner flake.nix/.lock a Nix devShell with the runtime deps (python311, tmux, git) ``` @@ -315,6 +316,39 @@ documents this in its banner. --- +## Testing + +The `tests/` directory holds the harness's own test suite. One runner drives everything: + +```bash +nix develop -c ./tests/run.sh # unit tests always; live backend smokes when available +# or just: ./tests/run.sh # (python3 + tmux must be on PATH) +``` + +What it runs: + +- **Unit tests** (`tests/test_unit.py`) — pure logic, **no agents spawned, no live tmux sessions**. + Cover config load + defaults merge, kickoff-template assembly, the phase machine (advance on the + done marker, idempotent sequence-complete, append-a-phase resumes), usage-limit reset-banner + parsing, `WAITING-UNTIL` / stall parsing, and the per-backend activity detectors (claude + + opencode footers). Always run; a failure fails the suite. Run them alone with + `python3 -m unittest discover -s tests` (or `python3 tests/test_unit.py`). +- **Live backend smokes** (`tests/smoke_claude.sh`, `tests/smoke_opencode.sh`) — each brings a + throwaway scratch project up **through `agents.py`** on a real backend, in a fully isolated + sandbox (its own unique `session_prefix`, a temp `log_dir`, and — for opencode — a dedicated + server on a non-default port `AOTEST_OC_PORT`, default `4097`), confirms the session attaches and + `status` reports it RUNNING, then `down`s it and cleans up (no leftover sessions, port freed). 
+ Each **SKIPs gracefully** (exit 0) when its backend's binary or creds are unavailable. Useful env: + `CLAUDE_BIN` / `OPENCODE_BIN`, `AOTEST_MODEL`, `AOTEST_OC_PORT`, `AOTEST_OC_CREDS`. +- **Isolation sanity** — after the live runs, the runner asserts no `aotest-*` tmux sessions leaked + and lists which live `cc-ci-*` sessions are present (informational only; nothing is asserted). + +The smokes are safe by construction: a unique per-run session prefix (never `cc-ci-` or any real +project's), a dedicated opencode port (never `4096`), and a cleanup trap that fires on success, +failure, and Ctrl+C. + +--- + ## Adding things - **Add an agent** — add an `[[agent]]` block; `agents.py up `. No code change. diff --git a/tests/run.sh b/tests/run.sh new file mode 100755 index 0000000..560b0c8 --- /dev/null +++ b/tests/run.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# ───────────────────────────────────────────────────────────────────────────── +# agent-orchestrator test runner. +# +# • UNIT tests — always run (pure logic, no agents spawned). A failure fails the suite. +# • CLAUDE smoke — live, run when the `claude` CLI is available; SKIPs otherwise. +# • OPENCODE smoke — live, run when `opencode` + creds are available; SKIPs otherwise. +# • ISOLATION sanity — after the live runs: assert no leftover aotest-* tmux sessions, and +# list which live cc-ci-* sessions are present (informational only, not asserted). +# +# Run inside the devShell: nix develop -c ./tests/run.sh +# or simply: ./tests/run.sh (python3 + tmux must be on PATH) +# +# Exit: 0 = all run tests passed (skips are OK); 1 = a unit test or a live smoke FAILED, or a +# leftover aotest-* session was found. +# ───────────────────────────────────────────────────────────────────────────── +set -uo pipefail + +HERE="$(cd "$(dirname "$0")" && pwd)" +REPO="$(cd "$HERE/.." 
&& pwd)" +RC=0 +UNIT=FAIL CLAUDE=SKIP OPENCODE=SKIP ISO=PASS + +echo "######################################################################" +echo "# agent-orchestrator test suite" +echo "######################################################################" + +# ── unit tests (always) ─────────────────────────────────────────────────────────── +echo; echo ">>> UNIT TESTS" +if python3 -m unittest discover -s "$HERE" -p 'test_*.py' -v; then + UNIT=PASS +else + UNIT=FAIL; RC=1 +fi + +# helper: run a smoke script, classify its result from its output +run_smoke() { + local label="$1" script="$2"; shift 2 + echo; echo ">>> ${label} SMOKE" + local out + out="$(bash "$script" 2>&1)"; local rc=$? + echo "$out" + if echo "$out" | grep -q "BACKEND SMOKE: PASS"; then echo "PASS"; return 0; fi + if [ "$rc" -eq 0 ] && echo "$out" | grep -qE "^SKIP:"; then echo "SKIP"; return 2; fi + echo "FAIL"; return 1 +} + +# ── live smoke tests (when backends available) ────────────────────────────────────── +run_smoke "CLAUDE" "$HERE/smoke_claude.sh"; case $? in 0) CLAUDE=PASS;; 2) CLAUDE=SKIP;; *) CLAUDE=FAIL; RC=1;; esac +run_smoke "OPENCODE" "$HERE/smoke_opencode.sh"; case $? 
in 0) OPENCODE=PASS;; 2) OPENCODE=SKIP;; *) OPENCODE=FAIL; RC=1;; esac + +# ── isolation sanity ──────────────────────────────────────────────────────────────── +echo; echo ">>> ISOLATION SANITY" +if command -v tmux >/dev/null 2>&1; then + leftover="$(tmux ls 2>/dev/null | sed 's/:.*//' | grep '^aotest-' || true)" + if [ -n "$leftover" ]; then + echo " FAIL: leftover aotest-* sessions: $leftover"; ISO=FAIL; RC=1 + else + echo " PASS: no leftover aotest-* tmux sessions" + fi + intact="" + for s in cc-ci-orchestrator cc-ci-watchdog cc-ci-assistant3; do + tmux has-session -t "=$s" 2>/dev/null && intact="$intact $s" + done + echo " info: live cc-ci sessions present:${intact:- (none — not a cc-ci host)}" +else + echo " (tmux not on PATH — isolation sanity skipped)" +fi + +# ── summary ───────────────────────────────────────────────────────────────────────── +echo; echo "######################################################################" +echo "# SUMMARY: unit=$UNIT claude=$CLAUDE opencode=$OPENCODE isolation=$ISO" +echo "######################################################################" +[ "$RC" -eq 0 ] && echo "ALL RUN TESTS PASSED (skips are OK)" || echo "SUITE FAILED" +exit "$RC" diff --git a/tests/smoke_claude.sh b/tests/smoke_claude.sh new file mode 100755 index 0000000..175f5a7 --- /dev/null +++ b/tests/smoke_claude.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# ───────────────────────────────────────────────────────────────────────────── +# Isolated LIVE smoke of the CLAUDE backend, driven entirely through the harness. +# +# Brings a throwaway scratch project (its OWN session_prefix "aotest-c-$$-" and a temporary +# log_dir) up through `agents.py up`, on the real `claude` CLI: +# • the harness builds the claude launch command (arg delivery + remote-control + model flag), +# • the agent attaches in tmux (claude TUI alive, not an instant crash), +# • `agents.py status` reports it RUNNING, +# • `agents.py down` tears it down cleanly — no leftover sessions. 
+# +# SAFE BY CONSTRUCTION — never touches the live cc-ci-* sessions: +# • a unique per-run session prefix (NOT "cc-ci-") +# • cleans up everything it creates on exit (even on Ctrl+C / error). +# +# Usage: bash tests/smoke_claude.sh +# Env: CLAUDE_BIN (default: `claude` on PATH, else ~/.local/bin/claude) +# AOTEST_MODEL (default: claude-haiku-4-5 — a cheap model for the trivial probe) +# Exit: 0 = PASS or SKIP (claude unavailable); 1 = FAIL. +# ───────────────────────────────────────────────────────────────────────────── +set -uo pipefail + +HERE="$(cd "$(dirname "$0")" && pwd)" +REPO="$(cd "$HERE/.." && pwd)" +CLAUDE_BIN="${CLAUDE_BIN:-$(command -v claude 2>/dev/null || echo "$HOME/.local/bin/claude")}" +MODEL="${AOTEST_MODEL:-claude-haiku-4-5}" +PREFIX="aotest-c-$$-" +SANDBOX="$(mktemp -d)" +CFG="$SANDBOX/agents.toml" +FAILED=0 + +pass(){ echo " PASS: $*"; } +fail(){ echo " FAIL: $*"; FAILED=1; } + +cleanup(){ + local rc=$? + python3 "$REPO/agents.py" --config "$CFG" down probe >/dev/null 2>&1 || true + if command -v tmux >/dev/null 2>&1; then + tmux ls 2>/dev/null | sed 's/:.*//' | grep "^${PREFIX}" | while read -r s; do + tmux kill-session -t "=$s" 2>/dev/null || true + done || true + fi + rm -rf "$SANDBOX" + exit "$rc" +} +trap cleanup EXIT INT TERM + +echo "=== claude backend smoke (isolated: prefix=${PREFIX}) ===" + +# 0 — preconditions (SKIP, not FAIL, when claude/tmux can't run here) +command -v tmux >/dev/null 2>&1 || { echo "SKIP: tmux not on PATH (run inside 'nix develop')"; exit 0; } +[ -x "$CLAUDE_BIN" ] || command -v "$CLAUDE_BIN" >/dev/null 2>&1 \ + || { echo "SKIP: claude binary not found ($CLAUDE_BIN)"; exit 0; } + +# 1 — isolated sandbox config (unique prefix + temp log_dir; one trivial persistent probe) +cat > "$CFG" </dev/null; then + cmd=$(tmux display-message -p -t "=${PREFIX}probe:" '#{pane_current_command}' 2>/dev/null) + pass "session ${PREFIX}probe created via agents.py (pane command: ${cmd})" +else + fail "${PREFIX}probe session was not 
created"; echo "=== RESULT: FAIL ==="; exit 1 +fi + +# 4 — claude actually attached (TUI alive), not an instant crash +sleep 6 +cmd=$(tmux display-message -p -t "=${PREFIX}probe:" '#{pane_current_command}' 2>/dev/null) +pane=$(tmux capture-pane -p -t "=${PREFIX}probe:" 2>/dev/null) +if [ "$cmd" = "claude" ] || echo "$pane" | grep -qiE "esc to interrupt|bypass permissions|READY|claude|❯|welcome"; then + pass "claude TUI attached + alive (driven entirely by agents.py)" +else + fail "no claude TUI in pane (cmd=${cmd}); tail: $(echo "$pane" | grep -vE '^\s*$' | tail -3)" +fi + +# 5 — status reports it RUNNING +if python3 "$REPO/agents.py" --config "$CFG" status | grep -E '^\s*probe\b' | grep -q RUNNING; then + pass "agents.py status reports probe RUNNING" +else + fail "agents.py status did not report probe RUNNING" +fi + +# 6 — lifecycle: down removes it cleanly +python3 "$REPO/agents.py" --config "$CFG" down probe >/dev/null 2>&1 +sleep 2 +if tmux has-session -t "=${PREFIX}probe" 2>/dev/null; then + fail "${PREFIX}probe still alive after agents.py down" +else + pass "agents.py down cleanly removed the session" +fi + +if [ "$FAILED" = 0 ]; then echo "=== CLAUDE BACKEND SMOKE: PASS ==="; exit 0 +else echo "=== CLAUDE BACKEND SMOKE: FAIL ==="; exit 1; fi diff --git a/tests/smoke_opencode.sh b/tests/smoke_opencode.sh new file mode 100755 index 0000000..178fad9 --- /dev/null +++ b/tests/smoke_opencode.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# ───────────────────────────────────────────────────────────────────────────── +# Isolated LIVE smoke of the OPENCODE backend, driven entirely through the harness. 
+# +# Generalizes the cc-ci `test-opencode.sh` isolation pattern onto the agent-orchestrator harness: +# stands up a DEDICATED opencode server on its own port (≠ 4096), then brings a throwaway scratch +# project up through `agents.py up` on the opencode backend: +# • the harness builds the opencode attach command + the post-connect bootstrap ping, +# • the agent attaches to the server (opencode TUI alive), +# • `agents.py status` reports it RUNNING, +# • `agents.py down` tears it down cleanly — server killed, no leftover sessions, port freed. +# +# SAFE BY CONSTRUCTION — never touches the live cc-ci-* sessions or the live opencode server: +# • a unique per-run session prefix (NOT "cc-ci-") +# • its OWN opencode server on AOTEST_OC_PORT (default 4097, never 4096) +# • cleans up everything it creates on exit (even on Ctrl+C / error). +# +# Usage: bash tests/smoke_opencode.sh +# Env: OPENCODE_BIN (default: `opencode` on PATH, else ~/.local/bin/opencode) +# AOTEST_OC_PORT (default 4097 — MUST differ from the live 4096) +# AOTEST_OC_CREDS (default /srv/cc-ci/.testenv — sourced as the backend preamble) +# AOTEST_MODEL (default: opencode's own configured default) +# Exit: 0 = PASS or SKIP (opencode / creds / server unavailable); 1 = FAIL. +# ───────────────────────────────────────────────────────────────────────────── +set -uo pipefail + +HERE="$(cd "$(dirname "$0")" && pwd)" +REPO="$(cd "$HERE/.." && pwd)" +OCBIN="${OPENCODE_BIN:-$(command -v opencode 2>/dev/null || echo "$HOME/.local/bin/opencode")}" +PORT="${AOTEST_OC_PORT:-4097}" +SERVER="http://127.0.0.1:${PORT}" +CREDS="${AOTEST_OC_CREDS:-/srv/cc-ci/.testenv}" +MODEL="${AOTEST_MODEL:-}" +PREFIX="aotest-o-$$-" +SANDBOX="$(mktemp -d)" +CFG="$SANDBOX/agents.toml" +SRVLOG="$SANDBOX/server.log" +SERVER_PID="" +FAILED=0 + +pass(){ echo " PASS: $*"; } +fail(){ echo " FAIL: $*"; FAILED=1; } + +cleanup(){ + local rc=$? 
+ python3 "$REPO/agents.py" --config "$CFG" down probe >/dev/null 2>&1 || true + if command -v tmux >/dev/null 2>&1; then + tmux ls 2>/dev/null | sed 's/:.*//' | grep "^${PREFIX}" | while read -r s; do + tmux kill-session -t "=$s" 2>/dev/null || true + done || true + fi + # kill the server subshell AND the opencode serve child it forked (the subshell is not the + # listener — target the listener by our unique port so the port is actually freed). + [ -n "$SERVER_PID" ] && kill "$SERVER_PID" 2>/dev/null || true + pkill -f "opencode serve.*--port ${PORT}\b" 2>/dev/null || true + for _ in 1 2 3 4 5; do + ss -ltn 2>/dev/null | grep -q ":${PORT} " || break + sleep 1 + done + rm -rf "$SANDBOX" + exit "$rc" +} +trap cleanup EXIT INT TERM + +echo "=== opencode backend smoke (isolated: prefix=${PREFIX} port=${PORT}) ===" + +# 0 — preconditions (SKIP, not FAIL, when the environment can't run opencode) +command -v tmux >/dev/null 2>&1 || { echo "SKIP: tmux not on PATH (run inside 'nix develop')"; exit 0; } +[ "$PORT" != "4096" ] || { echo "FAIL: refusing port 4096 (the live cc-ci opencode port)"; exit 1; } +[ -x "$OCBIN" ] || command -v "$OCBIN" >/dev/null 2>&1 \ + || { echo "SKIP: opencode binary not found ($OCBIN)"; exit 0; } +[ -f "$CREDS" ] || { echo "SKIP: opencode creds file missing ($CREDS)"; exit 0; } + +# 1 — isolated sandbox config (unique prefix + temp log_dir + dedicated server) +cat > "$CFG" <"$SRVLOG" 2>&1 & +SERVER_PID=$! +for _ in $(seq 1 30); do ss -ltn 2>/dev/null | grep -q ":${PORT} " && break; sleep 1; done +if ! ss -ltn 2>/dev/null | grep -q ":${PORT} "; then + echo "SKIP: opencode server did not come up on :${PORT} (see ${SRVLOG})"; exit 0 +fi +pass "dedicated opencode server listening on :${PORT}" + +# 3 — bring the probe up THROUGH the harness (attaches to OUR server) +if ! python3 "$REPO/agents.py" --config "$CFG" up probe; then + fail "agents.py up probe errored"; echo "=== RESULT: FAIL ==="; exit 1 +fi + +# 4 — session created? 
+sleep 4 +if tmux has-session -t "=${PREFIX}probe" 2>/dev/null; then + cmd=$(tmux display-message -p -t "=${PREFIX}probe:" '#{pane_current_command}' 2>/dev/null) + pass "session ${PREFIX}probe created via agents.py (pane command: ${cmd})" +else + fail "${PREFIX}probe session was not created"; echo "=== RESULT: FAIL ==="; exit 1 +fi + +# 5 — opencode TUI attached + alive, not an instant crash +sleep 12 +pane=$(tmux capture-pane -p -t "=${PREFIX}probe:" 2>/dev/null) +if echo "$pane" | grep -qiE "opencode|build ·|gpt|claude|READY|esc interrupt|ctrl\+p|ctrl\+"; then + pass "opencode TUI attached + alive (driven entirely by agents.py)" +else + fail "no opencode TUI/response in pane; tail: $(echo "$pane" | grep -vE '^\s*$' | tail -3)" + echo " (server log tail:) $(tail -3 "$SRVLOG" 2>/dev/null)" +fi + +# 6 — status reports it RUNNING +if python3 "$REPO/agents.py" --config "$CFG" status | grep -E '^\s*probe\b' | grep -q RUNNING; then + pass "agents.py status reports probe RUNNING" +else + fail "agents.py status did not report probe RUNNING" +fi + +# 7 — lifecycle: down removes it cleanly +python3 "$REPO/agents.py" --config "$CFG" down probe >/dev/null 2>&1 +sleep 2 +if tmux has-session -t "=${PREFIX}probe" 2>/dev/null; then + fail "${PREFIX}probe still alive after agents.py down" +else + pass "agents.py down cleanly removed the session" +fi + +if [ "$FAILED" = 0 ]; then echo "=== OPENCODE BACKEND SMOKE: PASS ==="; exit 0 +else echo "=== OPENCODE BACKEND SMOKE: FAIL ==="; exit 1; fi diff --git a/tests/test_unit.py b/tests/test_unit.py new file mode 100755 index 0000000..646d3ac --- /dev/null +++ b/tests/test_unit.py @@ -0,0 +1,526 @@ +#!/usr/bin/env python3 +"""Unit tests for the agent-orchestrator harness (agents.py). + +Pure-logic tests — NO agent CLIs spawned, NO live tmux sessions created. Every test builds a +throwaway config + fixture files in a tempdir and exercises the harness functions directly. 
+The one function that would spawn sessions (phase_advance_check → start/stop_loops) is tested +with those two hooks (and handoff_reset) monkeypatched to recorders, so the phase-machine *logic* is covered without +launching anything. + +Run: python3 -m unittest tests.test_unit (from repo root) + or python3 tests/test_unit.py +""" + +import os +import sys +import time +import textwrap +import tempfile +import shutil +import unittest +from datetime import datetime, timedelta +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT)) +import agents # noqa: E402 + + +# ── shared fixture config ──────────────────────────────────────────────────────── + +BASE_TOML = r""" +[watchdog] +signal_interval = 30 +heavy_interval = 300 +limit_probe_fallback = 300 +limit_reset_slack = 45 +stall_grace = 180 + +[defaults] +session_prefix = "aotest-ut-" +log_dir = "state" +backend = "claude" +model = "claude-sonnet-4-6" +watch = "none" + +[backend.claude] +bin = "claude" +flags = "--dangerously-skip-permissions" +remote_control = true +supports_resume = true +prompt_delivery = "arg" +process_name = "claude" +submit_key = "Enter" +stall_idle = 300 +active_re = "esc to interrupt|Running tool|\\u00b7 \\d+" +limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)" +fatal_re = "redacted_thinking|blocks cannot be modified" + +[backend.opencode] +bin = "opencode" +attach = "{bin} attach {server} --dir {dir}" +server = "http://127.0.0.1:4096" +supports_resume = false +prompt_delivery = "ping" +process_name = "opencode" +footer_ui = true +log_grace = 180 +connect_delay = 12 +submit_key = "C-m" +stall_idle = 900 +active_re = "esc interrupt|thinking|inferring|running tool|tool call|preparing patch|reading|searching" +limit_re = "usage limit|limit reached" + +[backend.demo] +bin = "echo up; exec sleep 100000" +prompt_delivery = "exec" + +[[agent]] +name = "builder" +kind = "loop" +role = "builder" +backend = "demo" + 
+[[agent]] +name = "adversary" +kind = "loop" +role = "adversary" +backend = "demo" + +[[agent]] +name = "cl" +kind = "persistent" +backend = "claude" +prompt = "hi" + +[[agent]] +name = "oc" +kind = "persistent" +backend = "opencode" +prompt = "hi" + +[[agent]] +name = "custom" +kind = "persistent" +session = "explicit-session" +model = "override-model" +dir = "/abs/somewhere" +backend = "demo" +prompt = "x" + +[[service]] +name = "svc" +command = "sleep 1" + +[loop] +state_file = "phase-idx" +resume_phase = true +auto_advance = true +done_marker = "## DONE" +kickoff_template = "prompts/kickoff.md" +roles_dir = "prompts" +handoff = { repo = ".", claim_pings = "adversary", review_pings = "builder", inboxes = ["ADVERSARY-INBOX.md", "BUILDER-INBOX.md"], state_subdir = "machine-docs" } +phases = [ + { id = "p1", plan = "PLAN1.md", status = "STATUS-p1.md" }, + { id = "p2", plan = "PLAN2.md", status = "STATUS-p2.md", models = { builder = "opus-x" } }, +] +""" + +KICKOFF_TMPL = "*** PROJECT PHASE: {phase_id} ***\nPLAN: {plan}\nSTATUS: {status}\nROLE: {role}\n---\n" +BUILDER_PROMPT = "You are the **Builder** agent. (builder role body marker)\n" +ADVERSARY_PROMPT = "You are the **Adversary** agent. 
(adversary role body marker)\n" + + +def _make_project(tmp, toml=BASE_TOML): + """Write a self-contained project (config + prompts + machine-docs) into tmp; return cfg path.""" + root = Path(tmp) + (root / "prompts").mkdir(parents=True, exist_ok=True) + (root / "machine-docs").mkdir(parents=True, exist_ok=True) + (root / "prompts" / "kickoff.md").write_text(KICKOFF_TMPL) + (root / "prompts" / "builder.md").write_text(BUILDER_PROMPT) + (root / "prompts" / "adversary.md").write_text(ADVERSARY_PROMPT) + cfg_path = root / "agents.toml" + cfg_path.write_text(toml) + return cfg_path + + +# ── config loading + defaults merge ──────────────────────────────────────────────── + +class TestConfigLoad(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="aotest-ut-") + self.cfg_path = _make_project(self.tmp) + self.cfg = agents.load_config(self.cfg_path) + + def tearDown(self): + shutil.rmtree(self.tmp, ignore_errors=True) + + def test_defaults_merge_into_agents(self): + b = self.cfg["agents"]["builder"] + self.assertEqual(b["session_prefix"], "aotest-ut-") + self.assertEqual(b["watch"], "none") # from defaults + self.assertEqual(b["kind"], "loop") # explicit + + def test_session_name_defaults_to_prefix_plus_name(self): + self.assertEqual(self.cfg["agents"]["builder"]["session"], "aotest-ut-builder") + + def test_explicit_session_overrides_prefix(self): + self.assertEqual(self.cfg["agents"]["custom"]["session"], "explicit-session") + + def test_per_agent_override_wins_over_default(self): + # default model is claude-sonnet-4-6; custom overrides + self.assertEqual(self.cfg["agents"]["custom"]["model"], "override-model") + self.assertEqual(self.cfg["agents"]["builder"]["model"], "claude-sonnet-4-6") + + def test_relative_dir_resolved_against_project_root(self): + # builder has no dir → defaults dir "." 
→ project_dir + self.assertEqual(self.cfg["agents"]["builder"]["dir"], self.cfg["project_dir"]) + + def test_absolute_dir_kept(self): + self.assertEqual(self.cfg["agents"]["custom"]["dir"], "/abs/somewhere") + + def test_log_dir_and_state_dir_resolved(self): + self.assertEqual(self.cfg["log_dir"], str(Path(self.cfg["project_dir"]) / "state")) + self.assertEqual(self.cfg["state_dir"], os.path.join(self.cfg["log_dir"], "state")) + self.assertTrue(Path(self.cfg["state_dir"]).is_dir()) # created on load + + def test_service_session_named(self): + self.assertIn("svc", self.cfg["services"]) + self.assertEqual(self.cfg["services"]["svc"]["session"], "aotest-ut-svc") + + def test_backend_of_resolves(self): + b = agents.backend_of(self.cfg, self.cfg["agents"]["cl"]) + self.assertEqual(b["prompt_delivery"], "arg") + self.assertEqual(b["submit_key"], "Enter") + + def test_backend_of_unknown_dies(self): + a = dict(self.cfg["agents"]["cl"]); a["backend"] = "nope" + with self.assertRaises(SystemExit): + agents.backend_of(self.cfg, a) + + def test_missing_session_prefix_dies(self): + bad = self.tmp + "/bad1" + p = _make_project(bad, toml='[defaults]\nlog_dir = "state"\n') + with self.assertRaises(SystemExit): + agents.load_config(p) + + def test_missing_log_dir_dies(self): + bad = self.tmp + "/bad2" + p = _make_project(bad, toml='[defaults]\nsession_prefix = "x-"\n') + with self.assertRaises(SystemExit): + agents.load_config(p) + + def test_env_override_model_single_invocation(self): + os.environ["AGENT_MODEL_cl"] = "env-only-model" + try: + cfg2 = agents.load_config(self.cfg_path) + self.assertEqual(cfg2["agents"]["cl"]["model"], "env-only-model") + finally: + del os.environ["AGENT_MODEL_cl"] + # without the env var the file value stands again + cfg3 = agents.load_config(self.cfg_path) + self.assertEqual(cfg3["agents"]["cl"]["model"], "claude-sonnet-4-6") + + +class TestExampleConfig(unittest.TestCase): + """The SHIPPED agents.example.toml must parse and define the documented 
shape.""" + def test_example_config_loads(self): + ex = REPO_ROOT / "agents.example.toml" + self.assertTrue(ex.exists(), "agents.example.toml missing from repo") + cfg = agents.load_config(ex) + self.assertIn("builder", cfg["agents"]) + self.assertIn("adversary", cfg["agents"]) + for be in ("demo", "claude", "opencode"): + self.assertIn(be, cfg["backends"], f"backend {be} missing from example") + self.assertEqual(len(agents.phases(cfg)), 2) + + +# ── kickoff-template assembly ────────────────────────────────────────────────────── + +class TestKickoff(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="aotest-ut-") + self.cfg = agents.load_config(_make_project(self.tmp)) + + def tearDown(self): + shutil.rmtree(self.tmp, ignore_errors=True) + + def test_kickoff_renders_slots_and_appends_role(self): + out = agents.build_loop_kickoff(self.cfg, self.cfg["agents"]["builder"]) + self.assertIn("PROJECT PHASE: p1", out) # phase_id slot filled (phase idx 0) + self.assertIn("PLAN: PLAN1.md", out) + self.assertIn("STATUS: STATUS-p1.md", out) + self.assertIn("ROLE: builder", out) + self.assertIn("builder role body marker", out) # role prompt appended + self.assertNotIn("{phase_id}", out) # no unrendered slot + self.assertNotIn("{role}", out) + + def test_kickoff_picks_correct_role_prompt(self): + out = agents.build_loop_kickoff(self.cfg, self.cfg["agents"]["adversary"]) + self.assertIn("adversary role body marker", out) + self.assertNotIn("builder role body marker", out) + + def test_agent_prompt_loop_returns_kickoff(self): + out = agents.agent_prompt(self.cfg, self.cfg["agents"]["builder"]) + self.assertIn("PROJECT PHASE: p1", out) + + def test_agent_prompt_persistent_returns_inline_prompt(self): + out = agents.agent_prompt(self.cfg, self.cfg["agents"]["cl"]) + self.assertEqual(out, "hi") + + def test_role_model_phase_override(self): + # phase p2 overrides builder model to opus-x; advance index to 1 + 
Path(agents.phase_idx_file(self.cfg)).write_text("1") + self.assertEqual(agents.role_model(self.cfg, self.cfg["agents"]["builder"]), "opus-x") + # adversary has no override → its configured/default model + self.assertEqual(agents.role_model(self.cfg, self.cfg["agents"]["adversary"]), + "claude-sonnet-4-6") + + +# ── phase machine ────────────────────────────────────────────────────────────────── + +class TestPhaseMachine(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="aotest-ut-") + self.cfg = agents.load_config(_make_project(self.tmp)) + self.md = Path(self.cfg["project_dir"]) / "machine-docs" + # monkeypatch the session-spawning hooks so the machine logic runs without tmux + self._orig = (agents.stop_loops, agents.start_loops, agents.handoff_reset) + self.calls = [] + agents.stop_loops = lambda cfg: self.calls.append("stop") + agents.start_loops = lambda cfg: self.calls.append("start") + agents.handoff_reset = lambda: self.calls.append("reset") + + def tearDown(self): + agents.stop_loops, agents.start_loops, agents.handoff_reset = self._orig + shutil.rmtree(self.tmp, ignore_errors=True) + + def _status(self, basename, text): + (self.md / basename).write_text(text) + + def test_phase_done_detects_marker(self): + self._status("STATUS-p1.md", "header\n## DONE\nall verified PASS\n") + self.assertTrue(agents.phase_done(self.cfg, "STATUS-p1.md")) + + def test_phase_done_rejects_placeholder_body(self): + self._status("STATUS-p1.md", "## DONE\nnot yet — written here only when complete\n") + self.assertFalse(agents.phase_done(self.cfg, "STATUS-p1.md")) + + def test_phase_done_false_when_no_marker(self): + self._status("STATUS-p1.md", "## In progress\nworking\n") + self.assertFalse(agents.phase_done(self.cfg, "STATUS-p1.md")) + + def test_phase_done_false_when_file_missing(self): + self.assertFalse(agents.phase_done(self.cfg, "STATUS-nope.md")) + + def test_cur_idx_reads_state_file(self): + Path(agents.phase_idx_file(self.cfg)).write_text("1") 
+ self.assertEqual(agents.cur_idx(self.cfg), 1) + + def test_advance_on_done(self): + Path(agents.phase_idx_file(self.cfg)).write_text("0") + self._status("STATUS-p1.md", "## DONE\nverified\n") + advanced = agents.phase_advance_check(self.cfg) + self.assertTrue(advanced) + self.assertEqual(agents.cur_idx(self.cfg), 1) # moved to p2 + self.assertIn("stop", self.calls) + self.assertIn("start", self.calls) + + def test_no_advance_when_not_done(self): + Path(agents.phase_idx_file(self.cfg)).write_text("0") + self._status("STATUS-p1.md", "## In progress\n") + self.assertFalse(agents.phase_advance_check(self.cfg)) + self.assertEqual(agents.cur_idx(self.cfg), 0) + self.assertEqual(self.calls, []) + + def test_sequence_complete_idempotent(self): + Path(agents.phase_idx_file(self.cfg)).write_text("1") # last phase + self._status("STATUS-p2.md", "## DONE\nverified\n") + marker = Path(self.cfg["log_dir"]) / "SEQUENCE-COMPLETE" + # first call: completes the sequence + self.assertTrue(agents.phase_advance_check(self.cfg)) + self.assertTrue(marker.exists()) + self.assertEqual(self.calls.count("stop"), 1) + # second call: idempotent — no re-stop, returns False + self.assertFalse(agents.phase_advance_check(self.cfg)) + self.assertEqual(self.calls.count("stop"), 1) + + def test_append_phase_clears_marker_and_resumes(self): + # simulate "sequence already complete", then a 3rd phase appended to the config + Path(agents.phase_idx_file(self.cfg)).write_text("1") + self._status("STATUS-p2.md", "## DONE\nverified\n") + marker = Path(self.cfg["log_dir"]) / "SEQUENCE-COMPLETE" + marker.write_text("stale completion\n") + self.cfg["loop"]["phases"].append( + {"id": "p3", "plan": "PLAN3.md", "status": "STATUS-p3.md"}) + advanced = agents.phase_advance_check(self.cfg) + self.assertTrue(advanced) + self.assertEqual(agents.cur_idx(self.cfg), 2) # resumed onto p3 + self.assertFalse(marker.exists()) # stale marker cleared + self.assertIn("start", self.calls) + + def test_custom_done_marker(self): 
+ self.cfg["loop"]["done_marker"] = "## SHIPPED" + self._status("STATUS-p1.md", "## SHIPPED\nverified\n") + self.assertTrue(agents.phase_done(self.cfg, "STATUS-p1.md")) + self.assertFalse(agents.phase_done(self.cfg, "STATUS-p2.md")) + + +# ── usage-limit banner reset parsing ─────────────────────────────────────────────── + +class TestLimitParsing(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="aotest-ut-") + self.cfg = agents.load_config(_make_project(self.tmp)) + + def tearDown(self): + shutil.rmtree(self.tmp, ignore_errors=True) + + def test_parse_reset_pm(self): + ep = agents._parse_reset_epoch("You've hit your limit · resets at 10pm") + self.assertIsNotNone(ep) + self.assertEqual(datetime.fromtimestamp(ep).hour, 22) + + def test_parse_reset_am_with_minutes(self): + ep = agents._parse_reset_epoch("resets 3:30am") + self.assertIsNotNone(ep) + dt = datetime.fromtimestamp(ep) + self.assertEqual((dt.hour, dt.minute), (3, 30)) + + def test_parse_reset_12am_is_midnight(self): + ep = agents._parse_reset_epoch("resets at 12am") + self.assertEqual(datetime.fromtimestamp(ep).hour, 0) + + def test_parse_reset_invalid_hour_none(self): + self.assertIsNone(agents._parse_reset_epoch("resets at 25")) + + def test_parse_reset_no_match_none(self): + self.assertIsNone(agents._parse_reset_epoch("everything is fine here")) + + def test_parse_reset_picks_last_match(self): + ep = agents._parse_reset_epoch("resets at 9am ... 
actually resets at 11am") + self.assertEqual(datetime.fromtimestamp(ep).hour, 11) + + def test_next_limit_until_unparsable_fallback(self): + now = time.time() + until, parsed = agents._next_limit_until(self.cfg, "limit reached, no time given", now) + self.assertFalse(parsed) + self.assertEqual(int(until), int(now + 300)) # limit_probe_fallback + + def test_next_limit_until_within_window_uses_banner(self): + now = time.time() + t = datetime.now() + timedelta(hours=2) + h12 = t.hour % 12 or 12 + ampm = "am" if t.hour < 12 else "pm" + banner = f"weekly limit · resets at {h12}:{t.minute:02d}{ampm}" + until, parsed = agents._next_limit_until(self.cfg, banner, now) + self.assertTrue(parsed) + self.assertGreater(until, now) + self.assertLessEqual(until - now, 6 * 3600 + 60) # within 6h window (+slack) + + def test_next_limit_until_far_future_falls_back(self): + now = time.time() + t = datetime.now() + timedelta(hours=7) # > 6h window + h12 = t.hour % 12 or 12 + ampm = "am" if t.hour < 12 else "pm" + banner = f"limit · resets at {h12}:{t.minute:02d}{ampm}" + until, parsed = agents._next_limit_until(self.cfg, banner, now) + self.assertFalse(parsed) + self.assertEqual(int(until), int(now + 300)) + + +# ── stall / WAITING-UNTIL parsing ────────────────────────────────────────────────── + +class TestWaitingUntil(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="aotest-ut-") + self.cfg = agents.load_config(_make_project(self.tmp)) + self.claude_agent = self.cfg["agents"]["cl"] # non-footer backend + self.oc_agent = self.cfg["agents"]["oc"] # footer_ui backend + + def tearDown(self): + shutil.rmtree(self.tmp, ignore_errors=True) + + def test_non_footer_finds_marker_anywhere(self): + pane = "blah blah\nWAITING-UNTIL: 2030-06-13T12:00:00Z\nmore output after\n" + ep = agents._parse_waiting_until(self.cfg, self.claude_agent, pane) + self.assertIsNotNone(ep) + self.assertEqual(ep, datetime.fromisoformat("2030-06-13T12:00:00+00:00").timestamp()) + + def 
test_non_footer_none_without_marker(self): + self.assertIsNone(agents._parse_waiting_until( + self.cfg, self.claude_agent, "just working, no marker")) + + def test_footer_requires_marker_as_last_line(self): + # marker present but NOT the last non-empty line → ignored for a footer UI + pane = "WAITING-UNTIL: 2030-06-13T12:00:00Z\n ▣ Build · GPT · 2m 19s\n" + self.assertIsNone(agents._parse_waiting_until(self.cfg, self.oc_agent, pane)) + + def test_footer_honors_marker_when_last_line(self): + pane = "some work\nWAITING-UNTIL: 2030-06-13T12:00:00Z\n\n" + ep = agents._parse_waiting_until(self.cfg, self.oc_agent, pane) + self.assertIsNotNone(ep) + + def test_bad_timestamp_none(self): + self.assertIsNone(agents._parse_waiting_until( + self.cfg, self.claude_agent, "WAITING-UNTIL: not-a-time")) + + +# ── backend activity detectors (claude + opencode footers) ────────────────────────── + +class TestActivityDetection(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="aotest-ut-") + self.cfg = agents.load_config(_make_project(self.tmp)) + self.claude_agent = self.cfg["agents"]["cl"] + self.oc_agent = self.cfg["agents"]["oc"] + + def tearDown(self): + shutil.rmtree(self.tmp, ignore_errors=True) + + # claude: non-footer, active_re matched anywhere in the pane + def test_claude_active_esc_to_interrupt(self): + self.assertTrue(agents.pane_active( + self.cfg, self.claude_agent, "thinking...\n esc to interrupt", use_log=False)) + + def test_claude_active_running_tool(self): + self.assertTrue(agents.pane_active( + self.cfg, self.claude_agent, "Running tool: Bash", use_log=False)) + + def test_claude_active_spinner_dot_count(self): + self.assertTrue(agents.pane_active( + self.cfg, self.claude_agent, "Compiling · 137 tokens", use_log=False)) + + def test_claude_idle_is_not_active(self): + self.assertFalse(agents.pane_active( + self.cfg, self.claude_agent, "Done.\n> ", use_log=False)) + + # opencode: footer_ui — only the bottom rows count as activity + def 
test_opencode_active_footer(self): + pane = "~ Preparing patch...\n ⬝⬝■ esc interrupt 137.6K\n" + self.assertTrue(agents.pane_active(self.cfg, self.oc_agent, pane, use_log=False)) + + def test_opencode_idle_footer_not_active(self): + pane = " ▣ Build · GPT-5.4 · 2m 19s\n 178.4K (17%) ctrl+p commands\n" + self.assertFalse(agents.pane_active(self.cfg, self.oc_agent, pane, use_log=False)) + + def test_opencode_active_only_at_top_is_ignored(self): + # active marker far above the bottom 10 lines → a footer UI ignores it + pane = "running tool now\n" + "\n".join(f"line {i}" for i in range(20)) + \ + "\n ▣ Build · GPT · idle\n" + self.assertFalse(agents.pane_active(self.cfg, self.oc_agent, pane, use_log=False)) + + def test_opencode_log_grace_fallback(self): + # idle footer, but a freshly-touched session log within the grace window → active + idle = " ▣ Build · GPT · idle\n 178K (17%) ctrl+p\n" + logp = agents._session_log_path(self.cfg, self.oc_agent["session"]) + logp.parent.mkdir(parents=True, exist_ok=True) + logp.write_text("recent activity\n") # mtime = now + self.assertTrue(agents.pane_active(self.cfg, self.oc_agent, idle, use_log=True)) + # remove the log → no fallback → idle footer reads as not active + logp.unlink() + self.assertFalse(agents.pane_active(self.cfg, self.oc_agent, idle, use_log=True)) + + +if __name__ == "__main__": + unittest.main(verbosity=2)