diff --git a/cc-ci-plan/JOURNAL.md b/cc-ci-plan/JOURNAL.md index 28a2d91..61899c9 100644 --- a/cc-ci-plan/JOURNAL.md +++ b/cc-ci-plan/JOURNAL.md @@ -618,3 +618,30 @@ session cc-ci-orchestrator-stale can be killed; recipe-mirrors org still private when cf48 starts since per-phase backend isn't overridable). - Also this session: relaunched after a session restart (NOT a host reboot, 13d uptime); loops+watchdog were stopped in a claude-sonnet limit window on cf55 → restarted via RESUME_PHASE=1 launch.py start. + +## 2026-06-13 ~05:27 — launch system unified (agents.toml + agents.py); cutover done +- Replaced the 5 bespoke launchers + ~15 dotfiles with ONE config (cc-ci-plan/agents.toml) + + ONE driver (cc-ci-plan/agents.py: up/down/status/watchdog/logs/phase). Design + behavior + mapping in cc-ci-plan/plan-unified-launch.md. +- launch.py and launch-orchestrator.sh are now COMPATIBILITY SHIMS → agents.py (originals at + *.orig). So `launch.py start|status|stop|watchdog|logs`, the systemd boot chain + (cc-ci-loops-start → launch.sh → launch.py start), and your startup routine all drive the + new system transparently — no behavior change for you. +- Config is the single source of truth; the watchdog re-reads it every tick (no more env-vs-file + drift, which had caused the opencode-revert bug earlier today). Backend/model/prompt/watch + policy per agent live in agents.toml. To change a model or backend: edit agents.toml. +- State (phase index, resume ids, limit windows) now under .cc-ci-logs/state/. Phase machine + unchanged; de-duped the doubled `mailu` entry (cf55 was idx 10 → now idx 9; current phase + pvfix = idx 10). All agents respawned via the new system and confirmed working on pvfix. +- The new watchdog tmux session (cc-ci-watchdog) runs `agents.py watchdog`. Same heal/limit/ + stall/handoff/phase-advance/wake behavior, lifted verbatim, now config-driven. 
+ +## 2026-06-13 ~05:30 — startup: unified agents.toml live; re-added dropped cf48 +- Session relaunch (NOT a host reboot, 13d uptime). Supervision UP: unified `agents.py watchdog` + (--config agents.toml) + builder/adv on claude-sonnet + orchestrator on claude-opus-4-8. Phase + pvfix [proxy /16 fix] in progress; cf55 confirmed ## DONE (advance was legit). +- The launch-system unification (agents.toml + agents.py) was deployed in the gap. It was transcribed + from .phases-spec BEFORE I added cf48 (05:15), so cf48 (the operator's opus cfold review) was + DROPPED. Re-added it to agents.toml — appended AFTER ghost (the system is already past cf55/on + pvfix, so inserting before pvfix would shift the live phase index). agents.py re-reads config every + tick, so no watchdog bounce needed. cf48 runs as the last phase, opus 4.8, claude backend. diff --git a/cc-ci-plan/agents.toml b/cc-ci-plan/agents.toml new file mode 100644 index 0000000..71031ce --- /dev/null +++ b/cc-ci-plan/agents.toml @@ -0,0 +1,149 @@ +# cc-ci unified agent configuration — the single source of truth. +# +# One file declares: which agents exist, their backend, model, prompt, kind, and how the +# watchdog supervises them. Read by agents.py (driver + watchdog). Runtime state (phase +# index, resume ids, limit windows) lives under <log_dir>/state/ (log_dir is set in [defaults]), NOT here. +# +# Precedence: this file is authoritative. A one-off CLI override (env AGENT__) +# affects only a single `agents.py` invocation; the persisted watchdog always reads this file. 
+ +# ─────────────────────────── global watchdog cadence ─────────────────────────── +[watchdog] +signal_interval = 30 # s between handoff / stall / limit checks (light) +heavy_interval = 300 # s between heal / phase-advance checks +limit_probe_fallback = 300 # flat probe cadence when a reset time can't be parsed +limit_reset_slack = 45 # s past a parsed reset before probing + +# ─────────────────────────── backends (declared as data) ─────────────────────────── +[backend.claude] +bin = "claude" +flags = "--dangerously-skip-permissions" +remote_control = true +supports_resume = true +prompt_delivery = "arg" # full prompt passed as a CLI argument +submit_key = "Enter" +stall_idle = 300 # presumably seconds, like the [watchdog] intervals — TODO confirm in agents.py +active_re = "esc to interrupt|⠋|⠙|⠹|⠸|⠼|⠴|⠦|⠧|⠇|⠏|Running tool|▣|Build ·|· \\d+" +limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)" +fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified" + +[backend.opencode] +bin = "/home/loops/.local/bin/opencode" +server = "http://127.0.0.1:4096" +supports_resume = false +prompt_delivery = "ping" # send after the TUI connects +connect_delay = 12 # presumably seconds to wait for the TUI before the ping — TODO confirm +submit_key = "C-m" +preamble = "set -a; . 
/srv/cc-ci/.testenv; set +a" +stall_idle = 900 # presumably seconds, like the [watchdog] intervals — TODO confirm in agents.py +active_re = "esc interrupt|thinking|inferring|running tool|tool call|preparing patch|reading|searching" +limit_re = "spend limit|usage limit|limit reached|reached your .*limit|out of (credits|tokens)" +fatal_re = "redacted_thinking|blocks cannot be modified|cannot be modified" + +# ─────────────────────────── defaults for every agent ─────────────────────────── +[defaults] +backend = "claude" +model = "claude-sonnet-4-6" +dir = "/srv/cc-ci-orch" +watch = "heal" # none | heal | heal+stall +log_dir = "/srv/cc-ci/.cc-ci-logs" + +# ─────────────────────────── agents ─────────────────────────── + +[[agent]] +name = "orchestrator" # tmux session: cc-ci-orchestrator +kind = "persistent" +model = "claude-opus-4-8" +resume = true # claude --resume +watch = "heal" # restart if dead / FATAL / backend-mismatch; never stall-reboot +wake = { interval = 3600, prompt_file = "ai-progress-monitor-prompt.txt" } +prompt = """ +STARTUP (auto-launch): you are the cc-ci orchestrator, just (re)launched, likely after a reboot. \ +Do your AGENTS.md On-startup routine NOW: read cc-ci-plan/REBOOTS.md and run cc-ci-plan/launch.py \ +status, then send a proactive PushNotification that you are online with the current phase and \ +reboot count, and confirm cc-ci-loops.service brought the loops + watchdog back (relaunch with \ +RESUME_PHASE=1 cc-ci-plan/launch.py start if not). 
Also read cc-ci-plan/JOURNAL.md for recent \ +context before resuming supervision.""" + +[[agent]] +name = "builder" # tmux session: cc-ci-builder +kind = "loop" +role = "builder" # phase kickoff = phase preamble + prompts/builder.md +dir = "/srv/cc-ci/cc-ci" +watch = "heal+stall" + +[[agent]] +name = "adversary" +session = "cc-ci-adv" # established convention is abbreviated (logs, remote-control) +kind = "loop" +role = "adversary" +dir = "/srv/cc-ci/cc-ci-adv" +watch = "heal+stall" + +[[agent]] +name = "assistant" # tmux session: cc-ci-assistant +kind = "persistent" +resume = true +watch = "none" +enabled = false +prompt = """ +You are the cc-ci ASSISTANT. Read cc-ci-plan/JOURNAL.md for context, then wait for a specific \ +plan/task from the orchestrator or operator. Work autonomously until the assigned task is \ +complete, report the result, and then wait for the next assignment.""" + +[[agent]] +name = "upgrader" # tmux session: cc-ci-upgrader +kind = "task" # one-shot: run to completion, then idle +dir = "/srv/cc-ci" +watch = "none" +enabled = false # launched on demand or by [loop].on_complete +prompt = """ +You are the cc-ci weekly UPGRADER. Invoke the /upgrade-all skill in DEFAULT mode (read \ +/srv/cc-ci/.claude/skills/upgrade-all/SKILL.md for the full procedure), run it to completion, \ +then report the summary of every PR opened and go idle.""" + +[[agent]] +name = "report" # tmux session: cc-ci-report +kind = "task" +dir = "/srv/cc-ci" +model = "claude-opus-4-8" # report is authored by opus even when the upgrader runs sonnet +watch = "none" +enabled = false +prompt = """ +You generate the public weekly "Recipe Report". 
Run the /recipe-report skill now (full spec: \ +/srv/cc-ci/.claude/skills/recipe-report/SKILL.md; creds in /srv/cc-ci/.testenv), then go idle.""" + +# Non-AI helper services (started by `up`, not AI sessions) +[[service]] +name = "cleanlogs" # tmux session: cc-ci-cleanlogs +command = "python3 cc-ci-plan/agent-log.py follow-all" +dir = "/srv/cc-ci-orch" + +# ─────────────────────────── the phase machine (kind="loop" agents) ─────────────────────────── +[loop] +state_file = "phase-idx" # file name under <log_dir>/state/ (log_dir is set in [defaults]) +resume_phase = true # keep current index across restarts (don't reset to 0) +auto_advance = true +done_marker = "## DONE" +handoff = { repo = "/srv/cc-ci/cc-ci", claim_pings = "adversary", review_pings = "builder", inboxes = ["ADVERSARY-INBOX.md", "BUILDER-INBOX.md"] } +on_complete = { trigger_file = ".run-upgrade-on-complete", run = "upgrader" } + +# Transcribed verbatim from the live .phases-spec, with the duplicate `mailu` (old idx 7) +# REMOVED. Per-phase model overrides (were .loop-model[-adv]- files) are inline. +# After the de-dupe cf55 is index 9 and pvfix is index 10; the live index is kept in state/phase-idx. The stored index is positional, so append new phases at the END (as cf48 was) to avoid shifting it. 
+phases = [ + { id = "rcust", plan = "recipe-custom-restructure-full-plan.md", status = "STATUS-rcust.md" }, + { id = "shot", plan = "plan-phase-shot-screenshots.md", status = "STATUS-shot.md" }, + { id = "lvl5", plan = "plan-phase-lvl5-lint-rung.md", status = "STATUS-lvl5.md" }, + { id = "bsky", plan = "plan-phase-bsky-fix.md", status = "STATUS-bsky.md" }, + { id = "dstamp", plan = "plan-phase-dstamp-discourse-drift.md", status = "STATUS-dstamp.md", models = { builder = "opus" } }, + { id = "mailu", plan = "plan-phase-mailu-backup.md", status = "STATUS-mailu.md" }, + { id = "kuma", plan = "plan-phase-kuma-monitor.md", status = "STATUS-kuma.md" }, + { id = "drone", plan = "plan-phase-drone-enroll.md", status = "STATUS-drone.md" }, + { id = "cfold", plan = "plan-phase-cfold-custom-folder.md", status = "STATUS-cfold.md" }, + { id = "cf55", plan = "plan-phase-cf55-gpt55-cfold-review.md", status = "STATUS-cf55.md", models = { builder = "claude-sonnet-4-6", adversary = "claude-sonnet-4-6" } }, + { id = "pvfix", plan = "plan-phase-pvfix-swarm-proxy.md", status = "STATUS-pvfix.md" }, + { id = "pvcheck", plan = "plan-phase-pvcheck-post-proxy-verification.md", status = "STATUS-pvcheck.md" }, + { id = "ghost", plan = "plan-phase-ghost-reeval.md", status = "STATUS-ghost.md" }, + { id = "cf48", plan = "plan-phase-cf48-opus-cfold-review.md", status = "STATUS-cf48.md", models = { builder = "claude-opus-4-8", adversary = "claude-opus-4-8" } }, +]