weekly-run: pre-reclaim stale cc-ci images + hourly glm-5.2 supervisor
Root-cause fix for the 2026-07-03 run stalling: the cc-ci host disk filled to 100% (ENOSPC) mid-run (Wave 6, lasuite-drive), the agent stopped to reclaim space, and nothing resumed it — the log-idle/429 watchdog only covers opencode-go usage-limit stalls, not an environmental wedge.

- launch-upgrader.py: step-0 prereclaim_cc_ci() prunes STALE cc-ci docker images (unused AND older than a week, so this week's likely-reused images stay) before each weekly run. Best-effort; env-tunable (UPGRADER_PRERECLAIM*).
- launch-supervisor.py (new): hourly glm-5.2 orchestrator wake-up. Cheap deterministic gate — no-ops (zero tokens) when the run is complete or progressing; only when a run stalled/died before completing does it launch a short-lived glm-5.2 agent to diagnose + drive it to a clean DONE. Progress is judged by live run-proc + log mtime (session_busy() is claude-tuned and misreads a headless opencode run as idle).
- configuration.nix: cc-ci-upgrade-supervisor service + hourly timer (:07).
- upgrade-all SKILL §0: note the stale-image reclaim for manual runs.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01WxbpH3DquKzoSTSwGvGuET
This commit is contained in:
@@ -224,4 +224,38 @@ SSHCFG
|
||||
Persistent = true; # if the box was down at the scheduled time, run once on next boot
|
||||
};
|
||||
};
|
||||
|
||||
# Hourly SUPERVISOR — a glm-5.2 orchestrator wake-up that keeps the weekly run on track. The
# log-idle/429 watchdog only handles opencode-go usage-limit stalls; it does NOT cover a host
# disk-full crash (which killed the 2026-07-03 run) or any other environmental wedge. This is a
# CHEAP deterministic gate: if the weekly run is complete or actively progressing it does NOTHING
# (zero model tokens). Only when a run has stalled/died before completing does it launch a
# short-lived glm-5.2 agent that diagnoses the blockage and drives the run to a clean DONE.
|
||||
# Supervisor unit: runs `launch-supervisor.py check` once as user `loops` from /srv/cc-ci and
# appends its stdout/stderr to supervisor-cron.log. Whether anything actually gets launched is
# decided inside that script, not here.
systemd.services.cc-ci-upgrade-supervisor = {
  description = "cc-ci hourly weekly-run supervisor (glm-5.2 — drives a stalled /upgrade-all to completion)";
  # `after` only orders this unit behind the listed units; `wants` is what pulls in
  # network-online.target. tailscaled.service is ordered-after but not pulled in.
  after = [ "network-online.target" "tailscaled.service" ];
  wants = [ "network-online.target" ];
  serviceConfig = {
    Type = "oneshot"; # launch-supervisor.py check: gate now, spawn the agent into tmux, return
    # NOTE(review): with systemd's default KillMode=control-group, processes still in this
    # unit's cgroup are stopped once the oneshot finishes. If the script starts a fresh tmux
    # server (rather than reusing one started elsewhere) the spawned agent may be killed with
    # it — confirm, or set KillMode = "process".
    User = "loops";
    Group = "users";
    WorkingDirectory = "/srv/cc-ci";
    # Shares the weekly run's optional override file (e.g. SUPERVISOR_MODEL=…); "-" = optional.
    EnvironmentFile = "-/srv/cc-ci/upgrader.env";
  };
  environment = { HOME = "/home/loops"; };
  path = [ pkgs.bash pkgs.tmux pkgs.git pkgs.python3 pkgs.openssh pkgs.nettools ];
  script = ''
    export PATH="/home/loops/.local/bin:$PATH"
    # The append-redirect below aborts the unit if the log directory is missing (fresh
    # checkout, cleaned workspace), so create it first; this is a no-op when it exists.
    ${pkgs.coreutils}/bin/mkdir -p /srv/cc-ci/.cc-ci-logs
    python3 /srv/cc-ci/cc-ci-plan/launch-supervisor.py check >> /srv/cc-ci/.cc-ci-logs/supervisor-cron.log 2>&1
  '';
};
|
||||
|
||||
# Timer for the supervisor service of the same name: fires at minute 07 of every hour.
systemd.timers.cc-ci-upgrade-supervisor = {
  wantedBy = [ "timers.target" ];
  description = "Hourly trigger for cc-ci-upgrade-supervisor (weekly-run health check + drive)";
  # Minute :07 keeps the hourly check offset from the weekly run's :00 fire.
  timerConfig.OnCalendar = "*-*-* *:07:00";
  # No catch-up after downtime: a missed hourly check is moot, the next hour re-checks.
  timerConfig.Persistent = false;
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user