# Conservative, surgical Docker prune (Phase 2pc / PC1).
#
# REPLACES `virtualisation.docker.autoPrune` (which ran `docker system prune --force --all
# --filter until=24h` daily). The `--all` removed every image NOT used by a *running* container —
# between CI runs no test apps run, so it evicted the cached recipe base images and forced a cold
# re-pull on the next run → the prune->re-pull->Docker-Hub-rate-limit churn documented in JOURNAL-2.
#
# On this SINGLE host, Docker's own local image store IS the cache (re-deploys reuse local layers,
# no re-download; the daemon is PAT-authenticated). So we keep that store warm and only reclaim disk
# under GENUINE pressure, and even then SURGICALLY:
#   - dangling images + stopped containers + dangling build cache, age-gated (until=24h) — NEVER
#     `--all` (would evict tagged base/in-use images), NEVER `--volumes` (warm canonical data — see
#     swarm.nix's existing comment; warm volumes are reaped only by the warm reconcilers).
# and only when nothing is in flight:
#   - skip if any run-app stack is live (mid-pull layers can look prunable — "never prune mid-run");
#   - skip if any swarm service has unmet replicas (a deploy/pull is converging, incl. warm redeploys).
{ pkgs, ... }:
let
  # `/` usage % at/above which a surgical prune is permitted. Below this: keep the cache, no-op.
  threshold = 80;

  # The prune script. writeShellApplication runs it under errexit/nounset/pipefail with
  # `runtimeInputs` on PATH, so every pipeline below must be pipefail-safe.
  prune = pkgs.writeShellApplication {
    name = "cc-ci-docker-prune";
    runtimeInputs = with pkgs; [ docker coreutils gnugrep gawk ];
    text = ''
      THRESH=${toString threshold}

      # Percentage of `/` in use, digits only (the header line and the % sign are stripped).
      used="$(df --output=pcent / | tail -1 | tr -dc '0-9')"
      : "''${used:=0}"
      if [ "$used" -lt "$THRESH" ]; then
        echo "docker-prune: / at ''${used}% (< ''${THRESH}%) — keeping local image cache, nothing to do"
        exit 0
      fi

      # Take ONE snapshot of the swarm service list ("<name> <running>/<desired>[ ...]") and run
      # both in-flight guards against it. Matching on a captured string instead of piping docker
      # into `grep -q` matters under pipefail: grep -q exits on its first match, docker can then
      # die of SIGPIPE, the pipeline reports failure, and the "stack is live" guard would be
      # silently skipped. Fail closed if the listing itself cannot be obtained.
      if ! services="$(docker service ls --format '{{.Name}} {{.Replicas}}')"; then
        echo "docker-prune: cannot list swarm services — refusing to prune blind" >&2
        exit 1
      fi

      # NEVER prune mid-run: a live run-app stack means a deploy/test is in flight (mid-pull layers
      # can look prunable). Run-app services: <=4char>-<6hex>_ci_commoninternet_net_* (lifecycle.py).
      if grep -qE '^[a-z0-9]{1,4}-[0-9a-f]{6}_ci_commoninternet_net_' <<<"$services"; then
        echo "docker-prune: a run-app stack is live — skipping (never prune mid-run)"
        exit 0
      fi

      # NEVER prune while ANY swarm service is converging (unmet replicas => a pull/deploy in flight,
      # including infra warm redeploys). Second column is "running/desired", e.g. 1/1; lines without
      # a replicas column (an empty snapshot) are ignored.
      converging="$(awk 'NF >= 2 { split($2, r, "/"); if ((r[1] + 0) != (r[2] + 0)) n++ } END { print n + 0 }' <<<"$services")"
      if [ "$converging" -gt 0 ]; then
        echo "docker-prune: $converging service(s) converging (deploy/pull in flight) — skipping"
        exit 0
      fi

      echo "docker-prune: / at ''${used}% (>= ''${THRESH}%) — surgical prune (dangling + until=24h; NEVER --all/--volumes)"
      # Each prune is best-effort: one failing must not prevent the others or the final report.
      docker container prune -f --filter until=24h || true
      docker image prune -f --filter until=24h || true
      docker builder prune -f --filter until=24h || true
      df -h /
    '';
  };
in
{
  # Oneshot unit that runs the gated prune; ordered after (and requiring) the Docker daemon.
  systemd.services.ci-docker-prune = {
    description = "Surgical disk-pressure-gated Docker prune (dangling+old only; never --all/--volumes; never mid-run)";
    after = [ "docker.service" ];
    requires = [ "docker.service" ];
    path = [ pkgs.docker ];
    serviceConfig = {
      Type = "oneshot";
      ExecStart = "${prune}/bin/cc-ci-docker-prune";
    };
  };

  # Daily trigger; Persistent = true catches up on a run missed while the host was down.
  systemd.timers.ci-docker-prune = {
    description = "Daily timer for the surgical Docker prune";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "daily";
      Persistent = true;
    };
  };
}