Files
cc-ci/nix/modules/docker-prune.nix
autonomic-bot b9bbd253eb fix(2pc): rename unit docker-prune -> ci-docker-prune (NixOS docker module reserves docker-prune)
The committed module used systemd.services.docker-prune, which conflicts with the NixOS docker
module's own docker-prune unit (`nixos-rebuild build` error: conflicting definition values). The
deployed+verified host already runs ci-docker-prune; this syncs the repo so a cold build matches.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 09:43:09 +01:00

76 lines
3.6 KiB
Nix

# Conservative, surgical Docker prune (Phase 2pc / PC1).
#
# REPLACES `virtualisation.docker.autoPrune` (which ran `docker system prune --force --all
# --filter until=24h` daily). The `--all` removed every image NOT used by a *running* container —
# between CI runs no test apps run, so it evicted the cached recipe base images and forced a cold
# re-pull on the next run → the prune->re-pull->Docker-Hub-rate-limit churn documented in JOURNAL-2.
#
# On this SINGLE host, Docker's own local image store IS the cache (re-deploys reuse local layers,
# no re-download; the daemon is PAT-authenticated). So we keep that store warm and only reclaim disk
# under GENUINE pressure, and even then SURGICALLY:
# - dangling images + stopped containers + dangling build cache, age-gated (until=24h) — NEVER
# `--all` (would evict tagged base/in-use images), NEVER `--volumes` (warm canonical data — see
# swarm.nix's existing comment; warm volumes are reaped only by the warm reconcilers).
# The prune additionally runs only when nothing is in flight:
# - skip if any run-app stack is live (mid-pull layers can look prunable — "never prune mid-run");
# - skip if any swarm service has unmet replicas (a deploy/pull is converging, incl. warm redeploys).
{ pkgs, ... }:
let
# Disk-pressure gate, as a percentage of `/` in use. The prune script compares the current usage
# against this value with `-lt`: strictly below it the script exits without pruning (the local
# image cache is kept); at or above it the script goes on to the in-flight checks and the prune.
threshold = 80;
# `cc-ci-docker-prune`: the script executed by the ci-docker-prune systemd service.
#
# Flow (every early exit is status 0, i.e. "nothing to do", not a failure):
#   1. `/` usage below `threshold`            -> keep the image cache, exit.
#   2. a run-app stack is listed by swarm     -> exit (never prune mid-run).
#   3. any service has running != desired     -> exit (a deploy/pull is converging).
#   4. otherwise prune stopped containers, dangling images and dangling build cache, each gated
#      by `until=24h`; never `--all`, never `--volumes`.
#
# writeShellApplication runs the text under errexit/nounset/pipefail by default. Because of
# pipefail, the output of `docker service ls` is captured into a variable BEFORE it is matched:
# in a `docker ... | grep -q` pipeline, grep exits on the first match, docker can then fail on
# the closed pipe, and pipefail would report the whole pipeline as failed — i.e. a live run-app
# stack would be read as "no stack live". Capturing first also means a failing
# `docker service ls` aborts the unit (errexit) instead of being mistaken for "nothing running".
prune = pkgs.writeShellApplication {
  name = "cc-ci-docker-prune";
  runtimeInputs = with pkgs; [ docker coreutils gnugrep gawk ];
  text = ''
    THRESH=${toString threshold}
    # Usage of / as a bare integer: the last line of df output with every non-digit stripped.
    used="$(df --output=pcent / | tail -1 | tr -dc '0-9')"
    # Fall back to 0 (=> no prune) if df produced no digits at all.
    : "''${used:=0}"
    if [ "$used" -lt "$THRESH" ]; then
      echo "docker-prune: / at ''${used}% (< ''${THRESH}%) keeping local image cache, nothing to do"
      exit 0
    fi
    # NEVER prune mid-run: a live run-app stack means a deploy/test is in flight (mid-pull layers
    # can look prunable). Run-app services: <=4char>-<6hex>_ci_commoninternet_net_* (lifecycle.py).
    # Snapshot the service names first, then match the snapshot (no pipeline, so grep -q exiting
    # early cannot flip the result under pipefail).
    services="$(docker service ls --format '{{.Name}}')"
    if grep -qE '^[a-z0-9]{1,4}-[0-9a-f]{6}_ci_commoninternet_net_' <<<"$services"; then
      echo "docker-prune: a run-app stack is live skipping (never prune mid-run)"
      exit 0
    fi
    # NEVER prune while ANY swarm service is converging (unmet replicas => a pull/deploy in flight,
    # including infra warm redeploys). Replicas field is "running/desired" e.g. 1/1.
    # awk splits each line on "/" and counts the lines whose two numbers differ; an empty
    # snapshot yields a single blank line, which compares 0 == 0 and is not counted.
    replicas="$(docker service ls --format '{{.Replicas}}')"
    converging="$(awk -F/ '{ if (($1+0) != ($2+0)) c++ } END { print c+0 }' <<<"$replicas")"
    if [ "$converging" -gt 0 ]; then
      echo "docker-prune: $converging service(s) converging (deploy/pull in flight) skipping"
      exit 0
    fi
    echo "docker-prune: / at ''${used}% (>= ''${THRESH}%) surgical prune (dangling + until=24h; NEVER --all/--volumes)"
    # Each prune is best-effort: a failure of one must not stop the others or fail the unit.
    docker container prune -f --filter until=24h || true
    docker image prune -f --filter until=24h || true
    docker builder prune -f --filter until=24h || true
    # Log the resulting disk usage.
    df -h /
  '';
};
in
{
# Oneshot unit that runs the cc-ci-docker-prune script once per activation.
systemd.services.ci-docker-prune = {
  description = "Surgical disk-pressure-gated Docker prune (dangling+old only; never --all/--volumes; never mid-run)";

  # Depends on the Docker daemon unit and is ordered after it.
  requires = [ "docker.service" ];
  after = [ "docker.service" ];

  # Docker CLI on the unit's PATH.
  path = [ pkgs.docker ];

  serviceConfig.Type = "oneshot";
  serviceConfig.ExecStart = "${prune}/bin/cc-ci-docker-prune";
};
# Timer for the prune; it sets no explicit `unit`, so it relies on the timer/service names
# matching (both are `ci-docker-prune`).
systemd.timers.ci-docker-prune = {
  description = "Daily timer for the surgical Docker prune";

  # Started together with the rest of the timers at boot.
  wantedBy = [ "timers.target" ];

  # Fire once a day; Persistent asks systemd to catch up a run missed while the host was off.
  timerConfig.OnCalendar = "daily";
  timerConfig.Persistent = true;
};
}