feat(2pc): PC1 conservative prune — drop autoPrune --all, add gated surgical docker-prune
Removes virtualisation.docker.autoPrune (daily `docker system prune --all` evicted in-use base images → cold re-pull → Hub rate-limit churn, JOURNAL-2). Adds modules/docker-prune.nix: daily timer + oneshot that prunes only dangling+until=24h, gated on disk pressure (>=80%) AND no run-app live AND no swarm service converging; never --all, never --volumes. Teardown unchanged (never removes images). Registry pull-through cache dropped per operator scope correction. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
75
nix/modules/docker-prune.nix
Normal file
75
nix/modules/docker-prune.nix
Normal file
@ -0,0 +1,75 @@
|
||||
# Conservative, surgical Docker prune (Phase 2pc / PC1).
|
||||
#
|
||||
# REPLACES `virtualisation.docker.autoPrune` (which ran `docker system prune --force --all
|
||||
# --filter until=24h` daily). The `--all` removed every image NOT used by a *running* container —
|
||||
# between CI runs no test apps run, so it evicted the cached recipe base images and forced a cold
|
||||
# re-pull on the next run → the prune->re-pull->Docker-Hub-rate-limit churn documented in JOURNAL-2.
|
||||
#
|
||||
# On this SINGLE host, Docker's own local image store IS the cache (re-deploys reuse local layers,
|
||||
# no re-download; the daemon is PAT-authenticated). So we keep that store warm and only reclaim disk
|
||||
# under GENUINE pressure, and even then SURGICALLY:
|
||||
# - dangling images + stopped containers + dangling build cache, age-gated (until=24h) — NEVER
|
||||
# `--all` (would evict tagged base/in-use images), NEVER `--volumes` (warm canonical data — see
|
||||
# swarm.nix's existing comment; warm volumes are reaped only by the warm reconcilers).
|
||||
# and only when nothing is in flight:
|
||||
# - skip if any run-app stack is live (mid-pull layers can look prunable — "never prune mid-run");
|
||||
# - skip if any swarm service has unmet replicas (a deploy/pull is converging, incl. warm redeploys).
|
||||
{ pkgs, ... }:

let
  # `/` usage % at/above which a surgical prune is permitted. Below this: keep the cache, no-op.
  threshold = 80;

  # Gate-then-prune script. Every gate that declines exits 0 (a deliberate no-op, not a failure);
  # only a failing gate *query* (df/docker erroring) exits non-zero, which fails the unit
  # WITHOUT pruning — i.e. the script fails closed.
  prune = pkgs.writeShellApplication {
    name = "cc-ci-docker-prune";
    runtimeInputs = with pkgs; [ docker coreutils gnugrep gawk ];
    text = ''
      THRESH=${toString threshold}

      # Gate 1 — disk pressure on `/`. df prints e.g. " 83%"; strip everything but the digits.
      used="$(df --output=pcent / | tail -1 | tr -dc '0-9')"
      : "''${used:=0}"
      if [ "$used" -lt "$THRESH" ]; then
        echo "docker-prune: / at ''${used}% (< ''${THRESH}%) — keeping local image cache, nothing to do"
        exit 0
      fi

      # Take ONE snapshot of the swarm services and evaluate both in-flight gates against it.
      # Why not `docker service ls | grep -q ...` directly: this script runs with pipefail, and
      # grep -q exits on its first match; if docker is still writing it can be killed by SIGPIPE,
      # the pipeline then reports failure, and the "run-app is live" gate would read as FALSE.
      # Capturing first removes that race, and a failing `docker service ls` aborts the script
      # here (errexit) before anything is pruned.
      # Each line is "<name> <running>/<desired>[ extra]".
      services="$(docker service ls --format '{{.Name}} {{.Replicas}}')"

      # NEVER prune mid-run: a live run-app stack means a deploy/test is in flight (mid-pull layers
      # can look prunable). Run-app services: <=4char>-<6hex>_ci_commoninternet_net_* (lifecycle.py).
      if grep -qE '^[a-z0-9]{1,4}-[0-9a-f]{6}_ci_commoninternet_net_' <<<"$services"; then
        echo "docker-prune: a run-app stack is live — skipping (never prune mid-run)"
        exit 0
      fi

      # NEVER prune while ANY swarm service is converging (unmet replicas => a pull/deploy in flight,
      # including infra warm redeploys). Replicas field is "running/desired" e.g. 1/1; count the
      # services whose two numbers differ.
      converging="$(awk '{ split($2, r, "/"); if ((r[1]+0) != (r[2]+0)) c++ } END { print c+0 }' <<<"$services")"
      if [ "$converging" -gt 0 ]; then
        echo "docker-prune: $converging service(s) converging (deploy/pull in flight) — skipping"
        exit 0
      fi

      echo "docker-prune: / at ''${used}% (>= ''${THRESH}%) — surgical prune (dangling + until=24h; NEVER --all/--volumes)"
      # Best-effort: one prune step failing must not stop the others or fail the unit.
      docker container prune -f --filter until=24h || true
      docker image prune -f --filter until=24h || true
      docker builder prune -f --filter until=24h || true
      # Log the resulting disk usage to the journal.
      df -h /
    '';
  };
in
{
  # Oneshot unit that runs the gated prune script; triggered by the timer below.
  # NOTE(review): nixpkgs' virtualisation.docker module may itself declare
  # `systemd.services.docker-prune` (for autoPrune) — confirm this definition merges without
  # conflicting `description`/`ExecStart` values, or rename the unit.
  systemd.services.docker-prune = {
    description = "Surgical disk-pressure-gated Docker prune (dangling+old only; never --all/--volumes; never mid-run)";
    after = [ "docker.service" ];
    requires = [ "docker.service" ];
    # docker is also on the script's own PATH via runtimeInputs above.
    path = [ pkgs.docker ];
    serviceConfig = {
      Type = "oneshot";
      ExecStart = "${prune}/bin/cc-ci-docker-prune";
    };
  };

  # Daily trigger; Persistent = true so a run missed while the host was down is caught up.
  systemd.timers.docker-prune = {
    description = "Daily timer for the surgical Docker prune";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "daily";
      Persistent = true;
    };
  };
}
|
||||
@ -5,18 +5,14 @@
|
||||
{
|
||||
virtualisation.docker = {
|
||||
enable = true;
|
||||
# Reclaim disk from churning per-run images (cc-ci root is ~28 GiB). Prune images/containers/
|
||||
# networks/build-cache older than 24h — but NEVER volumes:
|
||||
# (1) `--volumes` is incompatible with `--filter until=` (docker errors → the unit failed daily,
|
||||
# degrading the system and never actually pruning — that's why disk crept to 96%); and
|
||||
# (2) Phase 2w keeps DATA-WARM canonical volumes that are UNDEPLOYED (no container), so
|
||||
# `prune --volumes` would DELETE the warm known-good data. Warm volumes are pruned
|
||||
# deliberately by the warm reconcilers (WC8), never by this blanket sweep.
|
||||
autoPrune = {
|
||||
enable = true;
|
||||
dates = "daily";
|
||||
flags = [ "--all" "--filter" "until=24h" ];
|
||||
};
|
||||
# Image pruning is handled by modules/docker-prune.nix (Phase 2pc / PC1), NOT by
|
||||
# `virtualisation.docker.autoPrune`. The old autoPrune ran `docker system prune --all` daily;
|
||||
# `--all` evicts every image not used by a *running* container — between runs that wiped the
|
||||
# cached recipe base images and forced a cold re-pull → the Docker-Hub-rate-limit churn in
|
||||
# JOURNAL-2. The replacement keeps Docker's local store warm (it IS our cache on this single
|
||||
# host) and prunes only dangling+old layers, gated on genuine disk pressure and nothing in
|
||||
# flight. NEVER --volumes either: Phase-2w keeps DATA-WARM undeployed canonical volumes, reaped
|
||||
# only by the warm reconcilers. autoPrune left OFF (the default) on purpose.
|
||||
};
|
||||
|
||||
environment.systemPackages = [ pkgs.docker ];
|
||||
|
||||
Reference in New Issue
Block a user