diff --git a/.claude/skills/upgrade-all/SKILL.md b/.claude/skills/upgrade-all/SKILL.md
index 0582b73..bb7ba57 100644
--- a/.claude/skills/upgrade-all/SKILL.md
+++ b/.claude/skills/upgrade-all/SKILL.md
@@ -25,6 +25,20 @@ session, but the agent is the intended path so the weekly run isn't buried in he
 - `--dry-run` → survey + print what WOULD upgrade; spawn nothing.
 - `--parallel` → fan out all per-recipe subagents at once (faster, more host load — see safety below).
 
+## 0. Sweep orphans from previous runs (FIRST, before anything else)
+A prior run's teardown can crash, an agent can be killed mid-deploy, or a manual debug probe can be
+left running — leaving an orphan test stack, a standalone debug container, leaked volumes, or a stuck
+`docker run` wrapper that contends for the shared Swarm and skews the survey. Clear them before
+surveying. The sweep is **safe by allowlist** — it removes only what is NOT infra (traefik/drone/
+backups/the `ccci-*` control plane) and NOT a `warm-*` canonical (whose retained volumes are spared),
+so it can never take down a live service:
+```
+ssh cc-ci 'bash -s' < /srv/cc-ci/.claude/skills/upgrade-all/sweep-orphans.sh
+```
+It is idempotent (a no-op when the host is already clean) and prints what it removed plus the
+surviving Swarm services (which should be infra + `warm-*` only — eyeball that before continuing). If
+anything legitimate looks at risk, stop and investigate rather than proceeding.
+
 ## 1. Build the candidate list
 Enrolled recipes = the cc-ci `tests/<recipe>/` dirs (same set `ci-test-review` sweeps):
 ```
diff --git a/.claude/skills/upgrade-all/sweep-orphans.sh b/.claude/skills/upgrade-all/sweep-orphans.sh
new file mode 100755
index 0000000..c6b95e1
--- /dev/null
+++ b/.claude/skills/upgrade-all/sweep-orphans.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# Sweep orphaned test deployments + debug debris left by PREVIOUS cc-ci runs.
+#
+# Runs ON the cc-ci host (root) — pipe it in:  ssh cc-ci 'bash -s' < sweep-orphans.sh
+# Invoked at the START of /upgrade-all so a leaked stack/container/volume/process from a prior run
+# (a teardown that crashed, a manual debug probe, a killed agent) does not contend for the shared
+# Swarm or skew the survey. Idempotent + safe to run anytime: a no-op when the host is already clean.
+#
+# SAFE BY ALLOWLIST. Stacks and volumes whose name is on the keep-list are never removed, so the
+# sweep cannot take down Swarm-deployed infra or the warm canonicals. The keep-list (name prefix) is:
+#   - traefik, drone, backups            : Swarm + CI infra
+#   - ccci-bridge / -dashboard / -reports: the cc-ci control plane
+#   - warm-*                             : warm canonicals (idle persistent deps reused across runs;
+#                                          their retained volumes are spared too)
+# Everything else deployed on the Swarm is a per-run test stack and is fair game.
+set -u -o pipefail   # no -e: the steps below tolerate individual command failures
+export PATH="/run/current-system/sw/bin:${PATH}"
+
+removed=0   # running tally of removals/kills, reported in the closing summary
+KEEP_RE='^(traefik|drone|backups|ccci-(bridge|dashboard|reports)|warm-)'   # ERE, anchored at name start
+
+printf '%s\n' "== orphan sweep: scanning (keep-list: infra + warm-* canonicals) =="
+
+# 1) Orphan Swarm stacks — any deployed stack not on the keep-list is a leftover per-run test deploy
+#    (a clean run tears its per-run stack down). Tallied only when `docker stack rm` succeeds.
+gone=()                                     # stacks actually removed in this pass
+mapfile -t STACKS < <(docker stack ls --format '{{.Name}}' 2>/dev/null)
+for s in "${STACKS[@]}"; do
+  if [ -z "$s" ] || printf '%s' "$s" | grep -Eq "$KEEP_RE"; then continue; fi
+  echo "  orphan stack -> removing: $s"
+  docker stack rm "$s" >/dev/null 2>&1 && { gone+=("$s"); removed=$((removed + 1)); }
+done
+# `stack rm` returns before task containers exit; wait (bounded ~60s) so step 3 can free volumes
+for _ in $(seq 1 30); do
+  busy=$(for s in "${gone[@]}"; do docker ps -aq -f "label=com.docker.stack.namespace=$s" 2>/dev/null; done)
+  [ -z "$busy" ] && break
+  sleep 2
+done
+
+# 2) Orphan standalone containers — non-Swarm containers (no com.docker.swarm.service.id label)
+#    are debug `docker run` leftovers (e.g. the plausible clickhouse entrypoint probes). Spared:
+#    keep-list names and anything started < 30 min ago. Tallied only when `docker rm` succeeds.
+now=$(date +%s)
+for c in $(docker ps -aq 2>/dev/null); do
+  sid=$(docker inspect -f '{{ index .Config.Labels "com.docker.swarm.service.id" }}' "$c" 2>/dev/null)
+  [ -n "$sid" ] && continue                 # Swarm-managed → keep
+  name=$(docker inspect -f '{{.Name}}' "$c" 2>/dev/null)
+  [[ "${name#/}" =~ $KEEP_RE ]] && continue # keep-list name (inspect prefixes "/") → keep
+  started=$(docker inspect -f '{{.State.StartedAt}}' "$c" 2>/dev/null)
+  st=$(date -d "$started" +%s 2>/dev/null || echo "$now")
+  [ $((now - st)) -lt 1800 ] && continue    # younger than 30 min → spare a fresh manual probe
+  echo "  orphan container -> removing: $(docker inspect -f '{{.Name}} ({{.Config.Image}})' "$c" 2>/dev/null)"
+  docker rm -f "$c" >/dev/null 2>&1 && removed=$((removed + 1))
+done
+
+# 3) Leaked volumes — dangling (container-less) volumes left behind by removed test stacks. Keep-list
+#    names are skipped: an UNDEPLOYED warm canonical keeps its data volume, which thus looks dangling.
+for vol in $(docker volume ls -qf dangling=true 2>/dev/null); do
+  [[ "$vol" =~ $KEEP_RE ]] && continue
+  docker volume rm "$vol" >/dev/null 2>&1 || continue   # removal refused → not reported, not tallied
+  echo "  leaked volume -> removed: $vol"; removed=$((removed + 1))
+done
+
+# 4) Orphan debug host-processes — processes whose parent is PID 1 and whose command line contains
+#    both `timeout` and `docker run`. Signalled by explicit PID with plain `kill`, never `pkill -f`.
+probe_filter='$2==1 && /timeout/ && /docker[[:space:]]+run/ {print $1}'
+wrappers=$(ps -eo pid=,ppid=,cmd= 2>/dev/null | awk "$probe_filter")
+for pid in $wrappers; do                    # word-splitting intended: one PID per word
+  echo "  orphan debug wrapper -> killing pid $pid"
+  kill "$pid" 2>/dev/null || true; removed=$((removed + 1))
+done
+
+# 5) Stray exited containers — best-effort prune of stopped ones created > 30 min ago (step 2's grace).
+docker container prune -f --filter until=30m >/dev/null 2>&1 || true
+
+# Closing report: one summary line, then the remaining Swarm service names (sorted) for a human check.
+case "$removed" in
+  0) printf '%s\n' "== orphan sweep: clean (nothing to remove) ==" ;;
+  *) printf '%s\n' "== orphan sweep: removed/killed $removed orphan(s) ==" ;;
+esac
+printf '%s\n' "-- surviving Swarm services (should be infra + warm-* only) --"
+docker service ls --format '{{.Name}}' 2>/dev/null | sort