diff --git a/.claude/skills/upgrade-all/SKILL.md b/.claude/skills/upgrade-all/SKILL.md index 0582b73..bb7ba57 100644 --- a/.claude/skills/upgrade-all/SKILL.md +++ b/.claude/skills/upgrade-all/SKILL.md @@ -25,6 +25,20 @@ session, but the agent is the intended path so the weekly run isn't buried in he - `--dry-run` → survey + print what WOULD upgrade; spawn nothing. - `--parallel` → fan out all per-recipe subagents at once (faster, more host load — see safety below). +## 0. Sweep orphans from previous runs (FIRST, before anything else) +A prior run's teardown can crash, an agent can be killed mid-deploy, or a manual debug probe can be +left running — leaving an orphan test stack, a standalone debug container, leaked volumes, or a stuck +`docker run` wrapper that contends for the shared Swarm and skews the survey. Clear them before +surveying. The sweep is **safe by allowlist** — it removes only what is NOT infra (traefik/drone/ +backups/the `ccci-*` control plane) and NOT a `warm-*` canonical (whose retained volumes are spared), +so it can never take down a live service: +``` +ssh cc-ci 'bash -s' < /srv/cc-ci/.claude/skills/upgrade-all/sweep-orphans.sh +``` +It is idempotent (a no-op when the host is already clean) and prints what it removed plus the +surviving Swarm services (which should be infra + `warm-*` only — eyeball that before continuing). If +anything legitimate looks at risk, stop and investigate rather than proceeding. + ## 1. Build the candidate list Enrolled recipes = the cc-ci `tests//` dirs (same set `ci-test-review` sweeps): ``` diff --git a/.claude/skills/upgrade-all/sweep-orphans.sh b/.claude/skills/upgrade-all/sweep-orphans.sh new file mode 100755 index 0000000..c6b95e1 --- /dev/null +++ b/.claude/skills/upgrade-all/sweep-orphans.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# Sweep orphaned test deployments + debug debris left by PREVIOUS cc-ci runs. 
#
# Runs ON the cc-ci host (root) — pipe it in:   ssh cc-ci 'bash -s' < sweep-orphans.sh
# Invoked at the START of /upgrade-all so a leaked stack/container/volume/process from a prior run
# (a teardown that crashed, a manual debug probe, a killed agent) does not contend for the shared
# Swarm or skew the survey. Idempotent + safe to run anytime: a no-op when the host is already clean.
#
# SAFE BY ALLOWLIST. Stacks, standalone containers and volumes whose name starts with a keep-list
# prefix are never touched. The keep-list (leading name prefix) is:
#   - traefik, drone, backups             : Swarm + CI infra
#   - ccci-bridge / -dashboard / -reports : the cc-ci control plane
#   - warm-*                              : warm canonicals (idle persistent deps reused across
#                                           runs; their retained volumes are spared too)
# Every other stack deployed on the Swarm is treated as a per-run test stack and is fair game.
# Two steps are not name-based: step 4 kills only reparented `timeout … docker run …` wrappers, and
# step 5 prunes only containers that are already stopped and older than the grace period.
#
# No `set -e` on purpose: each step is best-effort, and one failed removal must not abort the rest.
set -uo pipefail
export PATH=/run/current-system/sw/bin:$PATH

readonly KEEP_RE='^(traefik|drone|backups|ccci-(bridge|dashboard|reports)|warm-)'
readonly MIN_AGE_SECS=1800    # grace period: anything younger may be an in-flight manual probe
readonly DRAIN_TRIES=30       # bounded drain wait: DRAIN_TRIES x DRAIN_SLEEP_SECS seconds
readonly DRAIN_SLEEP_SECS=2
removed=0

#######################################
# Tell whether a resource name is on the keep-list.
# Arguments: $1 - stack / container / volume name; one leading "/" (as printed by
#                 `docker inspect` for container names) is ignored
# Returns:   0 if the name starts with a keep-list prefix, 1 otherwise
#######################################
is_kept() {
  local name=${1#/}
  [[ $name =~ $KEEP_RE ]]
}

#######################################
# Convert a container's timestamps into the epoch second its age is measured from.
# Arguments: $1 - .State.StartedAt, $2 - .Created (both as printed by `docker inspect`)
# Outputs:   epoch seconds on stdout
# Returns:   non-zero (and prints nothing) when the timestamp is empty or unparsable
#######################################
container_epoch() {
  local started=$1 created=$2 stamp
  stamp=$started
  # A container that was created but never started reports the zero time as StartedAt; judged by
  # that it would look ancient, so measure its age from creation instead.
  if [[ -z $started || $started == 0001-01-01T* ]]; then
    stamp=$created
  fi
  # Guard the empty case explicitly: GNU `date -d ""` succeeds and means "today 00:00".
  [[ -n $stamp ]] || return 1
  date -d "$stamp" +%s 2>/dev/null
}

#######################################
# Filter `ps -eo pid=,ppid=,cmd=` output (stdin) down to orphaned debug wrappers.
# A wrapper is a process reparented to PID 1 whose executable IS `timeout` (bare or with a path)
# and whose command line contains `docker run`. Matching on the executable rather than on the
# substring "timeout" keeps e.g. `docker run --stop-timeout 30 …` out of the kill list.
# Outputs:   one PID per line on stdout
#######################################
orphan_wrapper_pids() {
  awk '$2 == 1 && $3 ~ /(^|\/)timeout$/ && /docker[ \t]+run/ { print $1 }'
}

#######################################
# Wait (bounded) until no container of the given stacks is left, so their volumes are free.
# Polls containers rather than `docker service ls`: the task containers, which hold the volumes,
# can outlive the service entries after `docker stack rm`.
# Arguments: stack names
# Returns:   0 when drained (or nothing to wait for), 1 on timeout
#######################################
drain_stacks() {
  local attempt stack busy
  if (( $# == 0 )); then
    return 0
  fi
  for (( attempt = 0; attempt < DRAIN_TRIES; attempt++ )); do
    busy=0
    for stack in "$@"; do
      if [[ -n $(docker ps -aq --filter "label=com.docker.stack.namespace=$stack" 2>/dev/null) ]]; then
        busy=1
        break
      fi
    done
    if (( busy == 0 )); then
      return 0
    fi
    sleep "$DRAIN_SLEEP_SECS"
  done
  return 1
}

#######################################
# Step 1: remove every deployed stack that is not on the keep-list, then wait for it to drain.
# Globals:   removed (incremented per stack actually removed)
#######################################
sweep_stacks() {
  local -a stacks=() gone=()
  local s
  mapfile -t stacks < <(docker stack ls --format '{{.Name}}' 2>/dev/null)
  if (( ${#stacks[@]} == 0 )); then
    return 0
  fi
  for s in "${stacks[@]}"; do
    [[ -n $s ]] || continue
    if is_kept "$s"; then
      continue
    fi
    echo "  orphan stack -> removing: $s"
    if docker stack rm "$s" >/dev/null 2>&1; then
      gone+=("$s")
      removed=$((removed + 1))
    else
      echo "  orphan stack -> FAILED to remove: $s" >&2
    fi
  done
  if (( ${#gone[@]} > 0 )); then
    if ! drain_stacks "${gone[@]}"; then
      echo "  removed stacks still draining after $((DRAIN_TRIES * DRAIN_SLEEP_SECS))s; their volumes may survive this sweep" >&2
    fi
  fi
}

#######################################
# Step 2: remove standalone containers (running or exited) that are not Swarm-managed, not on the
# keep-list, and older than the grace period — i.e. debug `docker run` leftovers.
# Globals:   removed (incremented per container actually removed)
#######################################
sweep_containers() {
  local -a ids=()
  local now id info started created name image sid epoch
  local -r fmt='{{.State.StartedAt}}|{{.Created}}|{{.Name}}|{{.Config.Image}}|{{index .Config.Labels "com.docker.swarm.service.id"}}'
  now=$(date +%s)
  mapfile -t ids < <(docker ps -aq 2>/dev/null)
  if (( ${#ids[@]} == 0 )); then
    return 0
  fi
  for id in "${ids[@]}"; do
    # One inspect per container; skip it if it vanished between `ps` and `inspect`.
    info=$(docker inspect -f "$fmt" "$id" 2>/dev/null) || continue
    IFS='|' read -r started created name image sid <<<"$info"
    [[ -n $sid ]] && continue                  # Swarm-managed → keep
    if is_kept "$name"; then                   # standalone infra / warm container → keep
      continue
    fi
    # Unknown age → treat as brand new, so it is spared rather than removed on a guess.
    epoch=$(container_epoch "$started" "$created") || epoch=$now
    if (( now - epoch < MIN_AGE_SECS )); then  # younger than 30 min → spare a fresh manual probe
      continue
    fi
    echo "  orphan container -> removing: $name ($image)"
    if docker rm -f "$id" >/dev/null 2>&1; then
      removed=$((removed + 1))
    else
      echo "  orphan container -> FAILED to remove: $name" >&2
    fi
  done
}

#######################################
# Step 3: remove dangling volumes (referenced by no container) left by removed test stacks.
# Keep-list volumes are spared: a warm canonical idles UNDEPLOYED with its data volume retained,
# so that volume is legitimately dangling. Failures stay silent — a volume still attached to a
# draining container is simply picked up by the next sweep.
# Globals:   removed (incremented per volume actually removed)
#######################################
sweep_volumes() {
  local -a vols=()
  local v
  mapfile -t vols < <(docker volume ls -qf dangling=true 2>/dev/null)
  if (( ${#vols[@]} == 0 )); then
    return 0
  fi
  for v in "${vols[@]}"; do
    [[ -n $v ]] || continue
    if is_kept "$v"; then
      continue
    fi
    if docker volume rm "$v" >/dev/null 2>&1; then
      echo "  leaked volume -> removed: $v"
      removed=$((removed + 1))
    fi
  done
}

#######################################
# Step 4: kill reparented `timeout … docker run …` wrappers left by manual recipe probes.
# Kills by explicit PID (NEVER pkill -f, which would self-match this script's own command line).
# Globals:   removed (incremented per process actually signalled)
#######################################
sweep_wrappers() {
  local -a pids=()
  local pid
  mapfile -t pids < <(ps -eo pid=,ppid=,cmd= 2>/dev/null | orphan_wrapper_pids)
  if (( ${#pids[@]} == 0 )); then
    return 0
  fi
  for pid in "${pids[@]}"; do
    echo "  orphan debug wrapper -> killing pid $pid"
    if kill "$pid" 2>/dev/null; then
      removed=$((removed + 1))
    fi
  done
}

echo "== orphan sweep: scanning (keep-list: infra + warm-* canonicals) =="

sweep_stacks
sweep_containers
sweep_volumes
sweep_wrappers

# 5) Stray exited containers (debug one-shots) — best-effort prune, limited to containers created
#    before the grace period so a probe that has only just exited keeps its logs.
docker container prune -f --filter "until=${MIN_AGE_SECS}s" >/dev/null 2>&1 || true

if [ "$removed" -eq 0 ]; then
  echo "== orphan sweep: clean (nothing to remove) =="
else
  echo "== orphan sweep: removed/killed $removed orphan(s) =="
fi
echo "-- surviving Swarm services (should be infra + warm-* only) --"
docker service ls --format '{{.Name}}' 2>/dev/null | sort