From 23bba98be4fc423444ae2d3e725a9e8889f6fd94 Mon Sep 17 00:00:00 2001
From: autonomic-bot
Date: Tue, 9 Jun 2026 15:42:23 +0000
Subject: [PATCH] feat(cleanup): guarantee step-2b dev deploys get reaped

- /recipe-upgrade step 2b: teardown is now MANDATORY on every exit path
  (finally), with a verify-no-leak check; tear down even on failure before
  reporting.
- reap-dev-deploys.sh: safe, age-gated backstop that removes only idle dev-*
  stacks (never CI per-run stacks, warm-*, infra; an active dev loop stays
  fresh).
- orchestrator: hourly cc-ci-reap-dev-deploys systemd timer runs it against
  cc-ci, bounding any leaked dev deploy from a crashed/abandoned loop.

Co-Authored-By: Claude Opus 4.8
---
 .claude/skills/recipe-upgrade/SKILL.md        | 17 +++++--
 .../skills/upgrade-all/reap-dev-deploys.sh    | 80 +++++++++++++++++++
 .../configuration.nix                         | 33 +++++++++++++
 3 files changed, 127 insertions(+), 3 deletions(-)
 create mode 100755 .claude/skills/upgrade-all/reap-dev-deploys.sh

diff --git a/.claude/skills/recipe-upgrade/SKILL.md b/.claude/skills/recipe-upgrade/SKILL.md
index 3f4c4c3..d1a342f 100644
--- a/.claude/skills/recipe-upgrade/SKILL.md
+++ b/.claude/skills/recipe-upgrade/SKILL.md
@@ -146,9 +146,20 @@ ssh cc-ci 'export PATH=/run/current-system/sw/bin:$PATH; set -a; . /srv/cc-ci/.t
 - **Iterate:** edit the recipe → `abra app deploy $D --chaos --force` to cycle → re-read logs, until
   it converges, serves, and any migration/config behaves. This is where you debug the real upgrade
   with full visibility — fold what you learn back into the recipe edit (step 2).
-- **Tear down when done — ALWAYS** (shared swarm): `script -qec "abra app undeploy $D -n" /dev/null`
-  then `script -qec "abra app rm $D -n --no-input" /dev/null` (removes volumes/secrets). The
-  `/upgrade-all` orphan-sweep (Step 0) is the backstop, but clean up explicitly.
+- **Tear down — MANDATORY, on every exit path** (success, RED, or abort — shared swarm): the dev
+  deploy is yours and you MUST remove it before you finish this recipe / move to the next one. Treat
+  it as a `finally`:
+  ```
+  ssh cc-ci 'export PATH=/run/current-system/sw/bin:$PATH; D=dev-.ci.commoninternet.net; \
+    script -qec "abra app undeploy $D -n" /dev/null 2>&1 | tail -2; \
+    script -qec "abra app rm $D -n --no-input" /dev/null 2>&1 | tail -2'
+  ```
+  Then **verify nothing leaked**: `ssh cc-ci 'docker stack ls --format "{{.Name}}" | grep -c "^dev-"'`
+  should print `0` (and no `dev-_*` volumes remain). If the recipe failed, tear the dev deploy
+  down anyway, THEN report the failure — never leave it running.
+  Backstops (defence-in-depth, NOT a substitute for the explicit teardown above): the `/upgrade-all`
+  orphan-sweep (Step 0) and the **hourly `cc-ci-reap-dev-deploys` timer** (reaps idle `dev-*` stacks),
+  so a crashed/abandoned loop's deploy is bounded — but you must still clean up yourself.
 - Caveats: shared swarm — keep to **ONE** `dev-` instance at a time and tear it down before the
   next recipe; the `dev-` domain is distinct from the harness's per-run domains and from the
   `warm-*` canonicals, so the sweep removes a leaked one without touching live services.
diff --git a/.claude/skills/upgrade-all/reap-dev-deploys.sh b/.claude/skills/upgrade-all/reap-dev-deploys.sh
new file mode 100755
index 0000000..5e5202b
--- /dev/null
+++ b/.claude/skills/upgrade-all/reap-dev-deploys.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# Reap LEAKED step-2b dev deploys on the cc-ci server.
+#
+# /recipe-upgrade step 2b deploys a recipe under a `dev-` domain to debug an upgrade with live
+# logs, and REQUIRES the agent to tear it down when done. This is the automated backstop for when that
+# teardown is missed (agent crashed / killed / abandoned mid-loop): it removes `dev-*` Swarm stacks
+# (+ their now-dangling volumes) whose newest service has not been updated in THRESHOLD seconds.
+#
+# SAFE to run anytime — even while CI is mid-run — because it is scoped + age-gated:
+#   - it touches ONLY the `dev-` naming convention used by step 2b. CI per-run stacks, `warm-*`
+#     canonicals, and infra are never `dev-*`, so never matched.
+#   - an ACTIVE dev loop redeploys (refreshing the service UpdatedAt), so it stays "fresh" and is NOT
+#     reaped mid-use; only an idle/abandoned `dev-*` ages past THRESHOLD and is removed.
+#   - volume cleanup uses `dangling=true`, so an active deploy's attached volumes are never removed.
+#   - a stack whose last update cannot be determined is reported as `skip:` and left in place.
+#
+# Env:  THRESHOLD  idle seconds before a dev stack is reaped (non-negative integer, default 14400).
+# Exit: 0 after a completed sweep; 2 if THRESHOLD is not a non-negative integer.
+#
+# Run ON the cc-ci host:  ssh cc-ci 'THRESHOLD=14400 bash -s' < reap-dev-deploys.sh
+set -uo pipefail
+export PATH=/run/current-system/sw/bin:$PATH
+
+THRESHOLD="${THRESHOLD:-14400}"   # 4h — generous, so a long but ACTIVE dev loop is never reaped
+if ! [[ "$THRESHOLD" =~ ^[0-9]+$ ]]; then
+  echo "reap-dev-deploys: THRESHOLD must be a non-negative integer, got '$THRESHOLD'" >&2
+  exit 2
+fi
+now=$(date +%s)
+reaped=0
+
+# Print the epoch seconds for timestamp $1, or 0 when it is empty or unparsable.
+to_epoch() {
+  local ts=$1
+  local go_zone_re='^(.*[+-][0-9]{4}) [A-Za-z]+$'
+  # An empty string must not reach `date -d`: GNU date reads it as "today 00:00", not as an error.
+  [[ -n "$ts" ]] || { echo 0; return; }
+  # Go's default time layout ("2006-01-02 15:04:05.999999999 +0000 UTC") repeats the zone as an
+  # abbreviation after the numeric offset; `date -d` may reject that, so keep only the offset.
+  if [[ "$ts" =~ $go_zone_re ]]; then
+    ts=${BASH_REMATCH[1]}
+  fi
+  date -d "$ts" +%s 2>/dev/null || echo 0
+}
+
+mapfile -t STACKS < <(docker stack ls --format '{{.Name}}' 2>/dev/null | grep -E '^dev-' || true)
+for s in "${STACKS[@]}"; do
+  [ -z "$s" ] && continue
+  newest=0
+  for sid in $(docker service ls --filter "label=com.docker.stack.namespace=$s" -q 2>/dev/null); do
+    ua=$(docker service inspect "$sid" --format '{{.UpdatedAt}}' 2>/dev/null)
+    e=$(to_epoch "$ua")
+    [ "$e" -gt "$newest" ] && newest="$e"
+  done
+  if [ "$newest" -eq 0 ]; then
+    # No service timestamp could be read, so idleness is unknown: never reap on a guess.
+    echo "skip: dev stack '$s' — last update unknown, leaving it in place"
+    continue
+  fi
+  age=$(( now - newest ))
+  if [ "$age" -gt "$THRESHOLD" ]; then
+    echo "reap: dev stack '$s' idle ${age}s (> ${THRESHOLD}s) — removing"
+    # Count only confirmed removals so the summary and the volume sweep reflect what happened.
+    if docker stack rm "$s" >/dev/null 2>&1; then
+      reaped=$((reaped + 1))
+    else
+      echo "warn: could not remove dev stack '$s' (docker stack rm failed) — left in place"
+    fi
+  else
+    echo "keep: dev stack '$s' active (last update ${age}s ago)"
+  fi
+done
+
+if [ "$reaped" -gt 0 ]; then
+  sleep 8   # let removed stacks' services drain so their volumes become dangling
+  for v in $(docker volume ls -qf dangling=true 2>/dev/null | grep -E '^dev-' || true); do
+    docker volume rm "$v" >/dev/null 2>&1 && echo "reap: removed leaked volume $v"
+  done
+fi
+echo "reap-dev-deploys: ${reaped} stale dev deploy(s) removed"
diff --git a/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix b/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix
index fd14b5e..b0b8550 100644
--- a/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix
+++ b/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix
@@ -220,4 +220,37 @@ SSHCFG
       Persistent = true;  # if the box was down at the scheduled time, run once on next boot
     };
   };
+
+  # Hourly reaper for LEAKED /recipe-upgrade step-2b dev deploys (`dev-*` stacks on the cc-ci server).
+  # The upgrader must tear down its own dev deploy; this is the automated backstop for a missed
+  # teardown (crashed/abandoned loop). reap-dev-deploys.sh is scoped + age-gated so it is safe to run
+  # even mid-CI: it only touches `dev-*`, and only when idle > THRESHOLD (an active dev loop keeps
+  # redeploying and is never reaped). cc-ci-plan/IDEAS.md tracks the eventual separate-infra fix; this
+  # just bounds the leak window in the meantime.
+  systemd.services.cc-ci-reap-dev-deploys = {
+    description = "Reap leaked step-2b dev deploys (dev-* stacks) on the cc-ci server";
+    after = [ "network-online.target" "tailscaled.service" ];
+    wants = [ "network-online.target" ];
+    serviceConfig = {
+      Type = "oneshot";
+      User = "loops"; Group = "users";
+      WorkingDirectory = "/srv/cc-ci";
+    };
+    environment = { HOME = "/home/loops"; };
+    path = [ pkgs.bash pkgs.openssh pkgs.coreutils ];
+    script = ''
+      ssh cc-ci 'THRESHOLD=14400 bash -s' \
+        < /srv/cc-ci/.claude/skills/upgrade-all/reap-dev-deploys.sh \
+        >> /srv/cc-ci/.cc-ci-logs/reap-dev-deploys.log 2>&1
+    '';
+  };
+
+  systemd.timers.cc-ci-reap-dev-deploys = {
+    description = "Hourly reaper for leaked step-2b dev deploys on cc-ci";
+    wantedBy = [ "timers.target" ];
+    timerConfig = {
+      OnCalendar = "hourly";
+      Persistent = true;
+    };
+  };
 }