harness: fix A2 (janitor real-name + docker reap + age gate) and A3 (verified teardown)
All checks were successful
continuous-integration/drone/push Build is passing

teardown_app now falls back to `docker stack rm`, removes the .env only after the stack is gone,
retries volume rm, and verifies that nothing is left (raising TeardownError otherwise). The janitor
matches the real <recipe[:4]>-<6hex> naming scheme and reaps env-less orphans via docker. Verified.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-27 01:05:04 +01:00
parent b8f3473777
commit b7a2d70380
2 changed files with 136 additions and 9 deletions

View File

@ -366,3 +366,26 @@ M5 CLAIMED.
**M3 still blocked** (webhook; no operator response across several ticks). Plan: if still blocked,
pivot the bridge to poll the Gitea API (self-service, Adversary-endorsed) to unblock D1. Next: M6.
## 2026-05-27 — Fix adversary findings A2 (dead janitor) + A3 (unverified teardown)
**A2 (janitor matched dead `-pr` filter):** rewrote `harness.lifecycle.janitor` to match the real
run-app naming (`RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`), reap via
docker primitives, AND scan `docker service ls` to catch orphans whose `.env` is already gone
(reconstructs the domain from the service name). Age-gated (default 2h, env `CCCI_JANITOR_MAX_AGE`)
so concurrent in-flight runs are never killed.
**A3 (teardown unverified + unconditional .env removal):** `teardown_app` now (1) falls back to
`docker stack rm` if `abra undeploy` leaves services, (2) removes volumes/secrets *before* the `.env`
and only drops the `.env` after the stack is confirmed gone, (3) retries `docker volume rm` (a stopped
task briefly holds the volume), (4) **verifies** that no residual services/volumes/secrets remain and
raises `TeardownError` otherwise — so a partial teardown FAILS the run instead of silently orphaning.
**Re-test (commands + output):**
- Normal install run → 2 passed, verified teardown clean.
- Orphan (deploy, no teardown) → `janitor(CCCI_JANITOR_MAX_AGE=0)` → services/volumes/secrets/env 0.
- **Env-less orphan** (deploy then `rm` the .env, the A3 bad state) → janitor reaps via docker stack
rm → services/volumes/secrets 0.
- Full 3-stage run (install/upgrade/backup) still green with verified teardown, no TeardownError.
A2/A3 fixed; left for the Adversary to re-test + close.

View File

@ -5,6 +5,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer
"""
from __future__ import annotations
import datetime
import re
import ssl
import subprocess
import time
@ -13,6 +15,52 @@ import urllib.request
from . import abra
GATEWAY_IP = "143.244.213.108" # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)
# A run app domain is "<recipe[:4]>-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the
# janitor to recognise orphaned run apps (infra apps like traefik/drone/backups don't match).
# The pattern is anchored at both ends: 1-4 characters from [a-z0-9], a hyphen, exactly six
# lowercase hex digits, then the literal ".ci.commoninternet.net" suffix.
RUN_APP_RE = re.compile(r"^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$")
class TeardownError(RuntimeError):
    """Raised when a teardown leaves docker services, volumes or secrets behind."""
def _docker_names(kind: str, stack: str) -> list[str]:
    """List the names printed by ``docker <kind> ls`` when filtered with ``name=<stack>``.

    ``kind`` is the docker object type (service, volume or secret). Blank output lines are
    dropped. The command's exit status is not inspected, so a failing docker call simply
    produces an empty list.
    """
    cmd = ["docker", kind, "ls", "--filter", f"name={stack}", "--format", "{{.Name}}"]
    listing = subprocess.run(cmd, capture_output=True, text=True).stdout
    return [line for line in listing.split("\n") if line.strip()]
def _residual(domain: str) -> dict:
    """Report what docker still lists for the app's stack, keyed by object type.

    The result has the keys "services", "volumes" and "secrets", each mapping to the list of
    names that ``_docker_names`` returns for the stack derived from ``domain``.
    """
    stack = _stack_name(domain)
    kinds = {"services": "service", "volumes": "volume", "secrets": "secret"}
    return {label: _docker_names(kind, stack) for label, kind in kinds.items()}
def _stack_age_seconds(stack: str) -> float | None:
    """Return the age in seconds of the oldest service in ``stack``.

    Returns None when the stack has no services, or when none of their creation timestamps
    could be parsed.
    """
    ages = []
    for service in _docker_names("service", stack):
        inspected = subprocess.run(
            ["docker", "service", "inspect", service, "--format", "{{.CreatedAt}}"],
            capture_output=True, text=True,
        )
        # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> only the leading 19 chars
        # (date and whole seconds) are parsed, and the value is treated as UTC.
        stamp = inspected.stdout.strip()[:19]
        try:
            created = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue  # unparseable or empty timestamp: ignore this service
        created = created.replace(tzinfo=datetime.timezone.utc)
        now = datetime.datetime.now(datetime.timezone.utc)
        ages.append((now - created).total_seconds())
    return max(ages) if ages else None
def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
@ -133,18 +181,74 @@ def http_body(domain: str, path: str = "/", timeout: int = 15) -> str:
return resp.read().decode(errors="replace")
def teardown_app(domain: str) -> None:
"""Idempotent, best-effort full teardown. Never raises (finalizer-safe)."""
def _force_stack_rm(stack: str, timeout: int = 120) -> None:
    """Run ``docker stack rm`` (no .env needed) and wait for the stack's services to vanish.

    Polls every 2 seconds and returns once no service is listed for ``stack`` or once
    ``timeout`` seconds have elapsed, whichever comes first. Does not report which happened.
    """
    subprocess.run(["docker", "stack", "rm", stack], capture_output=True, text=True)
    give_up_at = time.time() + timeout
    while True:
        if time.time() >= give_up_at:
            return
        if not _docker_names("service", stack):
            return
        time.sleep(2)
def teardown_app(domain: str, verify: bool = True) -> None:
    """Full teardown with a docker fallback, then VERIFY nothing is left (raise otherwise).

    Order matters (A3): undeploy, then remove volumes/secrets *while the .env still exists*
    (abra needs it), then drop the .env LAST — and only if no service of the stack is listed
    any more. If abra undeploy leaves services behind, fall back to `docker stack rm` (which
    needs no .env).

    Args:
        domain: the app domain; the docker stack name is derived from it via ``_stack_name``.
        verify: when true, re-list the stack's services/volumes/secrets after the teardown
            and raise if any remain.

    Raises:
        TeardownError: if ``verify`` is true and anything is still listed for the stack.
    """
    stack = _stack_name(domain)

    abra.undeploy(domain)
    if _docker_names("service", stack):
        _force_stack_rm(stack)  # fallback: abra undeploy didn't clear it

    # Both of these need the .env, so they run before it is removed.
    abra.volume_remove(domain)
    abra.secret_remove_all(domain)

    # belt-and-suspenders: drop any volumes/secrets abra missed, by stack name. A volume can be
    # briefly held by a just-stopped task after `stack rm`, so retry the volume removal.
    deadline = time.time() + 60
    while time.time() < deadline:
        leftover = _docker_names("volume", stack)
        if not leftover:
            break
        for volume in leftover:
            subprocess.run(["docker", "volume", "rm", volume], capture_output=True, text=True)
        if not _docker_names("volume", stack):
            break
        time.sleep(3)
    for secret in _docker_names("secret", stack):
        subprocess.run(["docker", "secret", "rm", secret], capture_output=True, text=True)

    # Drop the .env only once the stack's services are really gone; if they are not, the
    # .env is kept and the verification below reports the failure.
    if not _docker_names("service", stack):
        abra.app_config_remove(domain)

    if verify:
        residual = _residual(domain)
        if any(residual.values()):
            raise TeardownError(f"teardown left residual for {domain}: {residual}")
def janitor(max_age_hours: int = 6) -> None:
"""Remove orphaned *-pr* apps left by crashed runs older than max_age_hours."""
def janitor(max_age_seconds: int | None = None) -> None:
    """Reap orphaned run apps left behind by crashed/rebooted runs.

    Candidates are (a) apps listed by abra whose name matches RUN_APP_RE and (b) docker
    services whose name starts with a run-app stack prefix, which catches stacks whose .env
    is already gone (A2/A3). A candidate is skipped while its oldest service is younger than
    ``max_age_seconds``, so concurrent in-flight runs are never killed. Everything else is
    torn down via ``teardown_app(..., verify=False)``; a failure is logged and the sweep
    continues with the next candidate.

    Args:
        max_age_seconds: minimum age before an app is reaped. When None, read from the
            CCCI_JANITOR_MAX_AGE environment variable, default 7200 (2h); 0 reaps all
            matching orphans immediately.
    """
    import logging
    import os

    if max_age_seconds is None:
        max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200"))

    candidates = set()
    for app in abra.app_ls():
        name = app.get("appName") or app.get("domain") or ""
        if RUN_APP_RE.match(name):
            candidates.add(name)

    # also catch stacks whose .env was already deleted (abra ls won't list them)
    # svc like cust-c95a69_ci_commoninternet_net_app -> reconstruct domain
    service_re = re.compile(r"^([a-z0-9]{1,4}-[0-9a-f]{6})_ci_commoninternet_net_")
    for svc in _docker_names("service", ""):
        found = service_re.match(svc)
        if found:
            candidates.add(f"{found.group(1)}.ci.commoninternet.net")

    for name in sorted(candidates):
        age = _stack_age_seconds(_stack_name(name))
        # NOTE(review): age is None when no service (or no parseable timestamp) exists, and
        # such apps are reaped regardless of max_age_seconds — confirm that a run which has
        # created its .env but not yet deployed cannot be hit by this.
        if age is not None and age < max_age_seconds:
            continue  # likely a concurrent in-flight run; leave it
        try:
            teardown_app(name, verify=False)
        except Exception:
            # Best-effort sweep: never let one app abort the rest, but leave a trace.
            logging.getLogger(__name__).warning(
                "janitor: teardown of %s failed", name, exc_info=True
            )