harness: fix A2 (janitor real-name + docker reap + age gate) and A3 (verified teardown)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
teardown_app now falls back to `docker stack rm`, removes the .env only after the stack is gone, retries the volume rm, and verifies that no residual is left (raising TeardownError otherwise). The janitor now matches the real <recipe[:4]>-<6hex> naming scheme and reaps env-less orphans via docker. Verified. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
23
JOURNAL.md
23
JOURNAL.md
@ -366,3 +366,26 @@ M5 CLAIMED.
|
||||
|
||||
**M3 still blocked** (webhook; no operator response across several ticks). Plan: if still blocked,
|
||||
pivot the bridge to poll the Gitea API (self-service, Adversary-endorsed) to unblock D1. Next: M6.
|
||||
|
||||
## 2026-05-27 — Fix adversary findings A2 (dead janitor) + A3 (unverified teardown)
|
||||
|
||||
**A2 (janitor matched dead `-pr` filter):** rewrote `harness.lifecycle.janitor` to match the real
|
||||
run-app naming (`RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`), reap via
|
||||
docker primitives, AND scan `docker service ls` to catch orphans whose `.env` is already gone
|
||||
(reconstructs the domain from the service name). Age-gated (default 2h, env `CCCI_JANITOR_MAX_AGE`)
|
||||
so concurrent in-flight runs are never killed.
|
||||
|
||||
**A3 (teardown unverified + unconditional .env removal):** `teardown_app` now (1) `docker stack rm`
|
||||
fallback if `abra undeploy` leaves services, (2) removes volumes/secrets *before* the `.env` and
|
||||
only drops the `.env` after the stack is confirmed gone, (3) retries docker volume rm (a stopped
|
||||
task briefly holds the volume), (4) **verifies** no residual services/volumes/secrets and raises
|
||||
`TeardownError` otherwise — so a partial teardown FAILS the run instead of silently orphaning.
|
||||
|
||||
**Re-test (commands + output):**
|
||||
- Normal install run → 2 passed, verified teardown clean.
|
||||
- Orphan (deploy, no teardown) → `janitor(CCCI_JANITOR_MAX_AGE=0)` → services/volumes/secrets/env 0.
|
||||
- **Env-less orphan** (deploy then `rm` the .env, the A3 bad state) → janitor reaps via docker stack
|
||||
rm → services/volumes/secrets 0.
|
||||
- Full 3-stage run (install/upgrade/backup) still green with verified teardown, no TeardownError.
|
||||
|
||||
A2/A3 fixed; left for the Adversary to re-test + close.
|
||||
|
||||
@ -5,6 +5,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import re
|
||||
import ssl
|
||||
import subprocess
|
||||
import time
|
||||
@ -13,6 +15,52 @@ import urllib.request
|
||||
from . import abra
|
||||
|
||||
# Gateway address: *.ci.commoninternet.net resolves here (TLS passthrough to cc-ci).
GATEWAY_IP = "143.244.213.108"

# Run apps get a domain of the form "<recipe[:4]>-<6hex>.ci.commoninternet.net" (see
# DECISIONS.md). The janitor uses this pattern to recognise orphaned run apps; infra apps such
# as traefik/drone/backups do not match it.
RUN_APP_RE = re.compile(r"^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$")
|
||||
|
||||
|
||||
class TeardownError(RuntimeError):
    """Raised when a teardown leaves services, volumes or secrets behind."""
|
||||
|
||||
|
||||
def _docker_names(kind: str, stack: str) -> list[str]:
    """Return the names printed by `docker <kind> ls` under a `name=<stack>` filter.

    `kind` is one of service|volume|secret. The exit status of the docker call is not
    checked; only its stdout is read.
    """
    cmd = ["docker", kind, "ls", "--filter", f"name={stack}", "--format", "{{.Name}}"]
    listing = subprocess.run(cmd, capture_output=True, text=True).stdout
    names = []
    for line in listing.split("\n"):
        if line.strip():  # skip blank lines such as the one after the trailing newline
            names.append(line)
    return names
|
||||
|
||||
|
||||
def _residual(domain: str) -> dict:
    """Collect what docker still lists for the stack derived from `domain`.

    Returns a dict with the keys "services", "volumes" and "secrets", each mapping to the
    list of matching names (empty when nothing is left).
    """
    stack = _stack_name(domain)
    leftovers = {}
    for key, kind in (("services", "service"), ("volumes", "volume"), ("secrets", "secret")):
        leftovers[key] = _docker_names(kind, stack)
    return leftovers
|
||||
|
||||
|
||||
def _stack_age_seconds(stack: str) -> float | None:
    """Return the age in seconds of the stack's oldest service.

    Returns None when the stack has no services, and also when none of the services'
    creation timestamps could be parsed.
    """
    services = _docker_names("service", stack)
    if not services:
        return None
    ages = []
    for service in services:
        inspect = subprocess.run(
            ["docker", "service", "inspect", service, "--format", "{{.CreatedAt}}"],
            capture_output=True, text=True,
        )
        created = inspect.stdout.strip()
        try:
            # docker emits e.g. "2026-05-27 00:12:33.123 +0000 UTC"; only the leading 19
            # chars (date + time to the second) are parsed, and the result is taken as UTC.
            stamp = datetime.datetime.strptime(created[:19], "%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue  # empty or unparseable timestamp: leave this service out
        stamp = stamp.replace(tzinfo=datetime.timezone.utc)
        ages.append((datetime.datetime.now(datetime.timezone.utc) - stamp).total_seconds())
    # the oldest service is the one with the largest age
    return max(ages) if ages else None
|
||||
|
||||
|
||||
def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
|
||||
@ -133,18 +181,74 @@ def http_body(domain: str, path: str = "/", timeout: int = 15) -> str:
|
||||
return resp.read().decode(errors="replace")
|
||||
|
||||
|
||||
def teardown_app(domain: str) -> None:
|
||||
"""Idempotent, best-effort full teardown. Never raises (finalizer-safe)."""
|
||||
def _force_stack_rm(stack: str, timeout: int = 120) -> None:
    """Run `docker stack rm` for `stack` (no .env needed), then wait for its services to go.

    Polls the service list every 2 seconds and returns quietly once `timeout` seconds have
    elapsed, so the caller has to check for itself whether services remain.
    """
    subprocess.run(["docker", "stack", "rm", stack], capture_output=True, text=True)
    give_up_at = time.time() + timeout
    while True:
        if time.time() >= give_up_at:
            return
        if not _docker_names("service", stack):
            return
        time.sleep(2)
|
||||
|
||||
|
||||
def teardown_app(domain: str, verify: bool = True) -> None:
    """Full teardown with a docker fallback, then VERIFY nothing is left (raise otherwise).

    Order matters (A3):
      1. `abra undeploy`; if services are still listed afterwards, fall back to
         `docker stack rm` (which needs no .env).
      2. Remove volumes and secrets through abra *while the .env still exists* (abra needs it).
      3. Belt-and-suspenders: remove leftover volumes/secrets directly via docker, by stack name.
      4. Drop the .env LAST, and only once no service of the stack is listed any more.

    Args:
        domain: app domain; the docker stack name is derived from it via `_stack_name`.
        verify: when true, re-list services/volumes/secrets at the end and raise if any remain.

    Raises:
        TeardownError: if `verify` is true and anything is still listed for the stack.
    """
    stack = _stack_name(domain)
    abra.undeploy(domain)
    if _docker_names("service", stack):
        _force_stack_rm(stack)  # fallback: abra undeploy didn't clear it
    abra.volume_remove(domain)  # needs the .env -> before removing it
    abra.secret_remove_all(domain)

    # belt-and-suspenders: drop any volumes/secrets abra missed, by stack name. A volume can be
    # briefly held by a just-stopped task after `stack rm`, so retry the volume removal.
    deadline = time.time() + 60
    while time.time() < deadline:
        vols = _docker_names("volume", stack)
        if not vols:
            break
        for v in vols:
            subprocess.run(["docker", "volume", "rm", v], capture_output=True, text=True)
        if not _docker_names("volume", stack):
            break
        time.sleep(3)
    for s in _docker_names("secret", stack):
        subprocess.run(["docker", "secret", "rm", s], capture_output=True, text=True)

    # Drop the .env only now, and only if the stack really is gone: while services remain the
    # .env is kept so the app stays visible to abra for a later retry.
    if not _docker_names("service", stack):
        abra.app_config_remove(domain)

    if verify:
        residual = _residual(domain)
        if any(residual.values()):
            raise TeardownError(f"teardown left residual for {domain}: {residual}")
|
||||
|
||||
|
||||
def janitor(max_age_hours: int = 6) -> None:
|
||||
"""Remove orphaned *-pr* apps left by crashed runs older than max_age_hours."""
|
||||
def janitor(max_age_seconds: int | None = None) -> None:
    """Reap orphaned run apps from crashed/rebooted runs.

    Candidates are gathered from two places: apps listed by abra whose name matches
    RUN_APP_RE, and docker services whose name matches the run-app stack prefix (this catches
    stacks whose .env is already gone, A2/A3). A candidate is skipped while its stack is
    younger than `max_age_seconds`, so concurrent in-flight runs are never killed. Reaping is
    best-effort: a failing teardown is logged and the remaining candidates are still processed.

    Args:
        max_age_seconds: minimum stack age before reaping. When None, read from the env var
            CCCI_JANITOR_MAX_AGE (default 7200, i.e. 2h; 0 reaps all matching orphans
            immediately).
    """
    import logging
    import os

    if max_age_seconds is None:
        max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200"))

    seen = set()
    for app in abra.app_ls():
        name = app.get("appName") or app.get("domain") or ""
        if RUN_APP_RE.match(name):
            seen.add(name)
    # also catch stacks whose .env was already deleted (abra ls won't list them)
    for svc in _docker_names("service", ""):
        # svc like cust-c95a69_ci_commoninternet_net_app -> reconstruct domain
        m = re.match(r"^([a-z0-9]{1,4}-[0-9a-f]{6})_ci_commoninternet_net_", svc)
        if m:
            seen.add(f"{m.group(1)}.ci.commoninternet.net")

    log = logging.getLogger(__name__)
    for name in sorted(seen):
        age = _stack_age_seconds(_stack_name(name))
        if age is not None and age < max_age_seconds:
            continue  # likely a concurrent in-flight run; leave it
        try:
            teardown_app(name, verify=False)
        except Exception:
            # best-effort: one failing teardown must not stop the sweep, but leave a trace
            log.warning("janitor: teardown of %s failed", name, exc_info=True)
|
||||
|
||||
Reference in New Issue
Block a user