harness: fix A2 (janitor real-name + docker reap + age gate) and A3 (verified teardown)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
teardown_app now falls back to `docker stack rm`, removes the .env only after the stack is gone, retries volume rm, and verifies that no residual remains (raising TeardownError otherwise). The janitor now matches the real <recipe[:4]>-<6hex> naming scheme and reaps env-less orphans via docker. Verified. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
23
JOURNAL.md
23
JOURNAL.md
@ -366,3 +366,26 @@ M5 CLAIMED.
|
|||||||
|
|
||||||
**M3 still blocked** (webhook; no operator response across several ticks). Plan: if still blocked,
|
**M3 still blocked** (webhook; no operator response across several ticks). Plan: if still blocked,
|
||||||
pivot the bridge to poll the Gitea API (self-service, Adversary-endorsed) to unblock D1. Next: M6.
|
pivot the bridge to poll the Gitea API (self-service, Adversary-endorsed) to unblock D1. Next: M6.
|
||||||
|
|
||||||
|
## 2026-05-27 — Fix adversary findings A2 (dead janitor) + A3 (unverified teardown)
|
||||||
|
|
||||||
|
**A2 (janitor matched dead `-pr` filter):** rewrote `harness.lifecycle.janitor` to match the real
|
||||||
|
run-app naming (`RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`), reap via
|
||||||
|
docker primitives, AND scan `docker service ls` to catch orphans whose `.env` is already gone
|
||||||
|
(reconstructs the domain from the service name). Age-gated (default 2h, env `CCCI_JANITOR_MAX_AGE`)
|
||||||
|
so concurrent in-flight runs are never killed.
|
||||||
|
|
||||||
|
**A3 (teardown unverified + unconditional .env removal):** `teardown_app` now (1) `docker stack rm`
|
||||||
|
fallback if `abra undeploy` leaves services, (2) removes volumes/secrets *before* the `.env` and
|
||||||
|
only drops the `.env` after the stack is confirmed gone, (3) retries docker volume rm (a stopped
|
||||||
|
task briefly holds the volume), (4) **verifies** no residual services/volumes/secrets and raises
|
||||||
|
`TeardownError` otherwise — so a partial teardown FAILS the run instead of silently orphaning.
|
||||||
|
|
||||||
|
**Re-test (commands + output):**
|
||||||
|
- Normal install run → 2 passed, verified teardown clean.
|
||||||
|
- Orphan (deploy, no teardown) → `janitor(CCCI_JANITOR_MAX_AGE=0)` → services/volumes/secrets/env 0.
|
||||||
|
- **Env-less orphan** (deploy then `rm` the .env, the A3 bad state) → janitor reaps via docker stack
|
||||||
|
rm → services/volumes/secrets 0.
|
||||||
|
- Full 3-stage run (install/upgrade/backup) still green with verified teardown, no TeardownError.
|
||||||
|
|
||||||
|
A2/A3 fixed; left for the Adversary to re-test + close.
|
||||||
|
|||||||
@ -5,6 +5,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
import ssl
|
import ssl
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
@ -13,6 +15,52 @@ import urllib.request
|
|||||||
from . import abra
|
from . import abra
|
||||||
|
|
||||||
GATEWAY_IP = "143.244.213.108" # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)
|
GATEWAY_IP = "143.244.213.108" # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)
|
||||||
|
# A run app domain is "<recipe[:4]>-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the
|
||||||
|
# janitor to recognise orphaned run apps (infra apps like traefik/drone/backups don't match).
|
||||||
|
RUN_APP_RE = re.compile(r"^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$")
|
||||||
|
|
||||||
|
|
||||||
|
class TeardownError(RuntimeError):
    """Signals that a teardown left services, volumes or secrets behind."""
|
||||||
|
|
||||||
|
|
||||||
|
def _docker_names(kind: str, stack: str) -> list[str]:
    """Return the names printed by ``docker <kind> ls``, optionally filtered by name.

    Args:
        kind: docker object type to list: "service", "volume" or "secret".
        stack: value passed to docker's ``name`` filter. An empty string means
            "no filter", i.e. every object of that kind is listed.

    Returns:
        The non-blank output lines, each stripped of surrounding whitespace.

    The exit status of docker is not checked: whatever was printed to stdout is
    parsed, so a failing docker call is indistinguishable from "nothing found".
    """
    cmd = ["docker", kind, "ls"]
    if stack:
        # Only add the filter when there is something to filter on; an empty
        # `name=` value is never sent to docker.
        cmd += ["--filter", f"name={stack}"]
    cmd += ["--format", "{{.Name}}"]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    # splitlines() copes with "\r\n" endings; strip so callers can hand the
    # names straight back to `docker ... rm` / `inspect`.
    return [line.strip() for line in proc.stdout.splitlines() if line.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _residual(domain: str) -> dict:
    """Map "services"/"volumes"/"secrets" to the docker names still listed for the app's stack.

    The stack name is derived from ``domain`` via ``_stack_name``; each value is the
    list returned by ``_docker_names`` for that kind (empty when nothing is listed).
    """
    stack_name = _stack_name(domain)
    leftovers = {}
    for key, kind in (("services", "service"), ("volumes", "volume"), ("secrets", "secret")):
        leftovers[key] = _docker_names(kind, stack_name)
    return leftovers
|
||||||
|
|
||||||
|
|
||||||
|
def _stack_age_seconds(stack: str) -> float | None:
    """Return the age in seconds of the stack's oldest service.

    Returns None when no service is listed for the stack, or when none of the
    listed services yields a parsable creation timestamp.
    """
    services = _docker_names("service", stack)
    if not services:
        return None

    utc = datetime.timezone.utc
    ages = []
    for service in services:
        inspected = subprocess.run(
            ["docker", "service", "inspect", service, "--format", "{{.CreatedAt}}"],
            capture_output=True,
            text=True,
        )
        # docker emits e.g. "2026-05-27 00:12:33.123 +0000 UTC"; only the leading
        # "YYYY-MM-DD HH:MM:SS" (19 chars) is parsed and it is treated as UTC.
        stamp = inspected.stdout.strip()[:19]
        try:
            created = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # empty or unexpected output for this service: ignore it
            continue
        created = created.replace(tzinfo=utc)
        ages.append((datetime.datetime.now(utc) - created).total_seconds())

    return max(ages) if ages else None
|
||||||
|
|
||||||
|
|
||||||
def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
|
def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
|
||||||
@ -133,18 +181,74 @@ def http_body(domain: str, path: str = "/", timeout: int = 15) -> str:
|
|||||||
return resp.read().decode(errors="replace")
|
return resp.read().decode(errors="replace")
|
||||||
|
|
||||||
|
|
||||||
def teardown_app(domain: str) -> None:
|
def _force_stack_rm(stack: str, timeout: int = 120) -> None:
    """Run ``docker stack rm`` for ``stack`` and wait until its services are no longer listed.

    Works purely through docker, so no app .env is required. Polls every 2 seconds
    and gives up silently once ``timeout`` seconds have passed; the outcome of the
    ``docker stack rm`` command itself is not inspected.
    """
    subprocess.run(["docker", "stack", "rm", stack], capture_output=True, text=True)
    give_up_at = time.time() + timeout
    while time.time() < give_up_at:
        if not _docker_names("service", stack):
            return
        time.sleep(2)
|
||||||
|
|
||||||
|
|
||||||
|
def teardown_app(domain: str, verify: bool = True) -> None:
    """Full teardown with a docker fallback, then VERIFY nothing is left (raise otherwise).

    Order matters (A3): undeploy, then remove volumes/secrets *while the .env still
    exists* (abra needs it), then drop the .env LAST -- and only if no service of the
    stack is listed any more. If services survive `abra undeploy`, fall back to
    `docker stack rm` (which needs no .env).

    Args:
        domain: app domain; the docker stack name is derived from it via `_stack_name`.
        verify: when True, raise if any service, volume or secret of the stack is
            still listed after the teardown steps.

    Raises:
        TeardownError: `verify` is True and residual services/volumes/secrets remain.
    """
    stack = _stack_name(domain)

    abra.undeploy(domain)
    if _docker_names("service", stack):
        _force_stack_rm(stack)  # fallback: abra undeploy didn't clear it

    abra.volume_remove(domain)  # needs the .env -> before removing it
    abra.secret_remove_all(domain)

    # belt-and-suspenders: drop any volumes/secrets abra missed, by stack name. A volume can be
    # briefly held by a just-stopped task after `stack rm`, so retry the volume removal.
    deadline = time.time() + 60
    while time.time() < deadline:
        vols = _docker_names("volume", stack)
        if not vols:
            break
        for v in vols:
            subprocess.run(["docker", "volume", "rm", v], capture_output=True, text=True)
        if not _docker_names("volume", stack):
            break  # all gone: skip the pointless sleep
        time.sleep(3)
    for s in _docker_names("secret", stack):
        subprocess.run(["docker", "secret", "rm", s], capture_output=True, text=True)

    # Drop the .env only once the stack is really gone. If services are still
    # listed (e.g. the `docker stack rm` wait timed out), keep the .env so the
    # app stays visible to abra instead of becoming an env-less orphan.
    if not _docker_names("service", stack):
        abra.app_config_remove(domain)

    if verify:
        residual = _residual(domain)
        if any(residual.values()):
            raise TeardownError(f"teardown left residual for {domain}: {residual}")
|
||||||
|
|
||||||
|
|
||||||
def janitor(max_age_hours: int = 6) -> None:
|
def janitor(max_age_seconds: int | None = None) -> None:
    """Reap orphaned run apps left behind by crashed/rebooted runs.

    Candidates come from two places: apps listed by `abra.app_ls()` whose name
    matches `RUN_APP_RE`, and docker services whose name looks like a run-app stack
    (this catches stacks whose .env is already gone, which abra does not list).
    A candidate whose oldest service is younger than `max_age_seconds` is left
    alone (likely a concurrent in-flight run). A candidate with no measurable age
    (no services listed, or no parsable timestamp) is torn down regardless.

    Args:
        max_age_seconds: minimum age before a stack is reaped. When None, read
            from env `CCCI_JANITOR_MAX_AGE` (default 7200, i.e. 2h; 0 reaps all
            matching orphans immediately). A non-integer env value falls back to
            the default with a warning instead of aborting the janitor.

    Teardown failures of individual apps are logged and do not stop the sweep.
    """
    import logging
    import os

    log = logging.getLogger(__name__)
    default_max_age = 7200

    if max_age_seconds is None:
        raw = os.environ.get("CCCI_JANITOR_MAX_AGE", str(default_max_age))
        try:
            max_age_seconds = int(raw)
        except ValueError:
            log.warning(
                "janitor: ignoring invalid CCCI_JANITOR_MAX_AGE=%r; using %ss",
                raw, default_max_age,
            )
            max_age_seconds = default_max_age

    seen = set()
    for app in abra.app_ls():
        name = app.get("appName") or app.get("domain") or ""
        if RUN_APP_RE.match(name):
            seen.add(name)

    # also catch stacks whose .env was already deleted (abra ls won't list them)
    # svc like cust-c95a69_ci_commoninternet_net_app -> reconstruct domain
    orphan_svc_re = re.compile(r"^([a-z0-9]{1,4}-[0-9a-f]{6})_ci_commoninternet_net_")
    for svc in _docker_names("service", ""):
        m = orphan_svc_re.match(svc)
        if m:
            seen.add(f"{m.group(1)}.ci.commoninternet.net")

    for name in sorted(seen):  # sorted: deterministic reap order
        stack = _stack_name(name)
        age = _stack_age_seconds(stack)
        if age is not None and age < max_age_seconds:
            continue  # likely a concurrent in-flight run; leave it
        try:
            teardown_app(name, verify=False)
        except Exception:
            # best-effort sweep: record the failure and move on to the next app
            log.exception("janitor: teardown of %s failed; continuing", name)
|
||||||
|
|||||||
Reference in New Issue
Block a user