diff --git a/JOURNAL.md b/JOURNAL.md index 2be12f1..fcd13f7 100644 --- a/JOURNAL.md +++ b/JOURNAL.md @@ -366,3 +366,26 @@ M5 CLAIMED. **M3 still blocked** (webhook; no operator response across several ticks). Plan: if still blocked, pivot the bridge to poll the Gitea API (self-service, Adversary-endorsed) to unblock D1. Next: M6. + +## 2026-05-27 — Fix adversary findings A2 (dead janitor) + A3 (unverified teardown) + +**A2 (janitor matched dead `-pr` filter):** rewrote `harness.lifecycle.janitor` to match the real +run-app naming (`RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`), reap via +docker primitives, AND scan `docker service ls` to catch orphans whose `.env` is already gone +(reconstructs the domain from the service name). Age-gated (default 2h, env `CCCI_JANITOR_MAX_AGE`) +so concurrent in-flight runs are never killed. + +**A3 (teardown unverified + unconditional .env removal):** `teardown_app` now (1) `docker stack rm` +fallback if `abra undeploy` leaves services, (2) removes volumes/secrets *before* the `.env` and +only drops the `.env` after the stack is confirmed gone, (3) retries docker volume rm (a stopped +task briefly holds the volume), (4) **verifies** no residual services/volumes/secrets and raises +`TeardownError` otherwise — so a partial teardown FAILS the run instead of silently orphaning. + +**Re-test (commands + output):** +- Normal install run → 2 passed, verified teardown clean. +- Orphan (deploy, no teardown) → `janitor(CCCI_JANITOR_MAX_AGE=0)` → services/volumes/secrets/env 0. +- **Env-less orphan** (deploy then `rm` the .env, the A3 bad state) → janitor reaps via docker stack + rm → services/volumes/secrets 0. +- Full 3-stage run (install/upgrade/backup) still green with verified teardown, no TeardownError. + +A2/A3 fixed; left for the Adversary to re-test + close. 
diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index 0e14193..8dd6a5a 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -5,6 +5,10 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer
 """
 from __future__ import annotations
 
+import datetime
+import logging
+import os
+import re
 import ssl
 import subprocess
 import time
@@ -13,6 +17,64 @@ import urllib.request
 from . import abra
 
 GATEWAY_IP = "143.244.213.108"  # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)
+# A run app domain is "<1-4 alnum>-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the
+# janitor to recognise orphaned run apps (infra apps like traefik/drone/backups don't match).
+RUN_APP_RE = re.compile(r"^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$")
+
+_log = logging.getLogger(__name__)
+
+
+class TeardownError(RuntimeError):
+    """Raised by teardown_app when verification finds leftover services, volumes or secrets."""
+
+
+def _docker_names(kind: str, stack: str) -> list[str]:
+    """List `docker <kind> ls` names for a stack (kind: service|volume|secret).
+
+    `stack` is passed as docker's `name` filter; an empty `stack` drops the filter and lists
+    everything of that kind. A failing docker call is logged and reads as "nothing found".
+    """
+    cmd = ["docker", kind, "ls", "--format", "{{.Name}}"]
+    if stack:
+        cmd += ["--filter", f"name={stack}"]
+    proc = subprocess.run(cmd, capture_output=True, text=True)
+    if proc.returncode != 0:
+        _log.warning("docker %s ls failed (rc=%s): %s", kind, proc.returncode, proc.stderr.strip())
+    return [n for n in proc.stdout.splitlines() if n.strip()]
+
+
+def _residual(domain: str) -> dict:
+    """Map each resource kind to the names still present for the domain's stack."""
+    stack = _stack_name(domain)
+    return {
+        "services": _docker_names("service", stack),
+        "volumes": _docker_names("volume", stack),
+        "secrets": _docker_names("secret", stack),
+    }
+
+
+def _stack_age_seconds(stack: str) -> float | None:
+    """Age of the stack's oldest service, or None if not present.
+
+    Services whose CreatedAt cannot be parsed are skipped, so this is also None when none parses.
+    """
+    svcs = _docker_names("service", stack)
+    if not svcs:
+        return None
+    oldest = None
+    for s in svcs:
+        p = subprocess.run(["docker", "service", "inspect", s, "--format", "{{.CreatedAt}}"],
+                           capture_output=True, text=True)
+        ts = p.stdout.strip()
+        try:
+            # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> take the leading 19 chars
+            dt = datetime.datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace(
+                tzinfo=datetime.timezone.utc)
+        except ValueError:
+            continue
+        age = (datetime.datetime.now(datetime.timezone.utc) - dt).total_seconds()
+        oldest = age if oldest is None else max(oldest, age)
+    return oldest
 
 
 def deploy_app(recipe: str, domain: str, version: str | None = None, secrets: bool = True) -> None:
@@ -133,18 +195,88 @@ def http_body(domain: str, path: str = "/", timeout: int = 15) -> str:
         return resp.read().decode(errors="replace")
 
 
-def teardown_app(domain: str) -> None:
-    """Idempotent, best-effort full teardown. Never raises (finalizer-safe)."""
+def _force_stack_rm(stack: str, timeout: int = 120) -> None:
+    """Remove a stack's services directly (no .env needed) and wait for them to disappear.
+
+    Polls every 2s for at most `timeout` seconds and returns even if services remain, so
+    callers re-check with _docker_names() before relying on the stack being gone.
+    """
+    subprocess.run(["docker", "stack", "rm", stack], capture_output=True, text=True)
+    deadline = time.time() + timeout
+    while time.time() < deadline and _docker_names("service", stack):
+        time.sleep(2)
+
+
+def teardown_app(domain: str, verify: bool = True) -> None:
+    """Full teardown with a docker fallback, then VERIFY nothing is left (raise otherwise).
+
+    Order matters (A3): undeploy, then remove volumes/secrets *while the .env still exists* (abra
+    needs it), then drop the .env LAST — and only after the stack is confirmed gone. If abra
+    undeploy fails, fall back to `docker stack rm` (which needs no .env).
+
+    Raises TeardownError when `verify` is true and services, volumes or secrets remain.
+    """
+    stack = _stack_name(domain)
     abra.undeploy(domain)
-    abra.volume_remove(domain)
+    if _docker_names("service", stack):
+        _force_stack_rm(stack)  # fallback: abra undeploy didn't clear it
+    abra.volume_remove(domain)  # needs the .env -> before removing it
     abra.secret_remove_all(domain)
-    abra.app_config_remove(domain)
+    # belt-and-suspenders: drop any volumes/secrets abra missed, by stack name. A volume can be
+    # briefly held by a just-stopped task after `stack rm`, so retry the volume removal.
+    deadline = time.time() + 60
+    while time.time() < deadline:
+        vols = _docker_names("volume", stack)
+        if not vols:
+            break
+        for v in vols:
+            subprocess.run(["docker", "volume", "rm", v], capture_output=True, text=True)
+        if not _docker_names("volume", stack):
+            break
+        time.sleep(3)
+    for s in _docker_names("secret", stack):
+        subprocess.run(["docker", "secret", "rm", s], capture_output=True, text=True)
+    # Drop the .env LAST and only once the services are really gone: a surviving stack without
+    # its .env is exactly the env-less orphan state this ordering exists to prevent.
+    if not _docker_names("service", stack):
+        abra.app_config_remove(domain)
+
+    if verify:
+        residual = _residual(domain)
+        if any(residual.values()):
+            raise TeardownError(f"teardown left residual for {domain}: {residual}")
 
 
-def janitor(max_age_hours: int = 6) -> None:
-    """Remove orphaned *-pr* apps left by crashed runs older than max_age_hours."""
+def janitor(max_age_seconds: int | None = None) -> None:
+    """Reap orphaned run apps left behind by crashed/rebooted runs.
+
+    Candidates are abra apps matching RUN_APP_RE plus docker services whose name follows the run
+    stack pattern (covers stacks whose .env is already gone, A2/A3). A stack younger than
+    max_age_seconds is skipped so concurrent in-flight runs are never killed. Default 2h,
+    env-overridable via CCCI_JANITOR_MAX_AGE (e.g. 0 to reap all matching orphans immediately).
+    Teardown failures are logged and do not stop the remaining reaps.
+    """
+    if max_age_seconds is None:
+        max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200"))
+    seen = set()
     for app in abra.app_ls():
         name = app.get("appName") or app.get("domain") or ""
-        if "-pr" in name and ".ci.commoninternet.net" in name:
-            # best-effort; deployed-status/age detail varies by abra version
-            teardown_app(name)
+        if RUN_APP_RE.match(name):
+            seen.add(name)
+    # also catch stacks whose .env was already deleted (abra ls won't list them)
+    for svc in _docker_names("service", ""):
+        # svc like cust-c95a69_ci_commoninternet_net_app -> reconstruct domain
+        m = re.match(r"^([a-z0-9]{1,4}-[0-9a-f]{6})_ci_commoninternet_net_", svc)
+        if m:
+            seen.add(f"{m.group(1)}.ci.commoninternet.net")
+
+    for name in sorted(seen):
+        stack = _stack_name(name)
+        age = _stack_age_seconds(stack)
+        if age is not None and age < max_age_seconds:
+            continue  # likely a concurrent in-flight run; leave it
+        try:
+            teardown_app(name, verify=False)
+        except Exception:
+            # best-effort: one failing reap must not stop the others, but leave a trace
+            _log.exception("janitor: teardown of %s failed", name)