From b492f995bdfc4be2248938d8b74ed0da15147fb1 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 10 Jun 2026 04:04:28 +0000 Subject: [PATCH 1/6] =?UTF-8?q?feat(harness):=20P1=20lock-lifetime=20harde?= =?UTF-8?q?ning=20=E2=80=94=20PDEATHSIG=20+=20SIGTERM/SIGALRM=20teardown?= =?UTF-8?q?=20funnel=20+=2060-min=20hard=20deadline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - new harness/lifetime.py: install_lifetime_guards() arms PR_SET_PDEATHSIG(SIGTERM) (with post-prctl ppid==1 orphan refusal), a SIGTERM handler raising SystemExit through the run's finally: teardown funnel (exit 143), and signal.alarm(3600) funnelling SIGALRM the same way with a distinct deadline log line (exit 142). Re-entrant signals during teardown are logged and ignored (begin_teardown guard) so a second signal can't abort the running cleanup. - run_recipe_ci.main(): guards installed first thing, before any abra call/lock; both teardown finally: blocks (cold + quick) mark begin_teardown(). - .drone.yml recipe-ci step: harness runs under setsid in its own process group; a trap forwards the step shell's TERM/EXIT to the whole group so drone cancel reaches the harness instead of leaking it (docs/concurrency.md §8.1). - PEP 446 note on the recipe-lock open(): the fd is non-inheritable, children never carry it. --- .drone.yml | 11 ++++- runner/harness/lifecycle.py | 1 + runner/harness/lifetime.py | 95 +++++++++++++++++++++++++++++++++++++ runner/run_recipe_ci.py | 9 ++++ 4 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 runner/harness/lifetime.py diff --git a/.drone.yml b/.drone.yml index 5c4c88b..d46adcc 100644 --- a/.drone.yml +++ b/.drone.yml @@ -70,4 +70,13 @@ steps: # build's custom params. CCCI_QUICK=1 makes run_recipe_ci take the opt-in fast lane (WC7); # absent => full cold (default). run_quick ignores STAGES (always upgrade+custom). - 'echo "recipe-ci: RECIPE=$RECIPE REF=$REF PR=$PR SRC=$SRC stages=$STAGES quick=${CCCI_QUICK:-0}"' - - cc-ci-run runner/run_recipe_ci.py + # P1 lock-lifetime hardening: run the harness in its own session/process group (setsid) and + # forward a drone cancel (TERM to this step shell) to the WHOLE group, so the harness's + # SIGTERM handler runs its teardown funnel instead of being leaked (the exec runner kills + # only the step shell, not the tree). PDEATHSIG inside the harness backstops the case where + # this shell dies without the trap firing. `wait` propagates the harness exit code. + - | + setsid cc-ci-run runner/run_recipe_ci.py & + PID=$! + trap 'kill -TERM -- "-$PID" 2>/dev/null' TERM EXIT + wait "$PID" diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 5c44c81..2346948 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -48,6 +48,7 @@ def acquire_recipe_lock(recipe: str): recipe is in flight. Returns the open lock file — the CALLER must keep a reference for the whole run; the lock is released only when the process exits and the fd closes.""" path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock") + # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. f = open(path, "w") # noqa: SIM115 — deliberately held for the lifetime of the run try: fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) diff --git a/runner/harness/lifetime.py b/runner/harness/lifetime.py new file mode 100644 index 0000000..4ad7f60 --- /dev/null +++ b/runner/harness/lifetime.py @@ -0,0 +1,95 @@ +"""Run-lifetime hardening (concurrency restructure P1). + +The concurrency model's invariant chain is: + + lock lifetime ⊆ harness process lifetime ⊆ drone step lifetime ⊆ 60-min hard deadline + +Locks are kernel flocks released on process exit, so the only thing that needs managing is the +PROCESS lifetime. Three guards, installed at run startup (before any abra call) by +`install_lifetime_guards()`: + + 1. `PR_SET_PDEATHSIG(SIGTERM)`: if the parent (the drone step shell) dies — cancel, runner + crash, host shutdown of the step — the kernel delivers SIGTERM to the harness, so a dead + build can never leak a running harness that holds locks. Paired with a ppid==1 re-check + AFTER the prctl: a parent that died BEFORE the prctl took effect would never trigger the + death signal, so a harness that finds itself already reparented refuses to run. + 2. SIGTERM handler: raise SystemExit so the run's `finally:` teardown funnel executes and the + process exits non-zero. Re-entrant deliveries during teardown are logged and IGNORED so a + second signal can't abort the cleanup the first one asked for (`begin_teardown()` guards + this; the run's own `finally:` blocks also call it so a signal landing mid-normal-teardown + can't abort that either). + 3. `signal.alarm(3600)`: self-imposed hard deadline. SIGALRM funnels into the same teardown + path with a distinct log line. Teardown time after the deadline is not alarm-bounded — + interrupting a teardown buys nothing; the janitor (flock probe) is the backstop if a + teardown wedges and the process is killed harder. +""" + +from __future__ import annotations + +import ctypes +import os +import signal +import sys + +HARD_DEADLINE_SECONDS = 60 * 60 + +_PR_SET_PDEATHSIG = 1 # linux/prctl.h + +_state = {"tearing_down": False} + + +def begin_teardown() -> None: + """Mark the teardown funnel as running. From here on SIGTERM/SIGALRM must NOT raise — it + would abort the very cleanup it asks for — so the handlers log and return instead. Called by + the handlers themselves before raising, and at the top of the run's `finally:` blocks.""" + _state["tearing_down"] = True + + +def _funnel_handler(log_line: str, exit_code: int): + """A signal handler that routes into the teardown funnel exactly once: log, then raise + SystemExit (propagates through the run's try/finally → teardown executes → non-zero exit). + While teardown is already running, further signals are logged and swallowed.""" + + def handler(signum: int, frame) -> None: # noqa: ARG001 + print(log_line, flush=True) + if _state["tearing_down"]: + print( + f"== signal {signum} during teardown — ignored (teardown continues, " + "exit stays non-zero) ==", + flush=True, + ) + return + begin_teardown() + raise SystemExit(exit_code) + + return handler + + +def install_lifetime_guards(deadline_seconds: int = HARD_DEADLINE_SECONDS) -> None: + """Install all three lifetime guards (see module docstring). Must run at harness startup, + before any abra call and before any lock is taken.""" + libc = ctypes.CDLL("libc.so.6", use_errno=True) + if libc.prctl(_PR_SET_PDEATHSIG, signal.SIGTERM, 0, 0, 0) != 0: + err = ctypes.get_errno() + raise OSError(err, f"prctl(PR_SET_PDEATHSIG, SIGTERM) failed: {os.strerror(err)}") + # The prctl is armed now — but only fires for a parent death AFTER this point. If the parent + # already died, we are reparented (ppid 1) and would never get the signal: refuse to run, an + # orphaned harness would hold locks/apps with nothing managing its lifetime. + if os.getppid() == 1: + sys.exit("parent died before prctl(PR_SET_PDEATHSIG) — refusing to run orphaned") + signal.signal( + signal.SIGTERM, + _funnel_handler( + "== SIGTERM received (drone cancel / parent death) — tearing down ==", + 128 + signal.SIGTERM, + ), + ) + minutes = deadline_seconds // 60 + signal.signal( + signal.SIGALRM, + _funnel_handler( + f"== run exceeded {minutes}-minute hard deadline — tearing down ==", + 128 + signal.SIGALRM, + ), + ) + signal.alarm(deadline_seconds) diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index c7e55da..5884e63 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -47,6 +47,7 @@ from harness import ( # noqa: E402 discovery, generic, lifecycle, + lifetime, naming, warm, warmsnap, @@ -658,6 +659,8 @@ def run_quick( results["upgrade"] = "fail" results["custom"] = "skip" finally: + # Teardown funnel running: further SIGTERM/SIGALRM are logged + ignored (lifetime.py). + lifetime.begin_teardown() # F2-11 skip count (read before deciding pass/fail) requires_deps_skipped = 0 try: @@ -821,6 +824,9 @@ def promote_canonical(recipe: str, head_ref: str | None) -> None: def main() -> int: + # P1 lock-lifetime hardening: PDEATHSIG + SIGTERM/SIGALRM teardown funnel + 60-min hard + # deadline, armed before ANY abra call or lock acquisition (see harness/lifetime.py). + lifetime.install_lifetime_guards() recipe = os.environ.get("RECIPE") if not recipe: print("RECIPE env is required", file=sys.stderr) @@ -1123,6 +1129,9 @@ def main() -> int: if op in stages: results[op] = "skip" finally: + # From here the teardown funnel runs: a SIGTERM/SIGALRM landing now is logged + ignored + # (lifetime.py) so a second signal can't abort the cleanup the first one asked for. + lifetime.begin_teardown() # Teardown the recipe under test FIRST, then deps in reverse declaration order. # Parent verify=False (Phase 1d): keep as-is so a parent residual doesn't mask a tier # failure. Dep teardown uses verify=True via teardown_deps (F2-5 fix); failures are From b302f3ab635c82dba9f2e60f3a6e964921ea07e4 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 10 Jun 2026 04:11:31 +0000 Subject: [PATCH 2/6] =?UTF-8?q?feat(harness):=20P2=20flock-probe=20janitor?= =?UTF-8?q?=20=E2=80=94=20the=20kernel=20flock=20IS=20the=20liveness=20ora?= =?UTF-8?q?cle?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - acquire_app_lock(domain): exclusive flock on /run/lock/cc-ci-app-.lock, taken in deploy_app exactly where register_run_app was (BEFORE app creation); blocks with a log line when another run of the same domain is in flight (double-!testme serialisation). The file object is retained in module-level _held_app_locks so GC can never close the fd and silently release the lock. mtime is touched at acquisition (lock age for the long-held flag). - janitor(): probes each candidate's lock (discovery unchanged: abra app ls + docker-service sweep vs RUN_APP_RE). Acquirable -> orphan -> teardown_app(verify=False) WHILE HOLDING the probe lock (a new same-domain run blocks until the reap finishes), then unlink before release. Held -> live run -> leave it; held >120min (2x hard deadline) -> warn, never steal. Stale unheld lockfiles with no app are unlinked on sight. Unreadable lockfile -> skip + log. - unlink/recreate race guard (both sides): after ANY acquisition, verify the locked fd still is the inode the path names (fstat vs stat); a waiter that won a just-unlinked inode retries on the live path, and a probe that won one skips (unlinking now would hit a newer run's file). - deleted: register_run_app, unregister_run_app, _run_owner_state, _registry_path, ACTIVE_RUN_DIR, CCCI_JANITOR_MAX_AGE + age fallback, _stack_age_seconds, pid-reuse guard. teardown_app no longer unregisters (release is process exit). janitor() takes no args now. - post-reboot: /run/lock is tmpfs -> lockfiles gone -> probe trivially acquires -> immediate reap (improvement over the old 2h age fallback). --- .drone.yml | 9 +- runner/harness/lifecycle.py | 253 ++++++++++++++++++++++-------------- 2 files changed, 160 insertions(+), 102 deletions(-) diff --git a/.drone.yml b/.drone.yml index d46adcc..d356171 100644 --- a/.drone.yml +++ b/.drone.yml @@ -37,10 +37,11 @@ steps: # # Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix) + # concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by -# the harness, not by serialisation: same-recipe runs serialise on a per-recipe flock -# (lifecycle.acquire_recipe_lock — the shared ~/.abra/recipes/ checkout is the conflict), -# and every run registers its app domain + pid in /run/cc-ci-active so the run-start janitor only -# reaps orphans whose owning run is DEAD (alive → never touched; unknown → age fallback, default 2h). +# the harness, not by serialisation: every run holds an exclusive flock on its app domain +# (/run/lock/cc-ci-app-.lock) for its whole process lifetime, the run-start janitor probes +# that lock to reap only orphans (held lock = live run, never touched), and same-recipe runs +# serialise on a per-recipe flock for the shared ~/.abra/recipes/ checkout +# (lifecycle.acquire_recipe_lock — removed by P3's per-run ABRA_DIR). See docs/concurrency.md. kind: pipeline type: exec name: recipe-ci diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 2346948..210a011 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -7,8 +7,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer from __future__ import annotations import contextlib -import datetime import fcntl +import glob import json import os import re @@ -18,7 +18,7 @@ import subprocess import time import urllib.request -from . import abra +from . import abra, lifetime GATEWAY_IP = "143.244.213.108" # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci) # A run app domain is "-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the @@ -31,22 +31,22 @@ class TeardownError(RuntimeError): # --- Concurrent-run safety (capacity=2) ------------------------------------------------------- -# Two cooperating mechanisms, both process-lifetime-scoped so SIGKILL can't leak a stale lock: -# 1. Per-recipe flock: ~/.abra/recipes/ is ONE shared working tree that fetch_recipe -# rm-rf's/reclones and the upgrade tier git-checkouts mid-run. Concurrent runs of the SAME -# recipe would corrupt each other's deploy tree (observed: immich builds 229/230 deployed a -# tree missing its config), so they serialise on an exclusive flock; different recipes run in -# parallel. The kernel drops a flock when the holder dies, however it dies. -# 2. Active-run registry: each run registers its app domain + pid before creating the app, so the -# janitor can tell a live concurrent run from a crashed run's orphan (see janitor()). +# ONE mechanism, process-lifetime-scoped so SIGKILL can't leak a stale claim: every run holds an +# exclusive kernel flock on its app DOMAIN (/run/lock/cc-ci-app-.lock) for the whole run. +# A held lock implies a live owner — the kernel releases a flock when the holding process dies, +# however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held → +# leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never +# steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each +# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR) +# serialise on this app lock. See docs/concurrency.md. RECIPE_LOCK_DIR = "/run/lock" -ACTIVE_RUN_DIR = "/run/cc-ci-active" def acquire_recipe_lock(recipe: str): - """Take the per-recipe exclusive lock; blocks (with a log line) if another run of the same - recipe is in flight. Returns the open lock file — the CALLER must keep a reference for the - whole run; the lock is released only when the process exits and the fd closes.""" + """Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes + checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and + with it this lock, structurally unnecessary); until then the caller keeps the returned file + alive for the whole run and release is implicit at process exit.""" path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock") # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. f = open(path, "w") # noqa: SIM115 — deliberately held for the lifetime of the run @@ -63,39 +63,55 @@ def acquire_recipe_lock(recipe: str): return f -def _registry_path(domain: str) -> str: - return os.path.join(ACTIVE_RUN_DIR, domain) +# Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the +# caller drops the returned file object, GC would close the fd and silently release the lock — +# this list is the lock's owner of record. Never cleared; release is process exit. +_held_app_locks: list = [] -def register_run_app(domain: str) -> None: - """Record this process as the live owner of a run app (called BEFORE the app is created, so a - concurrent run's janitor can never observe the app without its registration).""" - with contextlib.suppress(OSError): - os.makedirs(ACTIVE_RUN_DIR, exist_ok=True) - with open(_registry_path(domain), "w") as f: - f.write(str(os.getpid())) +def _app_lock_dir() -> str: + """The app-domain lockfile dir. /run/lock (tmpfs: a reboot clears locks AND lockfiles, so + post-reboot apps probe as orphans and are reaped immediately). Env-overridable so the + tests/concurrency suite (and its helper subprocesses) can use a sandbox dir.""" + return os.environ.get("CCCI_APP_LOCK_DIR", "/run/lock") -def unregister_run_app(domain: str) -> None: - with contextlib.suppress(OSError): - os.remove(_registry_path(domain)) +def _app_lock_path(domain: str) -> str: + return os.path.join(_app_lock_dir(), f"cc-ci-app-{domain}.lock") -def _run_owner_state(domain: str) -> str: - """'alive' if the registered owner is a live run_recipe_ci process, 'dead' if registered but - gone (definite orphan), 'unknown' if never registered (pre-registry code or post-reboot).""" - try: - with open(_registry_path(domain)) as f: - pid = int(f.read().strip()) - except (OSError, ValueError): - return "unknown" - try: - with open(f"/proc/{pid}/cmdline", "rb") as f: - cmdline = f.read().decode(errors="replace").replace("\0", " ") - except OSError: - return "dead" - # Guard against pid reuse: the owner must still look like a harness run. - return "alive" if "run_recipe_ci" in cmdline else "dead" +def acquire_app_lock(domain: str): + """Take the per-app-domain exclusive lock; blocks (with a log line) if another run of the + same domain is in flight (double-!testme serialisation). Returns the open lock file, which is + ALSO retained in _held_app_locks so the flock lives exactly as long as the process. + + Unlink/recreate race guard: the janitor unlinks a reaped orphan's lockfile while holding its + flock, so a waiter blocked on the OLD inode can win a lock no later opener can observe (a new + open() at the path creates a FRESH inode). After every acquisition, verify the locked fd is + still the file at the path (st_ino match); if not, drop it and retry on the live path.""" + path = _app_lock_path(domain) + waited = False + while True: + # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. + f = open(path, "a") # noqa: SIM115 — deliberately held for the rest of the process + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + if not waited: + print(f"== app lock: another run of {domain} is in flight — waiting ==", flush=True) + waited = True + fcntl.flock(f, fcntl.LOCK_EX) + try: + if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino: + break # we hold the lock on the inode the path names — done + except FileNotFoundError: + pass + f.close() # locked a stale (unlinked) inode — retry on the live path + os.utime(f.fileno()) # mtime = acquisition time = lock age (janitor's long-held flag) + _held_app_locks.append(f) + if waited: + print(f"== app lock: acquired {path} ==", flush=True) + return f def _docker_names(kind: str, stack: str) -> list[str]: @@ -117,31 +133,6 @@ def _residual(domain: str) -> dict: } -def _stack_age_seconds(stack: str) -> float | None: - """Age of the stack's oldest service, or None if not present.""" - svcs = _docker_names("service", stack) - if not svcs: - return None - oldest = None - for s in svcs: - p = subprocess.run( - ["docker", "service", "inspect", s, "--format", "{{.CreatedAt}}"], - capture_output=True, - text=True, - ) - ts = p.stdout.strip() - try: - # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> take the leading 19 chars - dt = datetime.datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace( - tzinfo=datetime.UTC - ) - except ValueError: - continue - age = (datetime.datetime.now(datetime.UTC) - dt).total_seconds() - oldest = age if oldest is None else max(oldest, age) - return oldest - - def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]: """Per-recipe extra .env keys, applied at every deploy (install + upgrade's old_app) so a recipe with multi-domain / config needs is enrolled with NO shared-harness change (D5/M6.5). A recipe @@ -279,9 +270,10 @@ def deploy_app( past the 900s default. abra's INTERNAL TIMEOUT (recipe's TIMEOUT env, default 300s) is set via EXTRA_ENV; this is the Python subprocess wrapper's timeout so abra doesn't get SIGKILLed mid-deploy.""" _record_deploy() - # Register BEFORE the app exists: a concurrent run's janitor must never see this app without - # its live-owner registration (it would reap an in-flight deploy). - register_run_app(domain) + # Lock BEFORE the app exists: a concurrent run's janitor must never see this app without a + # held app lock (it would probe it as an orphan and reap an in-flight deploy). Also the + # double-!testme serialisation point: a second run of the same domain blocks here. + acquire_app_lock(domain) abra.app_config_remove(domain) # clear any stale .env from a prior crashed run abra.app_new(recipe, domain, version=version, secrets=secrets) # A pinned version must actually deploy that version: check the recipe out to the tag so the @@ -720,23 +712,84 @@ def teardown_app(domain: str, verify: bool = True) -> None: residual = _residual(domain) if any(residual.values()): raise TeardownError(f"teardown left residual for {domain}: {residual}") - # The app is gone — drop its active-run registration (janitor() also clears it when reaping). - unregister_run_app(domain) + # No unregistration step: the app lock releases implicitly at process exit. The clean run's + # leftover lockfile (unheld) is unlinked on sight by the next janitor's stale-lockfile sweep. -def janitor(max_age_seconds: int | None = None) -> None: - """Reap orphaned run apps from crashed/rebooted runs. Matches the real naming scheme. Safe under - CONCURRENT runs (capacity=2): every harness run registers its app in the active-run registry - (register_run_app), so the janitor distinguishes the three cases instead of using age alone: - - registered + owner run_recipe_ci process ALIVE -> in-flight concurrent run: never reap - - registered + owner DEAD (crashed/SIGKILLed run) -> definite orphan: reap immediately - - no registry entry (pre-registry code, reboot) -> fall back to the age threshold - Reaps via docker primitives so it works even when the .env is gone (A2/A3). Age fallback default - 2h, env-overridable via CCCI_JANITOR_MAX_AGE.""" - import os +# A lock held longer than 2x the 60-min hard deadline can only be a leaked run (the deadline +# bounds every healthy run). Flag it for a human — NEVER steal a held lock. +LONG_HELD_LOCK_SECONDS = 2 * lifetime.HARD_DEADLINE_SECONDS - if max_age_seconds is None: - max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200")) + +def _probe_and_reap(domain: str) -> None: + """Probe one run app's lock; reap iff nobody holds it (kernel-guaranteed orphan). + + Reaping happens WHILE HOLDING the probe lock, closing the janitor-vs-new-run race: a new run + of the same domain blocks in acquire_app_lock until the reap finishes, so a fresh app never + coexists with a half-reaped one. The lockfile is unlinked before release (still holding the + lock); a waiter that blocked on the unlinked inode re-checks identity and retries. Two racing + janitors arbitrate on the same flock: one reaps, the other sees 'held' and leaves — + teardown_app(verify=False) is idempotent either way.""" + path = _app_lock_path(domain) + try: + # PEP 446: non-inheritable fd, same as acquire_app_lock. + f = open(path, "a") # noqa: SIM115 — closed in the finally below, lock released with it + except OSError as e: + print(f"!! janitor: cannot open lockfile {path} ({e}) — skipping {domain}", flush=True) + return + try: + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + # Held -> live run. Never steal; flag if it has been held implausibly long. + try: + held_for = time.time() - os.stat(path).st_mtime + except OSError: + held_for = 0 + if held_for > LONG_HELD_LOCK_SECONDS: + print( + f"!! lock for {domain} held >{LONG_HELD_LOCK_SECONDS // 60}min — possible " + "leaked run; inspect with lslocks", + flush=True, + ) + else: + print( + f" janitor: {domain} lock held — live concurrent run, leaving it", flush=True + ) + return + # Acquired — but only the inode the PATH names counts (another janitor may have reaped + # and unlinked this inode while we raced; a lock on an unlinked inode protects nothing + # and unlinking the path now would delete a NEWER run's lockfile). + try: + if os.fstat(f.fileno()).st_ino != os.stat(path).st_ino: + return + except FileNotFoundError: + return + # Orphan: no live owner (the kernel released the lock when the owner died). Reap while + # holding the probe lock, then unlink the lockfile before releasing. + print(f" janitor: {domain} lock acquirable — orphan, reaping", flush=True) + with contextlib.suppress(Exception): + teardown_app(domain, verify=False) + with contextlib.suppress(OSError): + os.unlink(path) + finally: + f.close() + + +def janitor() -> None: + """Reap orphaned run apps from crashed/rebooted runs; the kernel flock is the only liveness + oracle. For every candidate run app, probe its app-domain lock (LOCK_NB): + + acquirable -> nobody holds it -> orphan -> reap under the probe lock + unlink lockfile + held -> live concurrent run -> leave it (warn if held >2x the hard deadline) + + Candidate discovery is unchanged: `abra app ls` + a docker-service sweep (catches stacks + whose .env is already gone), both matched against RUN_APP_RE — warm/canonical apps never + match and are never probed. Post-reboot, /run/lock (tmpfs) is empty, so every surviving app + probes as an orphan and is reaped immediately (no age threshold). Stale lockfiles with no + app behind them are unlinked on sight. Degrades safely: an unreadable lockfile/dir is + skipped with a log line, never a crash. Reaps via docker primitives so it works even when + the .env is gone (A2/A3).""" seen = set() for app in abra.app_ls(): name = app.get("appName") or app.get("domain") or "" @@ -750,18 +803,22 @@ def janitor(max_age_seconds: int | None = None) -> None: seen.add(f"{m.group(1)}.ci.commoninternet.net") for name in seen: - owner = _run_owner_state(name) - if owner == "alive": - print(f" janitor: {name} is a live concurrent run — leaving it", flush=True) - continue - if owner == "unknown": - # No registry entry (manual run on pre-registry code, or post-reboot): only the age - # threshold protects it, as before. - stack = _stack_name(name) - age = _stack_age_seconds(stack) - if age is not None and age < max_age_seconds: - continue # young and of unknown provenance; leave it - # owner == "dead" (a crashed/killed run's definite orphan) or old enough -> reap - with contextlib.suppress(Exception): - teardown_app(name, verify=False) - unregister_run_app(name) + _probe_and_reap(name) + + # Tidy /run/lock: a clean run's leftover lockfile is unheld and appless — unlink it (under + # its own probe lock, with the same identity check as above). + with contextlib.suppress(OSError): + for path in glob.glob(os.path.join(_app_lock_dir(), "cc-ci-app-*.lock")): + domain = os.path.basename(path)[len("cc-ci-app-") : -len(".lock")] + if domain in seen: + continue # handled (or deliberately left) above + with contextlib.suppress(OSError): + f = open(path, "a") # noqa: SIM115 — closed below, lock released with it + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino: + os.unlink(path) + except (BlockingIOError, FileNotFoundError): + pass # held (live run pre-deploy) or already gone — leave it + finally: + f.close() From 17ebdf39acf19448c8fb8b7d15b356fb9ef06610 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 10 Jun 2026 04:18:33 +0000 Subject: [PATCH 3/6] =?UTF-8?q?feat(harness):=20P3=20per-run=20ABRA=5FDIR?= =?UTF-8?q?=20=E2=80=94=20structural=20recipe-tree=20isolation,=20recipe?= =?UTF-8?q?=20flock=20deleted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - run_recipe_ci.setup_run_abra_dir(): builds //abra with servers/ and catalogue/ symlinked to the canonical ~/.abra (app .env files keep landing in the shared canonical path, so janitor discovery and env-based teardown are unchanged; per-domain filenames + the P2 app-domain lock prevent write conflicts) and a FRESH empty recipes/ — each run clones + checkouts its own recipe trees. Exported as $ABRA_DIR (honored by the abra CLI, verified on-host) before ANY abra call. Manual runs get manual- isolation. - fetch_recipe(): plain clone into $ABRA_DIR/recipes/ — no shared-tree rm-rf, no lock. CCCI_SKIP_FETCH=1 now copies the canonically-staged clone into the per-run tree (same staging workflow, run reads staged state). - abra.abra_dir()/recipe_dir(): single resolution rule ($ABRA_DIR else ~/.abra), used by recipe_checkout, has_lightweight_version_tags, recipe_head_commit, recipe_versions, generic._recipe_dir, lifecycle.prepull_images, snapshot_recipe_tests, and warm_reconcile._recipe_dir (which keeps the canonical default for its own systemd runs but follows the per-run tree when imported by promote_canonical inside a run). - deleted: lifecycle.acquire_recipe_lock, RECIPE_LOCK_DIR, the main() call site and the must-lock-before-fetch ordering rule. - tests/{ghost,discourse}/install_steps.sh: RECIPE_DIR resolves ${ABRA_DIR:-$HOME/.abra} so the compose.ccci.yml overlay lands in the tree the run actually deploys from (mechanical path fix required by per-run trees; no assertion/gate touched — see DECISIONS.md). - .drone.yml comments updated (HOME=/root rationale now via the servers symlink). --- .drone.yml | 14 +++---- runner/harness/abra.py | 32 ++++++++------ runner/harness/generic.py | 2 +- runner/harness/lifecycle.py | 34 +++------------ runner/run_recipe_ci.py | 72 +++++++++++++++++++++++++------- runner/warm_reconcile.py | 8 +++- tests/discourse/install_steps.sh | 4 +- tests/ghost/install_steps.sh | 4 +- 8 files changed, 103 insertions(+), 67 deletions(-) diff --git a/.drone.yml b/.drone.yml index d356171..9908c8b 100644 --- a/.drone.yml +++ b/.drone.yml @@ -39,9 +39,8 @@ steps: # concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by # the harness, not by serialisation: every run holds an exclusive flock on its app domain # (/run/lock/cc-ci-app-.lock) for its whole process lifetime, the run-start janitor probes -# that lock to reap only orphans (held lock = live run, never touched), and same-recipe runs -# serialise on a per-recipe flock for the shared ~/.abra/recipes/ checkout -# (lifecycle.acquire_recipe_lock — removed by P3's per-run ABRA_DIR). See docs/concurrency.md. +# that lock to reap only orphans (held lock = live run, never touched), and recipe working trees +# are per-run ($ABRA_DIR/recipes — no shared checkout, no recipe lock). See docs/concurrency.md. kind: pipeline type: exec name: recipe-ci @@ -61,10 +60,11 @@ steps: - name: ci environment: STAGES: install,upgrade,backup,restore,custom - # The exec runner points HOME at a per-build workspace; force it to /root so abra finds its - # server config + recipes under /root/.abra (as the manual M4/M5 runs did). Safe with - # capacity=2: app names are unique per (recipe,pr,ref) and same-recipe runs serialise on the - # per-recipe flock, so concurrent builds never touch the same recipe checkout or app. + # The exec runner points HOME at a per-build workspace; force it to /root so abra's server + # config is found via the per-run ABRA_DIR's servers/ symlink -> /root/.abra/servers. + # Recipe trees are PER-RUN ($ABRA_DIR/recipes, exported by run_recipe_ci before any abra + # call), so concurrent builds never share a recipe checkout; app .env files are per-domain + # in the shared canonical servers/ path, guarded by the app-domain flock. HOME: /root commands: # RECIPE/REF/PR/SRC (+ CCCI_QUICK for `!testme --quick`) are injected as env vars from the diff --git a/runner/harness/abra.py b/runner/harness/abra.py index 1c92066..b9d6454 100644 --- a/runner/harness/abra.py +++ b/runner/harness/abra.py @@ -10,6 +10,7 @@ Bakes in the known abra gotchas (re-verify per installed abra version, currently from __future__ import annotations import json +import os import subprocess ABRA = "abra" @@ -19,6 +20,20 @@ class AbraError(RuntimeError): pass +def abra_dir() -> str: + """abra's state dir, resolved the same way the abra CLI resolves it: $ABRA_DIR if set, else + ~/.abra. Inside a CI run, run_recipe_ci exports a PER-RUN $ABRA_DIR (fresh recipes/, shared + servers/+catalogue/ symlinks) before any abra call, so every helper here and every abra + subprocess agree on the same tree; outside a run (warm_reconcile's systemd timer, manual use) + both fall back to the canonical /root/.abra.""" + return os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra") + + +def recipe_dir(recipe: str) -> str: + """The current ABRA_DIR's working tree for a recipe (per-run inside a CI run).""" + return os.path.join(abra_dir(), "recipes", recipe) + + def _run_pty( args: list[str], timeout: int = 900, check: bool = True ) -> subprocess.CompletedProcess: @@ -77,9 +92,7 @@ def recipe_checkout(recipe: str, version: str) -> None: a chaos (`-C`) deploy ignores ENV VERSION and uses the current checkout — together that silently deployed LATEST for a 'previous-version' base, making the upgrade a no-op (Adversary F1d-2). With this checkout + a non-chaos deploy, a pinned deploy genuinely deploys that version.""" - import os - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) # -f (force): the version-pinning checkout must yield the EXACT ref tree. Without it, a cc-ci # install_steps-provided overlay (e.g. discourse's compose.ccci.yml, copied into the pinned base) # is an UNTRACKED file that collides with the same path TRACKED in a later ref, and @@ -100,9 +113,7 @@ def has_lightweight_version_tags(recipe: str) -> bool: 'reference not found'.) The caller (deploy_app) uses this to fall back to a chaos base deploy (which skips lint and deploys the explicitly-checked-out pinned version — see lifecycle.deploy_app). Read-only: just `git tag` + `cat-file -t`; no fetch/mutation, so it can't trigger abra's revert.""" - import os - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) tags = subprocess.run( ["git", "-C", path, "tag", "-l"], capture_output=True, text=True ).stdout.split() @@ -231,9 +242,7 @@ def recipe_head_commit(recipe: str) -> str | None: """The current HEAD commit of the recipe checkout — captured right after fetch (the PR head, or the catalogue current) so the upgrade tier can re-checkout it for the chaos redeploy after the prev-tag base deploy reset the working tree (HC1).""" - import os - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) proc = subprocess.run(["git", "-C", path, "rev-parse", "HEAD"], capture_output=True, text=True) out = proc.stdout.strip() return out or None @@ -241,10 +250,7 @@ def recipe_head_commit(recipe: str) -> str | None: def recipe_versions(recipe: str) -> list[str]: """Published versions of a recipe, oldest→newest (from the recipe git tags).""" - import os - import subprocess - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) proc = subprocess.run( ["git", "-C", path, "tag", "--sort=creatordate"], capture_output=True, text=True ) diff --git a/runner/harness/generic.py b/runner/harness/generic.py index f42b896..d468dbf 100644 --- a/runner/harness/generic.py +++ b/runner/harness/generic.py @@ -25,7 +25,7 @@ _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE) def _recipe_dir(recipe: str) -> str: - return os.path.expanduser(f"~/.abra/recipes/{recipe}") + return abra.recipe_dir(recipe) # the per-run tree inside a CI run ($ABRA_DIR) def backup_capable(recipe: str, meta: dict | None = None) -> bool: diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 210a011..10a1d4b 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -37,31 +37,9 @@ class TeardownError(RuntimeError): # however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held → # leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never # steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each -# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR) -# serialise on this app lock. See docs/concurrency.md. -RECIPE_LOCK_DIR = "/run/lock" - - -def acquire_recipe_lock(recipe: str): - """Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes - checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and - with it this lock, structurally unnecessary); until then the caller keeps the returned file - alive for the whole run and release is implicit at process exit.""" - path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock") - # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. - f = open(path, "w") # noqa: SIM115 — deliberately held for the lifetime of the run - try: - fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) - except BlockingIOError: - print( - f"== recipe lock: another {recipe} run is in flight — waiting for {path} " - "(shared ~/.abra/recipes checkout) ==", - flush=True, - ) - fcntl.flock(f, fcntl.LOCK_EX) - print(f"== recipe lock: acquired {path} ==", flush=True) - return f - +# run deploys from its own per-run ABRA_DIR — there is no shared recipe tree and no recipe lock), +# and same-domain runs (double-!testme of one PR) serialise on this app lock. +# See docs/concurrency.md. # Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the # caller drops the returned file object, GC would close the fd and silently release the lock — @@ -209,9 +187,9 @@ def prepull_images(recipe: str, domain: str) -> None: app-INIT time (slow-init apps like collabora/immich still need their recipe healthcheck/READY_PROBE). Best-effort on resolution failure (skip + let the deploy pull as usual); HARD-fails on a real pull error (don't mask it).""" - import os - - recipe_dir = os.path.expanduser(f"~/.abra/recipes/{recipe}") + recipe_dir = abra.recipe_dir(recipe) # per-run tree inside a CI run + # The app .env lives in the CANONICAL servers path (the per-run ABRA_DIR's servers/ is a + # symlink to it, so abra and this path agree on the same file). env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env") if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path): print(f" prepull: recipe dir or .env missing for {recipe} — skipping", flush=True) diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index 5884e63..9f8f539 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -138,18 +138,62 @@ def _gitea_token() -> str | None: return tok or None +def setup_run_abra_dir() -> str: + """P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees. + + `//abra/` with: + servers/ -> symlink to the canonical ~/.abra/servers. App .env files land in the shared + canonical path, so janitor discovery (`abra app ls`) and env-based teardown + work unchanged from any process; per-domain filenames + the app-domain lock + prevent write conflicts. + catalogue/ -> symlink to the canonical ~/.abra/catalogue (read-mostly). + recipes/ fresh + empty — THE isolation that matters: each run clones and git-checkouts + its own recipe trees, so concurrent runs (same recipe included) can never + corrupt each other's deploy tree. Replaces the per-recipe flock. + Exported as $ABRA_DIR — honored by the abra CLI and by every harness path helper + (abra.abra_dir()) — BEFORE any abra call. Rides along the existing run-dir retention.""" + canonical = os.path.expanduser("~/.abra") + rid = results_mod.run_id() + if rid == "manual": + rid = f"manual-{os.getpid()}" # two concurrent hand-runs must not share a tree + run_abra_dir = os.path.join(results_mod.runs_dir(), rid, "abra") + os.makedirs(os.path.join(run_abra_dir, "recipes"), exist_ok=True) + for shared in ("servers", "catalogue"): + link = os.path.join(run_abra_dir, shared) + if not os.path.islink(link): + os.symlink(os.path.join(canonical, shared), link) + os.environ["ABRA_DIR"] = run_abra_dir + print( + f"== per-run ABRA_DIR: {run_abra_dir} (servers/catalogue -> canonical; fresh recipes/) ==", + flush=True, + ) + return run_abra_dir + + def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None: - """Make the recipe available at the code under test. If SRC+REF point at the mirror PR, + """Make the recipe available at the code under test in THIS RUN's recipe tree + ($ABRA_DIR/recipes/): a plain clone — no locking needed, no rm-rf of any shared + state (the rm below only clears this run's own leftovers, e.g. a janitor-triggered + `abra app ls` auto-clone or a Drone build-number reuse). If SRC+REF point at the mirror PR, clone it at that ref; otherwise fetch the catalogue copy. Private mirror repos need the bot token — passed via a per-command http.extraHeader (not persisted in .git/config, not printed).""" - recipes_dir = os.path.expanduser("~/.abra/recipes") - os.makedirs(recipes_dir, exist_ok=True) - dest = os.path.join(recipes_dir, recipe) - # CCCI_SKIP_FETCH=1: use the local recipe clone as-is (lets a test/Adversary stage a fake/broken - # ref — e.g. a simulated broken PR head for the --quick rollback proof — without it being clobbered - # by a re-fetch). Never set in production CI. + dest = abra.recipe_dir(recipe) + os.makedirs(os.path.dirname(dest), exist_ok=True) + # CCCI_SKIP_FETCH=1: use the locally STAGED recipe clone as-is (lets a test/Adversary stage a + # fake/broken ref — e.g. a simulated broken PR head for the --quick rollback proof — without it + # being clobbered by a re-fetch). Staging happens in the canonical ~/.abra/recipes/; + # copy it into the per-run tree so the rest of the run reads the staged state. Never set in + # production CI. if os.environ.get("CCCI_SKIP_FETCH") == "1": - print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True) + canonical = os.path.expanduser(f"~/.abra/recipes/{recipe}") + subprocess.run(["rm", "-rf", dest], check=False) + if os.path.isdir(canonical): + shutil.copytree(canonical, dest, symlinks=True) + print( + f"[fetch] CCCI_SKIP_FETCH=1 — using staged {recipe} clone as-is " + f"(copied {canonical} -> per-run tree)", + flush=True, + ) return if src and ref: url = f"https://git.autonomic.zone/{src}.git" @@ -178,7 +222,7 @@ def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None: def snapshot_recipe_tests(recipe: str) -> str | None: """Copy the recipe-shipped tests/ to a stable temp dir, immune to abra re-checking-out the recipe to a version tag during the run. Returns the snapshot path, or None if no tests/.""" - src = os.path.expanduser(f"~/.abra/recipes/{recipe}/tests") + src = os.path.join(abra.recipe_dir(recipe), "tests") if not os.path.isdir(src): return None has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile( @@ -841,12 +885,10 @@ def main() -> int: print( f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}" ) - # Concurrent-run safety: runs of the SAME recipe serialise on a per-recipe flock — they share - # ONE ~/.abra/recipes/ working tree which fetch_recipe (below) rm-rf's/reclones and the - # upgrade tier git-checkouts mid-run. Must be taken BEFORE fetch_recipe. Different recipes run - # in parallel (capacity=2). The reference must stay alive for the whole run: the kernel drops - # the flock when the fd closes (including on any crash/SIGKILL — no stale-lock failure mode). - _recipe_lock = lifecycle.acquire_recipe_lock(recipe) # noqa: F841 + # Concurrent-run safety is structural: this run's recipe trees live in its own ABRA_DIR + # (exported here, before ANY abra call), so no recipe-tree lock exists; same-DOMAIN runs + # serialise on the app-domain flock taken in deploy_app (see docs/concurrency.md). + setup_run_abra_dir() fetch_recipe(recipe, ref, src) # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test # (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py index d9e58b7..d41fef0 100644 --- a/runner/warm_reconcile.py +++ b/runner/warm_reconcile.py @@ -199,7 +199,13 @@ def _run(cmd, timeout=120, check=False): def _recipe_dir(recipe: str) -> str: - return os.path.expanduser(f"~/.abra/recipes/{recipe}") + # Resolve like the abra CLI does: $ABRA_DIR (the per-run tree when imported by a CI run, + # e.g. promote_canonical) else the canonical ~/.abra (this module's own systemd-timer runs, + # which set no ABRA_DIR). Keeps fetch_recipe (an `abra` subprocess) and the git readers + # below pointed at the SAME tree in both contexts. + return os.path.join( + os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra"), "recipes", recipe + ) def recipe_tags(recipe: str) -> list[str]: diff --git a/tests/discourse/install_steps.sh b/tests/discourse/install_steps.sh index 330a5cc..930e663 100755 --- a/tests/discourse/install_steps.sh +++ b/tests/discourse/install_steps.sh @@ -15,7 +15,9 @@ set -euo pipefail : "${CCCI_RECIPE:?missing CCCI_RECIPE}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}" +# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else +# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from. +RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}" if [ ! -d "$RECIPE_DIR" ]; then echo " discourse install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2 diff --git a/tests/ghost/install_steps.sh b/tests/ghost/install_steps.sh index ef10674..2c2dc50 100755 --- a/tests/ghost/install_steps.sh +++ b/tests/ghost/install_steps.sh @@ -15,7 +15,9 @@ set -euo pipefail : "${CCCI_RECIPE:?missing CCCI_RECIPE}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}" +# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else +# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from. +RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}" if [ ! -d "$RECIPE_DIR" ]; then echo " ghost install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2 From 91d3cc7e992b93dfce33d9cb603161ce66c4c10d Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 10 Jun 2026 04:19:35 +0000 Subject: [PATCH 4/6] =?UTF-8?q?chore(ci):=20P4=20config=20cleanup=20?= =?UTF-8?q?=E2=80=94=20DRONE=5FRUNNER=5FCAPACITY=20is=20the=20single=20con?= =?UTF-8?q?currency=20knob?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove concurrency.limit from the recipe-ci pipeline (.drone.yml): it duplicated DRONE_RUNNER_CAPACITY (nix/modules/drone-runner.nix maxTests) and the two had to be kept in step by hand (docs/concurrency.md §8.6). maxTests comment updated to state it is the single knob and to describe the new safety model. --- .drone.yml | 4 ++-- nix/modules/drone-runner.nix | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.drone.yml b/.drone.yml index 9908c8b..d7c3674 100644 --- a/.drone.yml +++ b/.drone.yml @@ -53,8 +53,8 @@ trigger: event: - custom -concurrency: - limit: 2 +# NB deliberately NO `concurrency.limit` here: DRONE_RUNNER_CAPACITY (nix/modules/drone-runner.nix +# maxTests) is the single concurrency knob (P4 — two knobs in two files drifted). steps: - name: ci diff --git a/nix/modules/drone-runner.nix b/nix/modules/drone-runner.nix index b6f91b4..3c14f01 100644 --- a/nix/modules/drone-runner.nix +++ b/nix/modules/drone-runner.nix @@ -8,18 +8,18 @@ { pkgs, config, lib, ... }: let # MAX_TESTS (plan §4.2/§4.3 resource safety): max CI builds the exec runner runs at once. Drone - # queues the rest in its native pending-build queue (no custom queue). THE concurrency cap that - # bounds how many test apps can be live at once. + # queues the rest in its native pending-build queue (no custom queue). THE SINGLE concurrency + # knob — nothing else caps recipe-ci parallelism (the .drone.yml concurrency.limit was removed: + # one knob, one place). Bounds how many test apps can be live at once. # # Raised to 2 (operator request 2026-06-09) so two recipes can be tested in parallel (e.g. immich # and plausible under active development at once). Verified safe on the current node (Hetzner cpx22, # ~7.6 GiB / 4 vCPU — NOTE: smaller than the original 28 GiB this was written for): a full immich CI # stack measured ~1 GiB (server+ML+pg+redis) with multiple GiB free, so two concurrent recipes fit. - # The concurrency PRECONDITION holds: the run-start janitor is age-based (default 2h) + run-app-name - # scoped, so it never reaps a concurrent in-flight run (harness.lifecycle.janitor). TRADE-OFF: with - # capacity>1 a SIGKILL'd build (no teardown) leaves an orphan the run-start sweep can't reap - # immediately (it might be a live run) — bounded instead by the 2h janitor + the /upgrade-all - # start/end reap + sweep-orphans. Revert to "1" if OOM / disk-I/O contention is observed under load. + # Concurrent-run safety is the harness's job at ANY capacity (docs/concurrency.md): per-run + # ABRA_DIR recipe trees, per-app-domain flocks, and a flock-probe janitor that reaps a crashed + # build's orphan immediately (held lock = live run, never touched). Revert to "1" if OOM / + # disk-I/O contention is observed under load. maxTests = "2"; in { From 84d90fb6552a71dbd44adb23a7767aac8289f270 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 10 Jun 2026 04:29:36 +0000 Subject: [PATCH 5/6] =?UTF-8?q?test(concurrency):=20real-kernel=20suite=20?= =?UTF-8?q?for=20the=20restructured=20model=20=E2=80=94=2020=20tests,=2019?= =?UTF-8?q?=20plan=20cases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tests/concurrency/ — NOT in the default `pytest tests/unit` gate; run explicitly with `pytest tests/concurrency -q`. flock/prctl/alarm are never mocked: helper subprocesses (helpers.py) hold real locks and install the real lifetime guards; locks live in a per-test tmp dir via CCCI_APP_LOCK_DIR; every helper (and recorded grandchild) is reaped by fixture cleanup. - test_locks.py (cases 1-4): SIGKILL auto-release; LOCK_NB held/unheld semantics; PEP 446 fd-not-inherited (holder's child survives, lock still releases); same-domain second acquire blocks until first holder exits. - test_janitor.py (cases 5-12): orphan reaped once + lockfile unlinked; live holder never reaped + logged; new-run acquire blocks until a slow reap completes (reap-under-probe-lock); two overlapping janitors -> exactly one reaps (flock arbitration); reboot sim (no lockfile) reaps immediately with no age wait; >120min-held lock flagged 'possible leaked run' and NOT stolen; warm/canonical names never probed (no lockfile even created); directory-as-lockfile and missing lock dir degrade to skip+log, never crash. - test_lifetime.py (cases 13-16): PDEATHSIG (wrapper parent SIGKILL'd -> guarded child TERM'd, teardown marker, lock released); already-orphaned helper REFUSES to run (ppid race); 2s deadline alarm -> teardown + exit 142 + lock released; SIGTERM -> teardown + exit 143 + lock released. - test_abra_dir.py (cases 17-19 + 18b): per-run dir built + $ABRA_DIR exported before the first abra call (recording stub abra on PATH); two CONCURRENT same-recipe fetch+checkout flows into different ABRA_DIRs -> divergent correct trees, canonical staged clone untouched; .env written through the servers/ symlink lands in the canonical path (env_get/env_set agree); manual runs get pid-suffixed dirs. On cc-ci: pytest tests/concurrency -q -> 20 passed; tests/unit -> 138 passed; lint PASS. --- tests/concurrency/concutil.py | 108 +++++++++++++++++ tests/concurrency/conftest.py | 34 ++++++ tests/concurrency/helpers.py | 120 ++++++++++++++++++ tests/concurrency/test_abra_dir.py | 175 ++++++++++++++++++++++++++ tests/concurrency/test_janitor.py | 189 +++++++++++++++++++++++++++++ tests/concurrency/test_lifetime.py | 82 +++++++++++++ tests/concurrency/test_locks.py | 85 +++++++++++++ 7 files changed, 793 insertions(+) create mode 100644 tests/concurrency/concutil.py create mode 100644 tests/concurrency/conftest.py create mode 100644 tests/concurrency/helpers.py create mode 100644 tests/concurrency/test_abra_dir.py create mode 100644 tests/concurrency/test_janitor.py create mode 100644 tests/concurrency/test_lifetime.py create mode 100644 tests/concurrency/test_locks.py diff --git a/tests/concurrency/concutil.py b/tests/concurrency/concutil.py new file mode 100644 index 0000000..374e747 --- /dev/null +++ b/tests/concurrency/concutil.py @@ -0,0 +1,108 @@ +"""Shared utilities for the real-kernel concurrency suite (imported by the test modules; the +fixtures in conftest.py wrap these). No flock mocking anywhere — probes use real LOCK_NB.""" + +from __future__ import annotations + +import contextlib +import fcntl +import os +import signal +import subprocess +import sys +import time + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +from harness import lifecycle # noqa: E402 + +HELPERS = os.path.join(os.path.dirname(__file__), "helpers.py") +DOMAIN = "test-abc123.ci.commoninternet.net" # matches RUN_APP_RE + + +class HelperPool: + """Spawns helpers.py subprocesses and GUARANTEES their cleanup (incl. recorded grandchild + pids from `hold-with-child`/`wrapper` markers) — no leaked children in the test VM.""" + + def __init__(self, out_dir: str): + self.out_dir = out_dir + self.procs: list[subprocess.Popen] = [] + self.extra_pids: list[int] = [] + self._n = 0 + + def spawn(self, *args: str, env_extra: dict | None = None) -> tuple[subprocess.Popen, str]: + """Start `helpers.py `; returns (proc, marker_file).""" + self._n += 1 + out = os.path.join(self.out_dir, f"helper-{self._n}.out") + env = dict(os.environ, CCCI_HELPER_OUT=out, **(env_extra or {})) + p = subprocess.Popen( # noqa: S603 + [sys.executable, HELPERS, *args], + env=env, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + self.procs.append(p) + return p, out + + def track_pid(self, pid: int) -> None: + self.extra_pids.append(pid) + + def cleanup(self) -> None: + for p in self.procs: + if p.poll() is None: + p.kill() + with contextlib.suppress(subprocess.TimeoutExpired): + p.wait(timeout=10) + for pid in self.extra_pids: + with contextlib.suppress(OSError): + os.kill(pid, signal.SIGKILL) + + +def wait_marker(out: str, token: str, timeout: float = 15.0) -> str | None: + """Poll a helper's marker file for a line containing `token`; returns the line or None.""" + deadline = time.time() + timeout + while time.time() < deadline: + try: + with open(out) as f: + for line in f: + if token in line: + return line.strip() + except OSError: + pass + time.sleep(0.1) + return None + + +def lock_state(domain: str) -> str: + """'held' | 'free' | 'absent' for the domain's lockfile, probed with a REAL LOCK_NB.""" + path = lifecycle._app_lock_path(domain) # noqa: SLF001 + if not os.path.exists(path): + return "absent" + with open(path, "a") as f: + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + return "free" + except BlockingIOError: + return "held" + + +def wait_lock_state(domain: str, want: str, timeout: float = 10.0) -> str: + """Poll until lock_state(domain) == want (kernel release on process death is fast, but give + the scheduler room). Returns the final observed state.""" + deadline = time.time() + timeout + state = lock_state(domain) + while state != want and time.time() < deadline: + time.sleep(0.1) + state = lock_state(domain) + return state + + +def pid_alive(pid: int) -> bool: + return os.path.exists(f"/proc/{pid}") + + +def wait_pid_gone(pid: int, timeout: float = 15.0) -> bool: + deadline = time.time() + timeout + while time.time() < deadline: + if not pid_alive(pid): + return True + time.sleep(0.1) + return False diff --git a/tests/concurrency/conftest.py b/tests/concurrency/conftest.py new file mode 100644 index 0000000..ab7be03 --- /dev/null +++ b/tests/concurrency/conftest.py @@ -0,0 +1,34 @@ +"""Fixtures for the real-kernel concurrency suite (concurrency-restructure plan, 19 cases). + +NOT part of the default `pytest tests/unit` gate — run explicitly with `pytest tests/concurrency +-q` (docs/concurrency.md). Locks live in a per-test tmp dir (CCCI_APP_LOCK_DIR); helper +subprocesses hold REAL flocks / install the REAL prctl+signal guards and are always reaped in +fixture finalizers (no leaked children in the test VM). +""" + +from __future__ import annotations + +import os +import sys + +import pytest + +sys.path.insert(0, os.path.dirname(__file__)) +from concutil import HelperPool # noqa: E402 + + +@pytest.fixture +def lock_dir(tmp_path, monkeypatch): + """Sandbox lock dir, exported so BOTH this process's lifecycle calls and helper subprocesses + (which inherit os.environ) resolve their lockfiles here — never /run/lock.""" + d = tmp_path / "locks" + d.mkdir() + monkeypatch.setenv("CCCI_APP_LOCK_DIR", str(d)) + return str(d) + + +@pytest.fixture +def pool(tmp_path): + hp = HelperPool(str(tmp_path)) + yield hp + hp.cleanup() diff --git a/tests/concurrency/helpers.py b/tests/concurrency/helpers.py new file mode 100644 index 0000000..559801f --- /dev/null +++ b/tests/concurrency/helpers.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +"""Subprocess helpers for tests/concurrency — REAL kernel locks and the REAL lifetime guards in +separate processes (flock/prctl are never mocked; tests assert on actual kernel behavior). + +Invoked as: python3 helpers.py + +Env contract (set by the spawning test): + CCCI_APP_LOCK_DIR sandbox lock dir (never /run/lock in tests) + CCCI_HELPER_OUT marker file this helper APPENDS progress lines to (ACQUIRED/READY/...) + +Commands: + hold acquire the app lock, mark `ACQUIRED `, sleep forever + hold-with-child acquire the lock, spawn a plain sleeping subprocess child, mark + `ACQUIRED ` + `CHILD ` (PEP 446: the child must NOT + inherit the lock fd), sleep forever + guarded install the REAL lifetime guards (alarm=s), acquire the + lock, mark `READY`; when the teardown funnel runs (`finally:`), + mark `TEARDOWN` before exiting + wrapper spawn `guarded 3600` as MY child, mark `WRAPPED `, + sleep — the test kills me to prove PDEATHSIG TERMs the child + orphan-probe wait (bounded) until reparented (ppid==1), then install the + guards; mark `REFUSED` if they exit (expected) or `GUARDS_OK` + fetch-checkout run run_recipe_ci.fetch_recipe (the test sets CCCI_SKIP_FETCH=1 + + a per-"run" ABRA_DIR), git-checkout , mark + `RESULT ` +""" + +from __future__ import annotations + +import os +import subprocess +import sys +import time + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "runner")) +from harness import abra, lifecycle, lifetime # noqa: E402 + +OUT = os.environ.get("CCCI_HELPER_OUT") + + +def mark(line: str) -> None: + if OUT: + with open(OUT, "a") as f: + f.write(line + "\n") + f.flush() + print(line, flush=True) + + +def cmd_hold(domain: str) -> None: + lifecycle.acquire_app_lock(domain) + mark(f"ACQUIRED {time.time()}") + time.sleep(3600) + + +def cmd_hold_with_child(domain: str) -> None: + lifecycle.acquire_app_lock(domain) + child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(3600)"]) + mark(f"ACQUIRED {time.time()}") + mark(f"CHILD {child.pid}") + time.sleep(3600) + + +def cmd_guarded(domain: str, deadline: str) -> None: + lifetime.install_lifetime_guards(deadline_seconds=int(deadline)) + lifecycle.acquire_app_lock(domain) + mark("READY") + try: + time.sleep(3600) + finally: + mark("TEARDOWN") + + +def cmd_wrapper(domain: str) -> None: + p = subprocess.Popen( # noqa: S603 + [sys.executable, os.path.abspath(__file__), "guarded", domain, "3600"], + env=os.environ.copy(), + ) + mark(f"WRAPPED {p.pid}") + time.sleep(3600) + + +def cmd_orphan_probe() -> None: + # Our spawner exits immediately after fork; wait (bounded) until we are reparented so the + # prctl is installed with the parent ALREADY dead — the exact race the ppid check closes. + for _ in range(200): + if os.getppid() == 1: + break + time.sleep(0.05) + else: + mark("NEVER_REPARENTED") # e.g. a subreaper environment — test will fail visibly + return + try: + lifetime.install_lifetime_guards() + except SystemExit: + mark("REFUSED") + raise + mark("GUARDS_OK") + + +def cmd_fetch_checkout(recipe: str, ref: str) -> None: + import run_recipe_ci + + run_recipe_ci.fetch_recipe(recipe, None, None) + abra.recipe_checkout(recipe, ref) + head = abra.recipe_head_commit(recipe) + with open(os.path.join(abra.recipe_dir(recipe), "data.txt")) as f: + content = f.read().strip() + mark(f"RESULT {head} {content}") + + +if __name__ == "__main__": + cmd, *args = sys.argv[1:] + { + "hold": cmd_hold, + "hold-with-child": cmd_hold_with_child, + "guarded": cmd_guarded, + "wrapper": cmd_wrapper, + "orphan-probe": cmd_orphan_probe, + "fetch-checkout": cmd_fetch_checkout, + }[cmd](*args) diff --git a/tests/concurrency/test_abra_dir.py b/tests/concurrency/test_abra_dir.py new file mode 100644 index 0000000..9867fe1 --- /dev/null +++ b/tests/concurrency/test_abra_dir.py @@ -0,0 +1,175 @@ +"""Per-run ABRA_DIR isolation (concurrency-restructure plan, cases 17-19). Real directories, +real symlinks, real git — abra itself is replaced by a recording stub where a CLI call is +involved (case 17), because these cases test OUR dir/env plumbing, not abra.""" + +from __future__ import annotations + +import os +import stat +import subprocess +import sys + +sys.path.insert(0, os.path.dirname(__file__)) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +import run_recipe_ci # noqa: E402 +from concutil import wait_marker # noqa: E402 +from harness import abra # noqa: E402 + +RECIPE = "fakerecipe" + + +def _git(cwd, *args): + subprocess.run( + ["git", "-c", "user.email=t@t", "-c", "user.name=t", *args], + cwd=cwd, + check=True, + capture_output=True, + ) + + +def _make_fake_home(tmp_path): + """A fake $HOME with a canonical ~/.abra: servers/default + catalogue dirs, and a recipe git + repo with two tags whose data.txt differs (v1 -> 'one', v2 -> 'two', HEAD at v2).""" + home = tmp_path / "home" + (home / ".abra" / "servers" / "default").mkdir(parents=True) + (home / ".abra" / "catalogue").mkdir(parents=True) + repo = home / ".abra" / "recipes" / RECIPE + repo.mkdir(parents=True) + _git(repo, "init", "-q") + (repo / "data.txt").write_text("one\n") + _git(repo, "add", "data.txt") + _git(repo, "commit", "-qm", "v1") + _git(repo, "tag", "v1") + (repo / "data.txt").write_text("two\n") + _git(repo, "add", "data.txt") + _git(repo, "commit", "-qm", "v2") + _git(repo, "tag", "v2") + return home + + +def test_17_per_run_dir_built_and_exported_before_abra(tmp_path, monkeypatch): + """Case 17: setup_run_abra_dir builds the per-run dir correctly (servers/catalogue symlinks + resolve to the canonical tree, recipes/ empty + writable) and $ABRA_DIR is exported before + the first abra call — proven by a stub `abra` on PATH that records the env it saw.""" + home = _make_fake_home(tmp_path) + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs")) + monkeypatch.setenv("DRONE_BUILD_NUMBER", "777") + monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten") # so monkeypatch restores it + + d = run_recipe_ci.setup_run_abra_dir() + assert d == str(tmp_path / "runs" / "777" / "abra") + assert os.environ["ABRA_DIR"] == d + assert os.readlink(os.path.join(d, "servers")) == str(home / ".abra" / "servers") + assert os.readlink(os.path.join(d, "catalogue")) == str(home / ".abra" / "catalogue") + # symlinks RESOLVE (targets exist) and recipes/ is empty + writable + assert os.path.isdir(os.path.join(d, "servers", "default")) + assert os.path.isdir(os.path.join(d, "catalogue")) + assert os.listdir(os.path.join(d, "recipes")) == [] + probe = os.path.join(d, "recipes", ".write-probe") + open(probe, "w").close() + os.remove(probe) + # idempotent re-entry (Drone build-number retry): must not raise on existing symlinks + assert run_recipe_ci.setup_run_abra_dir() == d + + # stub abra records $ABRA_DIR at call time; fetch_recipe's catalogue branch invokes it + stub_dir = tmp_path / "bin" + stub_dir.mkdir() + log = tmp_path / "abra-env.log" + stub = stub_dir / "abra" + stub.write_text(f'#!/bin/sh\necho "$ABRA_DIR" >> {log}\nexit 0\n') + stub.chmod(stub.stat().st_mode | stat.S_IEXEC) + monkeypatch.setenv("PATH", f"{stub_dir}{os.pathsep}{os.environ['PATH']}") + monkeypatch.delenv("CCCI_SKIP_FETCH", raising=False) + run_recipe_ci.fetch_recipe(RECIPE, None, None) + assert log.read_text().strip() == d, "abra was called without the per-run ABRA_DIR exported" + + +def test_18_concurrent_same_recipe_fetch_no_cross_talk(tmp_path, monkeypatch, pool): + """Case 18: two CONCURRENT fetch+checkout flows of the SAME recipe into different ABRA_DIRs + produce two correct, divergent trees (v1 vs v2) — the old shared-tree corruption scenario, + now structurally safe with no lock. The canonical staged clone is untouched.""" + home = _make_fake_home(tmp_path) + canonical_repo = home / ".abra" / "recipes" / RECIPE + head_before = subprocess.run( + ["git", "-C", canonical_repo, "rev-parse", "HEAD"], capture_output=True, text=True + ).stdout.strip() + + runs = {} + for name, ref in (("runA", "v1"), ("runB", "v2")): + abra_dir = tmp_path / name / "abra" + abra_dir.mkdir(parents=True) + _, out = pool.spawn( + "fetch-checkout", + RECIPE, + ref, + env_extra={ + "HOME": str(home), + "ABRA_DIR": str(abra_dir), + "CCCI_SKIP_FETCH": "1", + }, + ) + runs[name] = (out, ref, abra_dir) + + expect = {"v1": "one", "v2": "two"} + for name, (out, ref, abra_dir) in runs.items(): + line = wait_marker(out, "RESULT", timeout=30) + assert line, f"{name} never produced a RESULT" + _, head, content = line.split() + assert content == expect[ref], f"{name}@{ref}: tree content {content!r}" + tree = abra_dir / "recipes" / RECIPE + assert (tree / "data.txt").read_text().strip() == expect[ref] + assert ( + head + == subprocess.run( + ["git", "-C", tree, "rev-parse", "HEAD"], capture_output=True, text=True + ).stdout.strip() + ) + + # the two trees genuinely diverge AND the canonical staged clone is untouched + a = (runs["runA"][2] / "recipes" / RECIPE / "data.txt").read_text() + b = (runs["runB"][2] / "recipes" / RECIPE / "data.txt").read_text() + assert a != b + head_after = subprocess.run( + ["git", "-C", canonical_repo, "rev-parse", "HEAD"], capture_output=True, text=True + ).stdout.strip() + assert head_after == head_before, "canonical clone must not be touched by per-run fetches" + + +def test_19_env_written_through_servers_symlink_lands_canonical(tmp_path, monkeypatch): + """Case 19: an app .env written through the per-run servers/ symlink (what abra does under + $ABRA_DIR) lands in the CANONICAL shared path — so janitor discovery and every + expanduser('~/.abra/servers/...') reader keep working unchanged.""" + home = _make_fake_home(tmp_path) + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs")) + monkeypatch.setenv("DRONE_BUILD_NUMBER", "778") + monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten") + d = run_recipe_ci.setup_run_abra_dir() + + domain = "test-abc123.ci.commoninternet.net" + via_symlink = os.path.join(d, "servers", "default", f"{domain}.env") + with open(via_symlink, "w") as f: + f.write("TYPE=fakerecipe:1.0.0\nDOMAIN=placeholder\n") + + canonical = home / ".abra" / "servers" / "default" / f"{domain}.env" + assert canonical.is_file(), ".env written via the symlink must land in the canonical path" + # the canonical-path readers/writers (abra.env_get/env_set use ~/.abra) see the same file + assert abra.env_get(domain, "TYPE") == "fakerecipe:1.0.0" + abra.env_set(domain, "DOMAIN", domain) + with open(via_symlink) as f: + assert f"DOMAIN={domain}" in f.read() + + +def test_18b_run_id_manual_fallback_is_per_process(tmp_path, monkeypatch): + """Companion to case 18: two concurrent MANUAL runs (no DRONE_BUILD_NUMBER) must not share an + abra dir either — the manual fallback is pid-suffixed.""" + home = _make_fake_home(tmp_path) + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs")) + monkeypatch.delenv("DRONE_BUILD_NUMBER", raising=False) + monkeypatch.delenv("CCCI_APP_DOMAIN", raising=False) + monkeypatch.delenv("CCCI_RUN_ID", raising=False) + monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten") + d = run_recipe_ci.setup_run_abra_dir() + assert f"manual-{os.getpid()}" in d diff --git a/tests/concurrency/test_janitor.py b/tests/concurrency/test_janitor.py new file mode 100644 index 0000000..bdf61b9 --- /dev/null +++ b/tests/concurrency/test_janitor.py @@ -0,0 +1,189 @@ +"""Janitor / flock-probe semantics (concurrency-restructure plan, cases 5-12). + +The janitor runs IN-PROCESS with its discovery monkeypatched (candidates injected via a stubbed +abra.app_ls + empty docker sweep) and teardown_app stubbed to record calls — but the LOCKS are +real kernel flocks, held by real helper subprocesses where a live owner is needed.""" + +from __future__ import annotations + +import os +import sys +import threading +import time + +sys.path.insert(0, os.path.dirname(__file__)) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +from concutil import DOMAIN, lock_state, wait_marker # noqa: E402 +from harness import lifecycle # noqa: E402 + + +def _inject_candidates(monkeypatch, domains): + """Point janitor discovery at exactly `domains`: abra lists them, docker sweep is empty. + teardown_app is stubbed to a recorder; returns the calls list.""" + calls = [] + monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": d} for d in domains]) + monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: []) + monkeypatch.setattr(lifecycle, "teardown_app", lambda d, verify=True: calls.append(d)) + return calls + + +def test_5_orphan_reaped_lockfile_unlinked(lock_dir, pool, monkeypatch): + """Case 5: an orphan (lockfile exists, no holder — its run was SIGKILL'd) is reaped exactly + once and its lockfile unlinked.""" + p, out = pool.spawn("hold", DOMAIN) + assert wait_marker(out, "ACQUIRED") + p.kill() + p.wait(timeout=10) + calls = _inject_candidates(monkeypatch, [DOMAIN]) + lifecycle.janitor() + assert calls == [DOMAIN], f"teardown calls: {calls} (expected exactly one)" + assert lock_state(DOMAIN) == "absent", "reaped orphan's lockfile must be unlinked" + + +def test_6_live_run_never_reaped(lock_dir, pool, monkeypatch, capsys): + """Case 6: a held lock (live helper) is never reaped and is logged as live.""" + p, out = pool.spawn("hold", DOMAIN) + assert wait_marker(out, "ACQUIRED") + calls = _inject_candidates(monkeypatch, [DOMAIN]) + lifecycle.janitor() + assert calls == [] + assert "live concurrent run" in capsys.readouterr().out + assert lock_state(DOMAIN) == "held" + + +def test_7_new_run_blocks_until_reap_finishes(lock_dir, pool, monkeypatch): + """Case 7: the janitor reaps WHILE HOLDING the probe lock, so a new run of the same domain + blocks in acquire_app_lock until the reap completes — no window where a fresh app coexists + with a half-reaped one.""" + # Make an orphan. + p, out = pool.spawn("hold", DOMAIN) + assert wait_marker(out, "ACQUIRED") + p.kill() + p.wait(timeout=10) + + state = {"teardown_end": None, "acquirer_out": None} + + def slow_teardown(domain, verify=True): + # While the janitor holds the probe lock mid-reap, a new run starts acquiring. + _, aout = pool.spawn("hold", DOMAIN) + state["acquirer_out"] = aout + time.sleep(2.0) + state["teardown_end"] = time.time() + + monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": DOMAIN}]) + monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: []) + monkeypatch.setattr(lifecycle, "teardown_app", slow_teardown) + lifecycle.janitor() + + line = wait_marker(state["acquirer_out"], "ACQUIRED", timeout=15) + assert line, "new run never acquired after the reap" + acquired_ts = float(line.split()[1]) + assert ( + acquired_ts >= state["teardown_end"] + ), f"new run acquired at {acquired_ts} BEFORE the reap finished at {state['teardown_end']}" + # The new run must hold a lock the next probe can SEE (fresh inode at the path). + assert lock_state(DOMAIN) == "held" + + +def test_8_two_janitors_exactly_one_reaps(lock_dir, pool, monkeypatch): + """Case 8: two concurrent janitors arbitrate on the probe flock — exactly one reaps (the + other sees 'held' and leaves). Teardown is slowed so the runs genuinely overlap.""" + p, out = pool.spawn("hold", DOMAIN) + assert wait_marker(out, "ACQUIRED") + p.kill() + p.wait(timeout=10) + + calls = [] + calls_lock = threading.Lock() + + def slow_teardown(domain, verify=True): + with calls_lock: + calls.append(domain) + time.sleep(2.0) + + monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": DOMAIN}]) + monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: []) + monkeypatch.setattr(lifecycle, "teardown_app", slow_teardown) + + barrier = threading.Barrier(2) + + def run_janitor(): + barrier.wait() + lifecycle.janitor() + + t1, t2 = threading.Thread(target=run_janitor), threading.Thread(target=run_janitor) + t1.start(), t2.start() + t1.join(timeout=30), t2.join(timeout=30) + assert calls == [DOMAIN], f"expected exactly one reap, got {calls}" + assert lock_state(DOMAIN) == "absent" + + +def test_9_reboot_lockfile_absent_reaped_immediately(lock_dir, monkeypatch): + """Case 9: post-reboot simulation — the app exists but its lockfile is gone (/run/lock is + tmpfs). The probe trivially acquires -> immediate reap, NO age threshold (improvement over + the old 2h fallback).""" + assert lock_state(DOMAIN) == "absent" + calls = _inject_candidates(monkeypatch, [DOMAIN]) + t0 = time.time() + lifecycle.janitor() + assert calls == [DOMAIN] + assert time.time() - t0 < 5, "reap must be immediate (no age wait)" + + +def test_10_long_held_lock_flagged_never_stolen(lock_dir, pool, monkeypatch, capsys): + """Case 10: a lock held with mtime older than 120min is flagged as a possible leaked run — + and NOT reaped (never steal a held lock).""" + p, out = pool.spawn("hold", DOMAIN) + assert wait_marker(out, "ACQUIRED") + path = lifecycle._app_lock_path(DOMAIN) # noqa: SLF001 + backdate = time.time() - (130 * 60) + os.utime(path, (backdate, backdate)) + calls = _inject_candidates(monkeypatch, [DOMAIN]) + lifecycle.janitor() + assert calls == [] + out_text = capsys.readouterr().out + assert "possible leaked run" in out_text and "lslocks" in out_text + assert lock_state(DOMAIN) == "held" + + +def test_11_warm_canonical_names_never_probed(lock_dir, monkeypatch): + """Case 11: RUN_APP_RE allowlist — warm/canonical-shaped names never become candidates, so + they are never probed (no lockfile is even created for them) and never reaped.""" + warmish = [ + "warm-keycloak.ci.commoninternet.net", + "keycloak.ci.commoninternet.net", + "warm-hedgedoc.ci.commoninternet.net", + "drone.ci.commoninternet.net", + ] + calls = [] + monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": d} for d in warmish]) + monkeypatch.setattr( + lifecycle, + "_docker_names", + lambda kind, stack: ["warm-keycloak_ci_commoninternet_net_app"] + if kind == "service" + else [], + ) + monkeypatch.setattr(lifecycle, "teardown_app", lambda d, verify=True: calls.append(d)) + lifecycle.janitor() + assert calls == [] + lockdir = os.environ["CCCI_APP_LOCK_DIR"] + assert [ + f for f in os.listdir(lockdir) if f.startswith("cc-ci-app-") + ] == [], "janitor must not create lockfiles for non-run-app names" + + +def test_12_degrades_safely_on_bad_lockfile_and_missing_dir(lock_dir, monkeypatch, capsys): + """Case 12: a garbled/unopenable lockfile (here: a DIRECTORY at the lockfile path) is skipped + with a log line; a missing lock dir doesn't crash the janitor either. Never a crash.""" + path = lifecycle._app_lock_path(DOMAIN) # noqa: SLF001 + os.makedirs(path) # open(path, "a") -> IsADirectoryError (an OSError) + calls = _inject_candidates(monkeypatch, [DOMAIN]) + lifecycle.janitor() # must not raise + assert calls == [] + assert "skipping" in capsys.readouterr().out + + os.rmdir(path) + monkeypatch.setenv("CCCI_APP_LOCK_DIR", os.path.join(os.environ["CCCI_APP_LOCK_DIR"], "gone")) + lifecycle.janitor() # missing dir: probe open fails -> skip; tidy glob -> empty. No crash. + assert calls == [] diff --git a/tests/concurrency/test_lifetime.py b/tests/concurrency/test_lifetime.py new file mode 100644 index 0000000..1b23c69 --- /dev/null +++ b/tests/concurrency/test_lifetime.py @@ -0,0 +1,82 @@ +"""Lifetime hardening (concurrency-restructure plan, cases 13-16): the REAL prctl/signal/alarm +guards installed by helper subprocesses; tests assert teardown ran, exit was non-zero, and the +lock was released.""" + +from __future__ import annotations + +import os +import signal +import sys + +sys.path.insert(0, os.path.dirname(__file__)) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +from concutil import ( # noqa: E402 + DOMAIN, + wait_lock_state, + wait_marker, + wait_pid_gone, +) + + +def test_13_pdeathsig_parent_kill_terms_harness(lock_dir, pool): + """Case 13: wrapper-parent spawns a guarded harness-child; the parent is SIGKILL'd (the + harness gets no courtesy signal) -> the kernel's PDEATHSIG TERMs the child, its teardown + funnel runs, it exits, and the lock is released.""" + p, out = pool.spawn("wrapper", DOMAIN) + line = wait_marker(out, "WRAPPED") + assert line, "wrapper never spawned its child" + child_pid = int(line.split()[1]) + pool.track_pid(child_pid) + assert wait_marker(out, "READY"), "guarded child never got ready" + + p.kill() # parent dies WITHOUT signalling the child — only PDEATHSIG can save us + p.wait(timeout=10) + assert wait_pid_gone(child_pid), "guarded child must exit on parent death (PDEATHSIG)" + assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run" + assert wait_lock_state(DOMAIN, "free") == "free" + + +def test_14_already_orphaned_helper_refuses_to_run(lock_dir, pool): + """Case 14 (ppid race): a helper whose parent died BEFORE the prctl was armed (it starts + already reparented to pid 1) must refuse to run — PDEATHSIG would never fire for it.""" + # Spawn an intermediate parent that forks orphan-probe and exits immediately. + import subprocess + + out = os.path.join(pool.out_dir, "orphan.out") + intermediate = ( + "import subprocess, sys, os; " + "subprocess.Popen([sys.executable, os.environ['CCCI_HELPERS'], 'orphan-probe']); " + ) + env = dict( + os.environ, + CCCI_HELPER_OUT=out, + CCCI_HELPERS=os.path.join(os.path.dirname(__file__), "helpers.py"), + ) + subprocess.run([sys.executable, "-c", intermediate], env=env, timeout=15, check=True) + line = wait_marker(out, "REFUSED", timeout=20) + assert line, "orphaned helper did not refuse to run (or never reparented to pid 1)" + + +def test_15_deadline_alarm_fires_teardown_and_releases(lock_dir, pool): + """Case 15: the self-deadline (alarm). A guarded helper with a 2s deadline tears down via + the funnel (finally: ran), exits NON-zero, and its lock is released.""" + p, out = pool.spawn("guarded", DOMAIN, "2") + assert wait_marker(out, "READY") + rc = p.wait(timeout=20) + assert rc != 0, f"deadline exit must be non-zero (got {rc})" + assert rc == 128 + signal.SIGALRM, f"expected 142 (128+SIGALRM), got {rc}" + assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run on deadline" + assert wait_lock_state(DOMAIN, "free") == "free" + + +def test_16_sigterm_runs_teardown_funnel_and_releases(lock_dir, pool): + """Case 16: SIGTERM (drone cancel path) -> the finally: teardown funnel runs, exit is + non-zero, lock released.""" + p, out = pool.spawn("guarded", DOMAIN, "3600") + assert wait_marker(out, "READY") + p.send_signal(signal.SIGTERM) + rc = p.wait(timeout=20) + assert rc != 0, f"SIGTERM exit must be non-zero (got {rc})" + assert rc == 128 + signal.SIGTERM, f"expected 143 (128+SIGTERM), got {rc}" + assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run on SIGTERM" + assert wait_lock_state(DOMAIN, "free") == "free" diff --git a/tests/concurrency/test_locks.py b/tests/concurrency/test_locks.py new file mode 100644 index 0000000..8059dbb --- /dev/null +++ b/tests/concurrency/test_locks.py @@ -0,0 +1,85 @@ +"""Lock fundamentals (concurrency-restructure plan, cases 1-4). Real kernel flocks held by real +subprocesses — nothing mocked.""" + +from __future__ import annotations + +import fcntl +import os +import sys +import time + +sys.path.insert(0, os.path.dirname(__file__)) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +from concutil import ( # noqa: E402 + DOMAIN, + lock_state, + wait_lock_state, + wait_marker, +) +from harness import lifecycle # noqa: E402 + + +def test_1_sigkill_releases_lock(lock_dir, pool): + """Case 1: acquire -> holder SIGKILL'd -> lock immediately acquirable (kernel auto-release). + The exact property the old pidfile registry approximated with /proc checks.""" + p, out = pool.spawn("hold", DOMAIN) + assert wait_marker(out, "ACQUIRED"), "holder never acquired" + assert lock_state(DOMAIN) == "held" + p.kill() + p.wait(timeout=10) + assert wait_lock_state(DOMAIN, "free") == "free" + + +def test_2_nb_probe_held_vs_unheld(lock_dir, pool): + """Case 2: LOCK_NB probe raises BlockingIOError against a held lock; succeeds when unheld.""" + p, out = pool.spawn("hold", DOMAIN) + assert wait_marker(out, "ACQUIRED") + path = lifecycle._app_lock_path(DOMAIN) # noqa: SLF001 + with open(path, "a") as f: + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + raise AssertionError("LOCK_NB succeeded against a held lock") + except BlockingIOError: + pass + p.kill() + p.wait(timeout=10) + assert wait_lock_state(DOMAIN, "free") == "free" + with open(path, "a") as f: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) # must not raise now + + +def test_3_lock_fd_not_inherited_by_children(lock_dir, pool): + """Case 3 (PEP 446): the holder spawns a subprocess child, the holder dies, the child lives — + and the lock is STILL released (the child never inherited the lock fd). This is what makes + 'held lock == live HARNESS owner' sound even though runs spawn abra/docker/pytest children.""" + p, out = pool.spawn("hold-with-child", DOMAIN) + assert wait_marker(out, "ACQUIRED") + child_line = wait_marker(out, "CHILD") + assert child_line, "holder never reported its child pid" + child_pid = int(child_line.split()[1]) + pool.track_pid(child_pid) + p.kill() + p.wait(timeout=10) + assert os.path.exists(f"/proc/{child_pid}"), "child should outlive the holder" + assert ( + wait_lock_state(DOMAIN, "free") == "free" + ), "lock must release on holder death even with a live child (PEP 446 non-inheritable fd)" + + +def test_4_second_acquire_blocks_until_first_exits(lock_dir, pool): + """Case 4: a second same-domain acquire blocks until the first holder exits — the + double-!testme serialisation property.""" + p1, out1 = pool.spawn("hold", DOMAIN) + assert wait_marker(out1, "ACQUIRED") + p2, out2 = pool.spawn("hold", DOMAIN) + # p2 must NOT acquire while p1 holds. + time.sleep(1.5) + assert wait_marker(out2, "ACQUIRED", timeout=0.1) is None, "second acquire did not block" + t_kill = time.time() + p1.kill() + p1.wait(timeout=10) + line = wait_marker(out2, "ACQUIRED", timeout=15) + assert line, "second acquire never completed after first holder exited" + acquired_ts = float(line.split()[1]) + assert acquired_ts >= t_kill - 0.05, "second holder acquired before the first exited" + assert lock_state(DOMAIN) == "held" From d3fe9e26bb0fbaedb37383539ba3973bc1c80aff Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 10 Jun 2026 04:32:54 +0000 Subject: [PATCH 6/6] =?UTF-8?q?docs:=20P5=20concurrency=20spec=20rewrite?= =?UTF-8?q?=20=E2=80=94=20one=20lock,=20one=20structural=20isolation,=20th?= =?UTF-8?q?e=20invariant=20chain?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewritten to the restructured model: lifetime-hardening guards (PDEATHSIG/SIGTERM/SIGALRM + setsid/trap), per-run ABRA_DIR isolation (same-recipe runs now parallel), per-app-domain flock (double-!testme serialisation), flock-probe janitor decision table (incl. the inode-identity race rows), updated failure-mode table (cancel now tears down via the harness's own funnel; reboot reaps immediately; 60-min deadline bounds everything), single-knob config table, how to run tests/concurrency, fresh file/symbol index + deleted-symbol list for grep verification. Also drops the last stale concurrency.limit mention from the .drone.yml header comment. --- .drone.yml | 4 +- docs/concurrency.md | 311 ++++++++++++++++++++++++++------------------ 2 files changed, 188 insertions(+), 127 deletions(-) diff --git a/.drone.yml b/.drone.yml index d7c3674..cc709f3 100644 --- a/.drone.yml +++ b/.drone.yml @@ -35,8 +35,8 @@ steps: # the comment-bridge). Deploys the recipe at the PR head, runs install/upgrade/backup + any # recipe-local tests via the shared harness, then guarantees teardown (plan §4.2/§4.3). # -# Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix) + -# concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by +# Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix, the +# single concurrency knob) allows two recipe runs in parallel. Concurrent-run safety is enforced by # the harness, not by serialisation: every run holds an exclusive flock on its app domain # (/run/lock/cc-ci-app-.lock) for its whole process lifetime, the run-start janitor probes # that lock to reap only orphans (held lock = live run, never touched), and recipe working trees diff --git a/docs/concurrency.md b/docs/concurrency.md index 64d4496..06858f1 100644 --- a/docs/concurrency.md +++ b/docs/concurrency.md @@ -1,167 +1,228 @@ # Concurrency: how parallel recipe CI runs stay safe -Spec of the concurrent-run system as of 2026-06-10 (commits `c0df77d`, `68ef0f8`, `e6d55b5`). -Written for review/restructuring — it documents what IS, including known limitations. +Spec of the concurrent-run system after the 2026-06-10 restructure (branch +`restructure/concurrency`; plan: cc-ci-plan `concurrency-restructure-full-plan.md`). The previous +registry + per-recipe-flock model is documented in this file's git history (`5b65c6c`). ## 1. Goal and design summary -Two recipe CI builds may run **at the same time** on the single cc-ci host (e.g. immich and -plausible under active development at once). Safety is enforced by the **harness**, not by -serialising everything: +Two recipe CI builds may run **at the same time** on the single cc-ci host. Safety is enforced by +the **harness**, not by serialising everything, and rests on ONE locking mechanism plus ONE +structural isolation: | Rule | Mechanism | |---|---| | Different recipes run in parallel | nothing blocks them (isolation, §3) | -| Same-recipe runs serialise | per-recipe `flock` (§4) | -| A starting run never reaps a live concurrent run's app | active-run registry + three-way janitor (§5) | -| A crashed run's leftovers still get reaped | registry owner-dead detection, age fallback (§5) | +| Same-RECIPE runs run in parallel too | per-run `ABRA_DIR` recipe trees (§4) — no shared tree, no lock | +| Same-DOMAIN runs (double-`!testme` of one PR) serialise | per-app-domain `flock` (§5) | +| A starting run never reaps a live concurrent run's app | janitor probes the app lock; held = live (§6) | +| A crashed/canceled/rebooted run's leftovers get reaped | lock auto-released by the kernel → probe acquires → reap (§6) | -There is **no daemon and no shared state service**: both mechanisms are kernel/file primitives -under `/run`, scoped to the harness process lifetime, so a `SIGKILL`'d run can never leak a stale -lock or a stale "I'm alive" claim. +The invariant chain that makes "held lock = live owner" sound: -## 2. Configuration knobs +``` +lock lifetime ⊆ harness process lifetime ⊆ drone step lifetime ⊆ 60-min hard deadline +``` -| Knob | Where | Current | Meaning | -|---|---|---|---| -| `DRONE_RUNNER_CAPACITY` (aka `MAX_TESTS`) | `nix/modules/drone-runner.nix` (`maxTests` let-binding) | `2` | **THE cap.** Max builds the exec runner executes at once; Drone queues the rest in its native pending queue. Change requires `nixos-rebuild switch` on cc-ci. | -| `concurrency.limit` | `.drone.yml`, `recipe-ci` pipeline | `2` | Server-side cap on concurrent `recipe-ci` pipelines. Kept equal to capacity; redundant belt (the push pipeline shares runner capacity too, so lint builds can interleave). | -| `CCCI_JANITOR_MAX_AGE` | env, read in `lifecycle.janitor()` | unset → `7200`s | **Age fallback only** — applies solely to apps with no registry entry (§5 case 3). The capacity=1-era `"0"` override in `.drone.yml` is GONE; do not reintroduce it (it made a starting build reap in-flight runs). | -| `RECIPE_LOCK_DIR` | `lifecycle.py` constant | `/run/lock` | Where per-recipe lock files live. | -| `ACTIVE_RUN_DIR` | `lifecycle.py` constant | `/run/cc-ci-active` | Where active-run pidfiles live. | +- **lock ⊆ process**: locks are kernel flocks on fds the process holds (and PEP 446 makes those + fds non-inheritable, so abra/docker/pytest children never carry them). The kernel releases them + on process death, however it dies. There is no unlock code path and no stale-lock failure mode. +- **process ⊆ step**: `PR_SET_PDEATHSIG(SIGTERM)` + the `.drone.yml` setsid/trap wrap (§2) — a + dead or canceled build cannot leak a running harness. +- **step ⊆ 60 min**: `signal.alarm(3600)` self-deadline (§2). -Memory budget rationale for capacity=2 (Hetzner cpx22, ~7.6 GiB): a full immich stack measured -~1 GiB; two concurrent recipes fit. Revert `maxTests` to `"1"` if OOM/disk-I/O contention appears. +Never steal a held lock; manage the holder's lifetime. There is **no daemon and no shared state +service** — everything is kernel/file primitives under `/run/lock` and per-run directories. + +## 2. Mechanism 0: run-lifetime hardening (`runner/harness/lifetime.py`) + +`run_recipe_ci.main()` calls `lifetime.install_lifetime_guards()` before ANY abra call or lock +acquisition: + +1. **`PR_SET_PDEATHSIG(SIGTERM)`** (ctypes prctl, return code checked): if the parent — the drone + step shell — dies, the kernel TERMs the harness. A post-prctl `ppid == 1` re-check closes the + start race: a harness whose parent died *before* the prctl armed would never get the signal, + so it refuses to run orphaned. +2. **SIGTERM handler**: logs, then raises `SystemExit(143)` so the run's `finally:` teardown + funnel executes and the process exits non-zero. Re-entrant signals during teardown are logged + and IGNORED (`lifetime.begin_teardown()`, also set at the top of the run's `finally:` blocks) + so a second signal can't abort the cleanup the first one asked for. +3. **`signal.alarm(3600)` hard deadline**: SIGALRM funnels into the same teardown path with a + distinct log line (`== run exceeded 60-minute hard deadline — tearing down ==`), exit 142. + Recipes keep their own smaller per-tier timeouts; this bounds the whole run. Teardown time + after the deadline is deliberately not alarm-bounded — the janitor is the backstop if a + teardown wedges and the process is killed harder. + +The `.drone.yml` recipe-ci step runs the harness as `setsid cc-ci-run … &` with a +`trap 'kill -TERM -- "-$PID"' TERM EXIT; wait "$PID"` — a drone **cancel** (TERM to the step +shell) is forwarded to the harness's whole process group instead of leaking it (the exec runner +only kills the step shell). PDEATHSIG backstops the no-trap paths. ## 3. Isolation model: what is shared, what is per-run Per-run (no conflict possible): -- **App + stack + volumes + secrets.** The run app domain is deterministic and unique per - (recipe, pr, ref): `naming.app_domain()` → `-.ci.commoninternet.net`. - Everything abra creates is namespaced by it. Run apps are recognised by - `RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`; canonical/warm apps - (e.g. `warm-keycloak...`) deliberately do NOT match, so the janitor never touches them. -- **Drone build workspace.** The exec runner gives each build its own clone under - `/var/lib/drone-runner/drone-/` — harness code and test files are per-build. -- **Run artifacts.** `/var/lib/cc-ci-runs//`. +- **App + stack + volumes + secrets.** Run app domain = `naming.app_domain()` → + `-.ci.commoninternet.net`, unique per (recipe, pr, ref); + everything abra creates is namespaced by it. Run apps are recognised by + `RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`; warm/canonical apps + (e.g. `warm-keycloak...`) deliberately do NOT match → the janitor never probes them. +- **Recipe working trees** — `$ABRA_DIR/recipes/`, per run (§4). NEW in the restructure. +- **Drone build workspace** (`/var/lib/drone-runner/drone-/`) and **run artifacts** + (`/var/lib/cc-ci-runs//`). -Shared (the two hazards the mechanisms exist for): +Shared (by design, conflict-free): -- **`~/.abra/recipes/`** — ONE working tree per recipe (abra's own layout). The harness - `fetch_recipe()` `rm -rf`'s + reclones it at run start, and the upgrade tier `git checkout`s it - mid-run for the chaos redeploy. Two same-recipe runs would corrupt each other's deploy tree - (observed: immich builds 229/230 deployed a tree missing its config). → per-recipe flock (§4). -- **`HOME=/root`** — forced in `.drone.yml` so abra finds its server config under `/root/.abra`. - Safe *given* the above: app names are unique and same-recipe runs serialise, so no two builds - touch the same recipe checkout or app env file. +- **`/root/.abra/servers`** — app `.env` files, one per domain. The per-run `ABRA_DIR` symlinks + `servers/` here, so .env files land in the canonical path: janitor discovery (`abra app ls`) + and out-of-run tooling see every app. Per-domain filenames + the app-domain lock prevent write + conflicts. +- **`/root/.abra/catalogue`** — read-mostly, symlinked into each per-run dir. +- **`HOME=/root`** (forced in `.drone.yml`) — safe: nothing recipe-mutable lives under `~/.abra` + for a run anymore except through the two symlinks above. -## 4. Mechanism 1: per-recipe flock +## 4. Mechanism 1: per-run `ABRA_DIR` (replaces the per-recipe flock) -Code: `lifecycle.acquire_recipe_lock(recipe)`; taken in `run_recipe_ci.main()` **before** -`fetch_recipe()` (the first shared-tree mutation). +`run_recipe_ci.setup_run_abra_dir()` — called first thing in `main()`, before any abra call — +builds `//abra/` (run-id = Drone build number; `manual-` for hand runs): -- Lock file: `/run/lock/cc-ci-recipe-.lock`, exclusive `fcntl.flock`. -- Non-blocking attempt first; on `BlockingIOError` it logs - `== recipe lock: another run is in flight — waiting ... ==` and blocks. The waiting - run is visibly "stuck" in its drone log at that line — that is by design. -- The open file object is returned and kept alive (`_recipe_lock = ... # noqa: F841`) for the - **whole process lifetime**. Release is implicit: the kernel drops a flock when the fd closes — - including on crash or SIGKILL. **There is no stale-lock failure mode and no unlock code path.** -- Scope: serialises only runs of the SAME recipe. Different recipes never contend. +``` +abra/ + servers/ -> /root/.abra/servers (symlink; canonical shared .env path) + catalogue/ -> /root/.abra/catalogue (symlink; read-mostly) + recipes/ fresh, empty (THE isolation that matters) +``` -## 5. Mechanism 2: active-run registry + three-way janitor +and exports it as `$ABRA_DIR` — honored by the abra CLI itself and by every harness path helper +(`abra.abra_dir()` / `abra.recipe_dir()`; `generic._recipe_dir`, `prepull_images`, +`snapshot_recipe_tests`, `warm_reconcile._recipe_dir` all route through the same rule: +`$ABRA_DIR` if set, else `~/.abra`). -Why: every run starts with `lifecycle.janitor()` (called from both the cold path and the -warm/quick path in `run_recipe_ci.py`) to reap orphans left by crashed/SIGKILL'd runs (whose -`finally:` teardown never ran). Under capacity=2 "any run app that isn't mine" may be a LIVE -concurrent run — age alone can't tell. The registry makes ownership explicit. +- `fetch_recipe()` is now a plain clone into `$ABRA_DIR/recipes/` (PR-head clone+checkout + or `abra recipe fetch`); the upgrade tier's mid-run `git checkout`s happen in the run's own + tree. Two same-recipe runs can no longer corrupt each other — structurally, with no lock. The + old observed failure (immich builds 229/230 deploying a tree missing its config) is impossible. +- `CCCI_SKIP_FETCH=1` (test/Adversary staging) copies the canonically-staged + `~/.abra/recipes/` clone into the per-run tree. +- Out-of-run flows (warm_reconcile's systemd timer, manual abra) set no `ABRA_DIR` and keep using + the canonical `/root/.abra` unchanged. In-run flows that touch canonical state on purpose + (warm/canonical .env files) go through `servers/` and are unaffected. +- The per-run dir rides along the existing `/var/lib/cc-ci-runs//` retention. abra + auto-clones any recipe it needs to resolve (e.g. during `app ls`) into the per-run `recipes/` — + a few seconds of git per run, gone with the run dir. -Registry protocol (all in `lifecycle.py`): +## 5. Mechanism 2: per-app-domain flock (`lifecycle.acquire_app_lock`) -1. `register_run_app(domain)` — writes `/run/cc-ci-active/` containing the harness pid. - Called inside `deploy_app()` **before** the app is created, so no window exists where a - concurrent janitor can see the app without its registration. -2. `unregister_run_app(domain)` — removes the pidfile. Called at the end of `teardown_app()` - (every exit path funnels through teardown) and by the janitor after reaping. -3. `_run_owner_state(domain)` — classifies the owner: - - reads the pid; missing/garbled file → `"unknown"` - - `/proc//cmdline` gone → `"dead"` - - cmdline must contain `run_recipe_ci` → `"alive"`, else `"dead"` (**pid-reuse guard**: a - recycled pid won't look like a harness run) +- Lock file: `/run/lock/cc-ci-app-.lock` (dir overridable via `CCCI_APP_LOCK_DIR` for the + test suite), exclusive `fcntl.flock`, taken in `deploy_app()` **before the app is created** — a + concurrent janitor can never see a run app without its held lock. +- Blocks (with a log line: `== app lock: another run of is in flight — waiting ==`) when + another run of the SAME domain is in flight — the double-`!testme` serialisation point; the + waiting run is visibly parked at that line in its drone log, by design. +- The returned file object is ALSO retained in module-level `_held_app_locks` — if a caller + dropped it, GC would close the fd and silently release the lock. +- mtime is touched at acquisition: lock age feeds the janitor's long-held flag (§6). +- **Unlink/recreate race guard**: the janitor unlinks reaped lockfiles, so after EVERY + acquisition the locked fd is verified to still be the inode the path names + (`fstat().st_ino == stat().st_ino`); a waiter that won a just-unlinked inode closes it and + retries on the live path. (A lock on an unlinked inode protects nothing: a later opener gets a + fresh inode and would acquire "the same" lock.) +- Release is implicit: process exit (any kind). `teardown_app()` does NOT release or unlink — + a clean run's leftover lockfile is unheld and is unlinked on sight by the next janitor sweep. -Janitor decision table (`lifecycle.janitor()`): +## 6. The flock-probe janitor (`lifecycle.janitor`) -| Owner state | Meaning | Action | +Runs at every run start (cold + quick paths) and in the warm/upgrade sweeps. Candidate discovery +is unchanged from the old model: `abra app ls` + a docker-service sweep (catches stacks whose +`.env` is already gone), both matched against `RUN_APP_RE` — warm/canonical apps never match and +are never probed. + +Decision table (per candidate domain, `_probe_and_reap`): + +| Probe (`LOCK_EX\|LOCK_NB`) | Meaning | Action | |---|---|---| -| `alive` | live concurrent run | **never reap** (logs "is a live concurrent run — leaving it") | -| `dead` | crashed run's definite orphan | reap immediately (`teardown_app(verify=False)`) + unregister | -| `unknown` | pre-registry app, or post-reboot (`/run` is tmpfs) | age fallback: reap only if stack age ≥ `CCCI_JANITOR_MAX_AGE` (default 2h) | +| acquires (+ inode identity OK) | nobody holds it → owner died (kernel-guaranteed) | **reap**: `teardown_app(verify=False)` WHILE HOLDING the probe lock, then unlink the lockfile, then release | +| acquires, inode stale | another janitor reaped + unlinked while we raced | skip (reap already done; unlinking now would hit a newer run's file) | +| `BlockingIOError` (held) | live concurrent run | leave it; if lockfile mtime > 120 min (2× the hard deadline): `!! lock for held >120min — possible leaked run; inspect with lslocks` — flag, **never steal** | +| `open()` fails (`OSError`) | garbled/unopenable lockfile | skip + log, never crash | -Candidate discovery is unchanged from before: `abra app ls` matches against `RUN_APP_RE`, plus a -docker-service sweep that reconstructs domains for stacks whose `.env` was already deleted. - -## 6. Where convergence fits (adjacent, landed with this work) - -Parallel runs surfaced two swarm-convergence bugs that look like concurrency bugs but aren't — -documented here because any restructuring must keep them fixed (`services_converged()` in -`lifecycle.py`): - -- **N/N replicas ≠ converged** during a stop-first rolling update: the update is *registered* - instantly but the OLD task still shows 1/1 until swarm cycles it (build 238: backupbot exec'd a - pre-hook into a container killed seconds later → 409 → empty snapshot). `services_converged()` - therefore also inspects each service's `UpdateStatus.State`. -- **`paused` persists forever**: swarm's default `update-failure-action: pause` sets it on one - task flicker and it never clears, even at N/N healthy (build 241 hung 22 min). Only `updating` - and `rollback_started` block convergence; `paused`/`rollback_paused`/`completed` are settled — - the HTTP-health and tier assertions still gate actual app correctness. -- `backup_app()` additionally waits (bounded, 300s) for `services_converged()` before - `abra app backup create`, as defence in depth for the backupbot race. +- Reaping under the probe lock closes the janitor-vs-new-run race: a new run of that domain + blocks in `acquire_app_lock` until the reap finishes — no window where a fresh app coexists + with a half-reaped one. +- Two racing janitors arbitrate on the flock: one reaps, the other sees "held" and leaves; reaps + are idempotent (`teardown_app(verify=False)` tolerates half-gone stacks). +- After the candidates, a tidy sweep unlinks stale **unheld** `cc-ci-app-*.lock` files with no + app behind them (under their own probe lock + identity check), keeping `/run/lock` clean. +- **Post-reboot**: `/run/lock` is tmpfs → lockfiles gone → every surviving app probes as an + orphan → reaped immediately. (Improvement over the old 2-hour age fallback; there IS no age + logic anymore.) ## 7. Failure-mode guarantees | Event | Outcome | |---|---| -| Run crashes / SIGKILL mid-run | flock auto-released by kernel; pidfile remains but owner is `dead` → next janitor (any run's start) reaps app + pidfile | -| Drone build canceled via API | **known gap**: cancel kills the step's `sh` wrapper but can LEAK the python harness child — it keeps running (holding lock + registry) until killed by hand. See §8. | -| Host reboot | `/run` is tmpfs → locks and registry vanish (correct: no processes survived either). All surviving apps become `unknown` → 2h age fallback governs. | -| Two same-recipe `!testme`s | second blocks on the flock at run start (before touching the shared tree), runs after the first finishes | -| Janitor vs. app being created | impossible to mis-reap: registration happens before app creation, and an `alive` owner is never reaped | -| Pid reuse after crash | cmdline check (`run_recipe_ci`) classifies as `dead`, orphan still reaped | +| Run crashes / SIGKILL mid-run | flock auto-released by kernel → next janitor probe reaps app + lockfile | +| Drone build canceled via API | step trap TERMs the harness process group → SIGTERM funnel runs the run's own teardown (exit 143); if anything still leaks, PDEATHSIG + janitor reap (the old "cancel leaks the harness" gap is CLOSED) | +| Run exceeds 60 min | SIGALRM → distinct log line → own teardown → exit 142 | +| Host reboot | locks and lockfiles vanish (tmpfs, correct: no owners survived) → all surviving run apps reaped at the next run start, immediately | +| Two same-recipe `!testme`s (different PRs) | run in parallel — separate domains, separate per-run recipe trees | +| Double-`!testme` (same PR → same domain) | second blocks on the app lock before creating anything, visibly in its drone log, runs after the first finishes | +| Janitor vs. app being created | impossible to mis-reap: the lock is held before `app new`, and a held lock is never touched | +| Janitor unlink vs. blocked waiter | inode identity re-check on every acquisition → waiter retries on the live path | +| Lock held implausibly long (>120 min) | flagged loudly for a human (`lslocks`), never stolen | -## 8. Known limitations / restructuring candidates +## 8. Where convergence fits (adjacent; unchanged by the restructure) -1. **Drone cancel leaks the harness process.** The exec runner kills the step shell, not the - process tree; the leaked python continues deploying/holding the lock. Fix ideas: run the step - under `setsid` + a trap that kills the process group, or have the harness watch - `DRONE_BUILD_STATUS`/parent-death (`PR_SET_PDEATHSIG`). -2. **Head-of-line blocking on same-recipe serialisation.** A run waiting on the recipe flock - still occupies one of the 2 runner slots, so two builds of the SAME recipe temporarily starve - all other recipes. Alternatives: a Drone-level per-recipe fan-out (one pipeline `concurrency` - group per recipe is not natively expressible), or detect-and-requeue in the harness. -3. **The lock protects harness runs only.** Manual `abra`/`git` activity on - `~/.abra/recipes/` (operator or another agent) bypasses the flock — the - "park the checkout, then hands off" discipline is still required. A restructure could make the - harness deploy from a per-run copy of the recipe tree instead of the shared checkout - (eliminates the lock entirely, at the cost of diverging from abra's expected layout). -4. **`HOME=/root` is still shared.** Safe today by argument (§3), not by enforcement. Per-build - `ABRA_DIR` with a shared read-only server config would make isolation structural. -5. **Registry is advisory.** Nothing stops a non-harness actor from creating run-app-shaped - stacks the janitor will eventually age-reap; conversely the janitor trusts pidfiles it can - parse. Acceptable on a single-purpose CI host. -6. **Capacity is configured in two places** (`drone-runner.nix` + `.drone.yml`) that must be - kept in step by hand. +Two swarm-convergence behaviors in `services_converged()` look like concurrency bugs but aren't — +any future work must keep them fixed: -## 9. File / symbol index +- **N/N replicas ≠ converged** during a stop-first rolling update — `UpdateStatus.State` is also + inspected (build 238: backupbot exec'd into a container killed seconds later). +- **`paused` persists forever** (swarm's default `update-failure-action`) — only `updating` and + `rollback_started` block convergence; `paused`/`rollback_paused` are settled (build 241). +- `backup_app()` additionally waits (bounded 300s) for convergence before `backup create`. + +## 9. Configuration knobs + +| Knob | Where | Current | Meaning | +|---|---|---|---| +| `DRONE_RUNNER_CAPACITY` (aka `MAX_TESTS`) | `nix/modules/drone-runner.nix` (`maxTests`) | `2` | **THE single concurrency knob.** Max builds the exec runner executes at once; Drone queues the rest. (The `.drone.yml` `concurrency.limit` duplicate was removed.) Change requires `nixos-rebuild switch`. | +| `CCCI_APP_LOCK_DIR` | env, read at call time | unset → `/run/lock` | App-domain lockfile dir override — used by `tests/concurrency` to sandbox locks. Never set in production. | +| hard deadline | `lifetime.HARD_DEADLINE_SECONDS` | 3600 s | the whole-run alarm; long-held flag threshold is 2× this (`LONG_HELD_LOCK_SECONDS`) | + +## 10. Testing: `tests/concurrency/` + +Real-kernel suite (19 planned cases + companions): helper subprocesses hold REAL flocks and +install the REAL prctl/signal/alarm guards — flock itself is never mocked; the janitor runs with +injected candidates + stubbed teardown but probes real locks. **Not part of the default +`pytest tests/unit` gate** (it spawns processes and sleeps); run it explicitly: + +``` +cc-ci-run -m pytest tests/concurrency -q +``` + +Covers: kernel auto-release on SIGKILL; LOCK_NB probe semantics; PEP 446 fd non-inheritance; +same-domain serialisation; orphan reap + unlink; live-run protection; reap-under-probe-lock +blocking; two-janitor arbitration; reboot-immediate reap; long-held flag; RUN_APP_RE allowlist; +degrade-on-garbage; PDEATHSIG; ppid start race; deadline + SIGTERM funnels; per-run ABRA_DIR +construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality. + +## 11. File / symbol index | What | Where | |---|---| -| `maxTests` / `DRONE_RUNNER_CAPACITY` | `nix/modules/drone-runner.nix` | -| `concurrency.limit`, `HOME=/root`, env | `.drone.yml` (`recipe-ci` pipeline) | -| Lock + registry constants & helpers | `runner/harness/lifecycle.py` (top, after `TeardownError`) | -| `acquire_recipe_lock` call site | `runner/run_recipe_ci.py` `main()`, before `fetch_recipe()` | -| `register_run_app` call site | `lifecycle.deploy_app()` (before app creation) | -| `unregister_run_app` call sites | `lifecycle.teardown_app()`, `lifecycle.janitor()` | -| Janitor + decision table | `lifecycle.janitor()`, `_run_owner_state()` | -| Run-app naming | `runner/harness/naming.py` (`app_domain`), `RUN_APP_RE` in `lifecycle.py` | -| Convergence (adjacent) | `lifecycle.services_converged()`, `lifecycle.backup_app()` | +| lifetime guards (PDEATHSIG, signal funnels, deadline) | `runner/harness/lifetime.py`; installed in `run_recipe_ci.main()` | +| setsid/trap cancel forwarding | `.drone.yml` (`recipe-ci` step) | +| `acquire_app_lock`, `_held_app_locks`, `_app_lock_path` | `runner/harness/lifecycle.py` | +| `acquire_app_lock` call site | `lifecycle.deploy_app()` (before app creation) | +| janitor + probe (`janitor`, `_probe_and_reap`, `LONG_HELD_LOCK_SECONDS`) | `runner/harness/lifecycle.py` | +| per-run ABRA_DIR (`setup_run_abra_dir`, `fetch_recipe`) | `runner/run_recipe_ci.py` | +| path resolution (`abra_dir`, `recipe_dir`) | `runner/harness/abra.py` (used by `generic`, `lifecycle.prepull_images`, `warm_reconcile`) | +| run-app naming | `runner/harness/naming.py` (`app_domain`), `RUN_APP_RE` in `lifecycle.py` | +| capacity knob | `nix/modules/drone-runner.nix` (`maxTests`) | +| convergence (adjacent) | `lifecycle.services_converged()`, `lifecycle.backup_app()` | +| the test suite | `tests/concurrency/` (`helpers.py` subprocess entrypoints, `concutil.py` probes) | + +Deleted in the restructure (grep should find NOTHING): `register_run_app`, `unregister_run_app`, +`_run_owner_state`, `ACTIVE_RUN_DIR`, `CCCI_JANITOR_MAX_AGE`, `_stack_age_seconds`, +`acquire_recipe_lock`, `RECIPE_LOCK_DIR`.