diff --git a/.drone.yml b/.drone.yml index d46adcc..d356171 100644 --- a/.drone.yml +++ b/.drone.yml @@ -37,10 +37,11 @@ steps: # # Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix) + # concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by -# the harness, not by serialisation: same-recipe runs serialise on a per-recipe flock -# (lifecycle.acquire_recipe_lock — the shared ~/.abra/recipes/ checkout is the conflict), -# and every run registers its app domain + pid in /run/cc-ci-active so the run-start janitor only -# reaps orphans whose owning run is DEAD (alive → never touched; unknown → age fallback, default 2h). +# the harness, not by serialisation: every run holds an exclusive flock on its app domain +# (/run/lock/cc-ci-app-.lock) for its whole process lifetime, the run-start janitor probes +# that lock to reap only orphans (held lock = live run, never touched), and same-recipe runs +# serialise on a per-recipe flock for the shared ~/.abra/recipes/ checkout +# (lifecycle.acquire_recipe_lock — removed by P3's per-run ABRA_DIR). See docs/concurrency.md. kind: pipeline type: exec name: recipe-ci diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 2346948..210a011 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -7,8 +7,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer from __future__ import annotations import contextlib -import datetime import fcntl +import glob import json import os import re @@ -18,7 +18,7 @@ import subprocess import time import urllib.request -from . import abra +from . import abra, lifetime GATEWAY_IP = "143.244.213.108" # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci) # A run app domain is "-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the @@ -31,22 +31,22 @@ class TeardownError(RuntimeError): # --- Concurrent-run safety (capacity=2) ------------------------------------------------------- -# Two cooperating mechanisms, both process-lifetime-scoped so SIGKILL can't leak a stale lock: -# 1. Per-recipe flock: ~/.abra/recipes/ is ONE shared working tree that fetch_recipe -# rm-rf's/reclones and the upgrade tier git-checkouts mid-run. Concurrent runs of the SAME -# recipe would corrupt each other's deploy tree (observed: immich builds 229/230 deployed a -# tree missing its config), so they serialise on an exclusive flock; different recipes run in -# parallel. The kernel drops a flock when the holder dies, however it dies. -# 2. Active-run registry: each run registers its app domain + pid before creating the app, so the -# janitor can tell a live concurrent run from a crashed run's orphan (see janitor()). +# ONE mechanism, process-lifetime-scoped so SIGKILL can't leak a stale claim: every run holds an +# exclusive kernel flock on its app DOMAIN (/run/lock/cc-ci-app-.lock) for the whole run. +# A held lock implies a live owner — the kernel releases a flock when the holding process dies, +# however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held → +# leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never +# steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each +# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR) +# serialise on this app lock. See docs/concurrency.md. RECIPE_LOCK_DIR = "/run/lock" -ACTIVE_RUN_DIR = "/run/cc-ci-active" def acquire_recipe_lock(recipe: str): - """Take the per-recipe exclusive lock; blocks (with a log line) if another run of the same - recipe is in flight. Returns the open lock file — the CALLER must keep a reference for the - whole run; the lock is released only when the process exits and the fd closes.""" + """Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes + checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and + with it this lock, structurally unnecessary); until then the caller keeps the returned file + alive for the whole run and release is implicit at process exit.""" path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock") # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. f = open(path, "w") # noqa: SIM115 — deliberately held for the lifetime of the run @@ -63,39 +63,55 @@ def acquire_recipe_lock(recipe: str): return f -def _registry_path(domain: str) -> str: - return os.path.join(ACTIVE_RUN_DIR, domain) +# Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the +# caller drops the returned file object, GC would close the fd and silently release the lock — +# this list is the lock's owner of record. Never cleared; release is process exit. +_held_app_locks: list = [] -def register_run_app(domain: str) -> None: - """Record this process as the live owner of a run app (called BEFORE the app is created, so a - concurrent run's janitor can never observe the app without its registration).""" - with contextlib.suppress(OSError): - os.makedirs(ACTIVE_RUN_DIR, exist_ok=True) - with open(_registry_path(domain), "w") as f: - f.write(str(os.getpid())) +def _app_lock_dir() -> str: + """The app-domain lockfile dir. /run/lock (tmpfs: a reboot clears locks AND lockfiles, so + post-reboot apps probe as orphans and are reaped immediately). Env-overridable so the + tests/concurrency suite (and its helper subprocesses) can use a sandbox dir.""" + return os.environ.get("CCCI_APP_LOCK_DIR", "/run/lock") -def unregister_run_app(domain: str) -> None: - with contextlib.suppress(OSError): - os.remove(_registry_path(domain)) +def _app_lock_path(domain: str) -> str: + return os.path.join(_app_lock_dir(), f"cc-ci-app-{domain}.lock") -def _run_owner_state(domain: str) -> str: - """'alive' if the registered owner is a live run_recipe_ci process, 'dead' if registered but - gone (definite orphan), 'unknown' if never registered (pre-registry code or post-reboot).""" - try: - with open(_registry_path(domain)) as f: - pid = int(f.read().strip()) - except (OSError, ValueError): - return "unknown" - try: - with open(f"/proc/{pid}/cmdline", "rb") as f: - cmdline = f.read().decode(errors="replace").replace("\0", " ") - except OSError: - return "dead" - # Guard against pid reuse: the owner must still look like a harness run. - return "alive" if "run_recipe_ci" in cmdline else "dead" +def acquire_app_lock(domain: str): + """Take the per-app-domain exclusive lock; blocks (with a log line) if another run of the + same domain is in flight (double-!testme serialisation). Returns the open lock file, which is + ALSO retained in _held_app_locks so the flock lives exactly as long as the process. + + Unlink/recreate race guard: the janitor unlinks a reaped orphan's lockfile while holding its + flock, so a waiter blocked on the OLD inode can win a lock no later opener can observe (a new + open() at the path creates a FRESH inode). After every acquisition, verify the locked fd is + still the file at the path (st_ino match); if not, drop it and retry on the live path.""" + path = _app_lock_path(domain) + waited = False + while True: + # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. + f = open(path, "a") # noqa: SIM115 — deliberately held for the rest of the process + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + if not waited: + print(f"== app lock: another run of {domain} is in flight — waiting ==", flush=True) + waited = True + fcntl.flock(f, fcntl.LOCK_EX) + try: + if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino: + break # we hold the lock on the inode the path names — done + except FileNotFoundError: + pass + f.close() # locked a stale (unlinked) inode — retry on the live path + os.utime(f.fileno()) # mtime = acquisition time = lock age (janitor's long-held flag) + _held_app_locks.append(f) + if waited: + print(f"== app lock: acquired {path} ==", flush=True) + return f def _docker_names(kind: str, stack: str) -> list[str]: @@ -117,31 +133,6 @@ def _residual(domain: str) -> dict: } -def _stack_age_seconds(stack: str) -> float | None: - """Age of the stack's oldest service, or None if not present.""" - svcs = _docker_names("service", stack) - if not svcs: - return None - oldest = None - for s in svcs: - p = subprocess.run( - ["docker", "service", "inspect", s, "--format", "{{.CreatedAt}}"], - capture_output=True, - text=True, - ) - ts = p.stdout.strip() - try: - # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> take the leading 19 chars - dt = datetime.datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace( - tzinfo=datetime.UTC - ) - except ValueError: - continue - age = (datetime.datetime.now(datetime.UTC) - dt).total_seconds() - oldest = age if oldest is None else max(oldest, age) - return oldest - - def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]: """Per-recipe extra .env keys, applied at every deploy (install + upgrade's old_app) so a recipe with multi-domain / config needs is enrolled with NO shared-harness change (D5/M6.5). A recipe @@ -279,9 +270,10 @@ def deploy_app( past the 900s default. abra's INTERNAL TIMEOUT (recipe's TIMEOUT env, default 300s) is set via EXTRA_ENV; this is the Python subprocess wrapper's timeout so abra doesn't get SIGKILLed mid-deploy.""" _record_deploy() - # Register BEFORE the app exists: a concurrent run's janitor must never see this app without - # its live-owner registration (it would reap an in-flight deploy). - register_run_app(domain) + # Lock BEFORE the app exists: a concurrent run's janitor must never see this app without a + # held app lock (it would probe it as an orphan and reap an in-flight deploy). Also the + # double-!testme serialisation point: a second run of the same domain blocks here. + acquire_app_lock(domain) abra.app_config_remove(domain) # clear any stale .env from a prior crashed run abra.app_new(recipe, domain, version=version, secrets=secrets) # A pinned version must actually deploy that version: check the recipe out to the tag so the @@ -720,23 +712,84 @@ def teardown_app(domain: str, verify: bool = True) -> None: residual = _residual(domain) if any(residual.values()): raise TeardownError(f"teardown left residual for {domain}: {residual}") - # The app is gone — drop its active-run registration (janitor() also clears it when reaping). - unregister_run_app(domain) + # No unregistration step: the app lock releases implicitly at process exit. The clean run's + # leftover lockfile (unheld) is unlinked on sight by the next janitor's stale-lockfile sweep. -def janitor(max_age_seconds: int | None = None) -> None: - """Reap orphaned run apps from crashed/rebooted runs. Matches the real naming scheme. Safe under - CONCURRENT runs (capacity=2): every harness run registers its app in the active-run registry - (register_run_app), so the janitor distinguishes the three cases instead of using age alone: - - registered + owner run_recipe_ci process ALIVE -> in-flight concurrent run: never reap - - registered + owner DEAD (crashed/SIGKILLed run) -> definite orphan: reap immediately - - no registry entry (pre-registry code, reboot) -> fall back to the age threshold - Reaps via docker primitives so it works even when the .env is gone (A2/A3). Age fallback default - 2h, env-overridable via CCCI_JANITOR_MAX_AGE.""" - import os +# A lock held longer than 2x the 60-min hard deadline can only be a leaked run (the deadline +# bounds every healthy run). Flag it for a human — NEVER steal a held lock. +LONG_HELD_LOCK_SECONDS = 2 * lifetime.HARD_DEADLINE_SECONDS - if max_age_seconds is None: - max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200")) + +def _probe_and_reap(domain: str) -> None: + """Probe one run app's lock; reap iff nobody holds it (kernel-guaranteed orphan). + + Reaping happens WHILE HOLDING the probe lock, closing the janitor-vs-new-run race: a new run + of the same domain blocks in acquire_app_lock until the reap finishes, so a fresh app never + coexists with a half-reaped one. The lockfile is unlinked before release (still holding the + lock); a waiter that blocked on the unlinked inode re-checks identity and retries. Two racing + janitors arbitrate on the same flock: one reaps, the other sees 'held' and leaves — + teardown_app(verify=False) is idempotent either way.""" + path = _app_lock_path(domain) + try: + # PEP 446: non-inheritable fd, same as acquire_app_lock. + f = open(path, "a") # noqa: SIM115 — closed in the finally below, lock released with it + except OSError as e: + print(f"!! janitor: cannot open lockfile {path} ({e}) — skipping {domain}", flush=True) + return + try: + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + # Held -> live run. Never steal; flag if it has been held implausibly long. + try: + held_for = time.time() - os.stat(path).st_mtime + except OSError: + held_for = 0 + if held_for > LONG_HELD_LOCK_SECONDS: + print( + f"!! lock for {domain} held >{LONG_HELD_LOCK_SECONDS // 60}min — possible " + "leaked run; inspect with lslocks", + flush=True, + ) + else: + print( + f" janitor: {domain} lock held — live concurrent run, leaving it", flush=True + ) + return + # Acquired — but only the inode the PATH names counts (another janitor may have reaped + # and unlinked this inode while we raced; a lock on an unlinked inode protects nothing + # and unlinking the path now would delete a NEWER run's lockfile). + try: + if os.fstat(f.fileno()).st_ino != os.stat(path).st_ino: + return + except FileNotFoundError: + return + # Orphan: no live owner (the kernel released the lock when the owner died). Reap while + # holding the probe lock, then unlink the lockfile before releasing. + print(f" janitor: {domain} lock acquirable — orphan, reaping", flush=True) + with contextlib.suppress(Exception): + teardown_app(domain, verify=False) + with contextlib.suppress(OSError): + os.unlink(path) + finally: + f.close() + + +def janitor() -> None: + """Reap orphaned run apps from crashed/rebooted runs; the kernel flock is the only liveness + oracle. For every candidate run app, probe its app-domain lock (LOCK_NB): + + acquirable -> nobody holds it -> orphan -> reap under the probe lock + unlink lockfile + held -> live concurrent run -> leave it (warn if held >2x the hard deadline) + + Candidate discovery is unchanged: `abra app ls` + a docker-service sweep (catches stacks + whose .env is already gone), both matched against RUN_APP_RE — warm/canonical apps never + match and are never probed. Post-reboot, /run/lock (tmpfs) is empty, so every surviving app + probes as an orphan and is reaped immediately (no age threshold). Stale lockfiles with no + app behind them are unlinked on sight. Degrades safely: an unreadable lockfile/dir is + skipped with a log line, never a crash. Reaps via docker primitives so it works even when + the .env is gone (A2/A3).""" seen = set() for app in abra.app_ls(): name = app.get("appName") or app.get("domain") or "" @@ -750,18 +803,22 @@ def janitor(max_age_seconds: int | None = None) -> None: seen.add(f"{m.group(1)}.ci.commoninternet.net") for name in seen: - owner = _run_owner_state(name) - if owner == "alive": - print(f" janitor: {name} is a live concurrent run — leaving it", flush=True) - continue - if owner == "unknown": - # No registry entry (manual run on pre-registry code, or post-reboot): only the age - # threshold protects it, as before. - stack = _stack_name(name) - age = _stack_age_seconds(stack) - if age is not None and age < max_age_seconds: - continue # young and of unknown provenance; leave it - # owner == "dead" (a crashed/killed run's definite orphan) or old enough -> reap - with contextlib.suppress(Exception): - teardown_app(name, verify=False) - unregister_run_app(name) + _probe_and_reap(name) + + # Tidy /run/lock: a clean run's leftover lockfile is unheld and appless — unlink it (under + # its own probe lock, with the same identity check as above). + with contextlib.suppress(OSError): + for path in glob.glob(os.path.join(_app_lock_dir(), "cc-ci-app-*.lock")): + domain = os.path.basename(path)[len("cc-ci-app-") : -len(".lock")] + if domain in seen: + continue # handled (or deliberately left) above + with contextlib.suppress(OSError): + f = open(path, "a") # noqa: SIM115 — closed below, lock released with it + try: + fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) + if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino: + os.unlink(path) + except (BlockingIOError, FileNotFoundError): + pass # held (live run pre-deploy) or already gone — leave it + finally: + f.close()