feat(harness): P2 flock-probe janitor — the kernel flock IS the liveness oracle

- acquire_app_lock(domain): exclusive flock on /run/lock/cc-ci-app-<domain>.lock, taken in deploy_app exactly where register_run_app was (BEFORE app creation); blocks with a log line when another run of the same domain is in flight (double-!testme serialisation). The file object is retained in module-level _held_app_locks so GC can never close the fd and silently release the lock. mtime is touched at acquisition (lock age for the long-held flag). - janitor(): probes each candidate's lock (discovery unchanged: abra app ls + docker-service sweep vs RUN_APP_RE). Acquirable -> orphan -> teardown_app(verify=False) WHILE HOLDING the probe lock (a new same-domain run blocks until the reap finishes), then unlink before release. Held -> live run -> leave it; held >120min (2x hard deadline) -> warn, never steal. Stale unheld lockfiles with no app are unlinked on sight. Unreadable lockfile -> skip + log. - unlink/recreate race guard (both sides): after ANY acquisition, verify the locked fd still is the inode the path names (fstat vs stat); a waiter that won a just-unlinked inode retries on the live path, and a probe that won one skips (unlinking now would hit a newer run's file). - deleted: register_run_app, unregister_run_app, _run_owner_state, _registry_path, ACTIVE_RUN_DIR, CCCI_JANITOR_MAX_AGE + age fallback, _stack_age_seconds, pid-reuse guard. teardown_app no longer unregisters (release is process exit). janitor() takes no args now. - post-reboot: /run/lock is tmpfs -> lockfiles gone -> probe trivially acquires -> immediate reap (improvement over the old 2h age fallback).
2026-06-10 04:11:31 +00:00
parent b492f995bd
commit b302f3ab63
2 changed files with 160 additions and 102 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -37,10 +37,11 @@ steps:
 #
 # Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix) +
 # concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by
-# the harness, not by serialisation: same-recipe runs serialise on a per-recipe flock
-# (lifecycle.acquire_recipe_lock — the shared ~/.abra/recipes/<recipe> checkout is the conflict),
-# and every run registers its app domain + pid in /run/cc-ci-active so the run-start janitor only
-# reaps orphans whose owning run is DEAD (alive → never touched; unknown → age fallback, default 2h).
+# the harness, not by serialisation: every run holds an exclusive flock on its app domain
+# (/run/lock/cc-ci-app-<domain>.lock) for its whole process lifetime, the run-start janitor probes
+# that lock to reap only orphans (held lock = live run, never touched), and same-recipe runs
+# serialise on a per-recipe flock for the shared ~/.abra/recipes/<recipe> checkout
+# (lifecycle.acquire_recipe_lock — removed by P3's per-run ABRA_DIR). See docs/concurrency.md.
 kind: pipeline
 type: exec
 name: recipe-ci
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@ -7,8 +7,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer
 from __future__ import annotations

 import contextlib
-import datetime
 import fcntl
+import glob
 import json
 import os
 import re
@ -18,7 +18,7 @@ import subprocess
 import time
 import urllib.request

-from . import abra
+from . import abra, lifetime

 GATEWAY_IP = "143.244.213.108"  # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)
 # A run app domain is "<recipe[:4]>-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the
@ -31,22 +31,22 @@ class TeardownError(RuntimeError):


 # --- Concurrent-run safety (capacity=2) -------------------------------------------------------
-# Two cooperating mechanisms, both process-lifetime-scoped so SIGKILL can't leak a stale lock:
-#  1. Per-recipe flock: ~/.abra/recipes/<recipe> is ONE shared working tree that fetch_recipe
-#     rm-rf's/reclones and the upgrade tier git-checkouts mid-run. Concurrent runs of the SAME
-#     recipe would corrupt each other's deploy tree (observed: immich builds 229/230 deployed a
-#     tree missing its config), so they serialise on an exclusive flock; different recipes run in
-#     parallel. The kernel drops a flock when the holder dies, however it dies.
-#  2. Active-run registry: each run registers its app domain + pid before creating the app, so the
-#     janitor can tell a live concurrent run from a crashed run's orphan (see janitor()).
+# ONE mechanism, process-lifetime-scoped so SIGKILL can't leak a stale claim: every run holds an
+# exclusive kernel flock on its app DOMAIN (/run/lock/cc-ci-app-<domain>.lock) for the whole run.
+# A held lock implies a live owner — the kernel releases a flock when the holding process dies,
+# however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held →
+# leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never
+# steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each
+# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR)
+# serialise on this app lock. See docs/concurrency.md.
 RECIPE_LOCK_DIR = "/run/lock"
-ACTIVE_RUN_DIR = "/run/cc-ci-active"


 def acquire_recipe_lock(recipe: str):
-    """Take the per-recipe exclusive lock; blocks (with a log line) if another run of the same
-    recipe is in flight. Returns the open lock file — the CALLER must keep a reference for the
-    whole run; the lock is released only when the process exits and the fd closes."""
+    """Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes
+    checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and
+    with it this lock, structurally unnecessary); until then the caller keeps the returned file
+    alive for the whole run and release is implicit at process exit."""
    path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock")
    # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
    f = open(path, "w")  # noqa: SIM115 — deliberately held for the lifetime of the run
@ -63,39 +63,55 @@ def acquire_recipe_lock(recipe: str):
    return f


-def _registry_path(domain: str) -> str:
-    return os.path.join(ACTIVE_RUN_DIR, domain)
+# Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the
+# caller drops the returned file object, GC would close the fd and silently release the lock —
+# this list is the lock's owner of record. Never cleared; release is process exit.
+_held_app_locks: list = []


-def register_run_app(domain: str) -> None:
-    """Record this process as the live owner of a run app (called BEFORE the app is created, so a
-    concurrent run's janitor can never observe the app without its registration)."""
-    with contextlib.suppress(OSError):
-        os.makedirs(ACTIVE_RUN_DIR, exist_ok=True)
-        with open(_registry_path(domain), "w") as f:
-            f.write(str(os.getpid()))
+def _app_lock_dir() -> str:
+    """The app-domain lockfile dir. /run/lock (tmpfs: a reboot clears locks AND lockfiles, so
+    post-reboot apps probe as orphans and are reaped immediately). Env-overridable so the
+    tests/concurrency suite (and its helper subprocesses) can use a sandbox dir."""
+    return os.environ.get("CCCI_APP_LOCK_DIR", "/run/lock")


-def unregister_run_app(domain: str) -> None:
-    with contextlib.suppress(OSError):
-        os.remove(_registry_path(domain))
+def _app_lock_path(domain: str) -> str:
+    return os.path.join(_app_lock_dir(), f"cc-ci-app-{domain}.lock")


-def _run_owner_state(domain: str) -> str:
-    """'alive' if the registered owner is a live run_recipe_ci process, 'dead' if registered but
-    gone (definite orphan), 'unknown' if never registered (pre-registry code or post-reboot)."""
-    try:
-        with open(_registry_path(domain)) as f:
-            pid = int(f.read().strip())
-    except (OSError, ValueError):
-        return "unknown"
-    try:
-        with open(f"/proc/{pid}/cmdline", "rb") as f:
-            cmdline = f.read().decode(errors="replace").replace("\0", " ")
-    except OSError:
-        return "dead"
-    # Guard against pid reuse: the owner must still look like a harness run.
-    return "alive" if "run_recipe_ci" in cmdline else "dead"
+def acquire_app_lock(domain: str):
+    """Take the per-app-domain exclusive lock; blocks (with a log line) if another run of the
+    same domain is in flight (double-!testme serialisation). Returns the open lock file, which is
+    ALSO retained in _held_app_locks so the flock lives exactly as long as the process.
+
+    Unlink/recreate race guard: the janitor unlinks a reaped orphan's lockfile while holding its
+    flock, so a waiter blocked on the OLD inode can win a lock no later opener can observe (a new
+    open() at the path creates a FRESH inode). After every acquisition, verify the locked fd is
+    still the file at the path (st_ino match); if not, drop it and retry on the live path."""
+    path = _app_lock_path(domain)
+    waited = False
+    while True:
+        # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
+        f = open(path, "a")  # noqa: SIM115 — deliberately held for the rest of the process
+        try:
+            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except BlockingIOError:
+            if not waited:
+                print(f"== app lock: another run of {domain} is in flight — waiting ==", flush=True)
+                waited = True
+            fcntl.flock(f, fcntl.LOCK_EX)
+        try:
+            if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino:
+                break  # we hold the lock on the inode the path names — done
+        except FileNotFoundError:
+            pass
+        f.close()  # locked a stale (unlinked) inode — retry on the live path
+    os.utime(f.fileno())  # mtime = acquisition time = lock age (janitor's long-held flag)
+    _held_app_locks.append(f)
+    if waited:
+        print(f"== app lock: acquired {path} ==", flush=True)
+    return f


 def _docker_names(kind: str, stack: str) -> list[str]:
@ -117,31 +133,6 @@ def _residual(domain: str) -> dict:
    }


-def _stack_age_seconds(stack: str) -> float | None:
-    """Age of the stack's oldest service, or None if not present."""
-    svcs = _docker_names("service", stack)
-    if not svcs:
-        return None
-    oldest = None
-    for s in svcs:
-        p = subprocess.run(
-            ["docker", "service", "inspect", s, "--format", "{{.CreatedAt}}"],
-            capture_output=True,
-            text=True,
-        )
-        ts = p.stdout.strip()
-        try:
-            # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> take the leading 19 chars
-            dt = datetime.datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace(
-                tzinfo=datetime.UTC
-            )
-        except ValueError:
-            continue
-        age = (datetime.datetime.now(datetime.UTC) - dt).total_seconds()
-        oldest = age if oldest is None else max(oldest, age)
-    return oldest
-
-
 def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]:
    """Per-recipe extra .env keys, applied at every deploy (install + upgrade's old_app) so a recipe
    with multi-domain / config needs is enrolled with NO shared-harness change (D5/M6.5). A recipe
@ -279,9 +270,10 @@ def deploy_app(
    past the 900s default. abra's INTERNAL TIMEOUT (recipe's TIMEOUT env, default 300s) is set via
    EXTRA_ENV; this is the Python subprocess wrapper's timeout so abra doesn't get SIGKILLed mid-deploy."""
    _record_deploy()
-    # Register BEFORE the app exists: a concurrent run's janitor must never see this app without
-    # its live-owner registration (it would reap an in-flight deploy).
-    register_run_app(domain)
+    # Lock BEFORE the app exists: a concurrent run's janitor must never see this app without a
+    # held app lock (it would probe it as an orphan and reap an in-flight deploy). Also the
+    # double-!testme serialisation point: a second run of the same domain blocks here.
+    acquire_app_lock(domain)
    abra.app_config_remove(domain)  # clear any stale .env from a prior crashed run
    abra.app_new(recipe, domain, version=version, secrets=secrets)
    # A pinned version must actually deploy that version: check the recipe out to the tag so the
@ -720,23 +712,84 @@ def teardown_app(domain: str, verify: bool = True) -> None:
        residual = _residual(domain)
        if any(residual.values()):
            raise TeardownError(f"teardown left residual for {domain}: {residual}")
-    # The app is gone — drop its active-run registration (janitor() also clears it when reaping).
-    unregister_run_app(domain)
+    # No unregistration step: the app lock releases implicitly at process exit. The clean run's
+    # leftover lockfile (unheld) is unlinked on sight by the next janitor's stale-lockfile sweep.


-def janitor(max_age_seconds: int | None = None) -> None:
-    """Reap orphaned run apps from crashed/rebooted runs. Matches the real naming scheme. Safe under
-    CONCURRENT runs (capacity=2): every harness run registers its app in the active-run registry
-    (register_run_app), so the janitor distinguishes the three cases instead of using age alone:
-      - registered + owner run_recipe_ci process ALIVE  -> in-flight concurrent run: never reap
-      - registered + owner DEAD (crashed/SIGKILLed run) -> definite orphan: reap immediately
-      - no registry entry (pre-registry code, reboot)   -> fall back to the age threshold
-    Reaps via docker primitives so it works even when the .env is gone (A2/A3). Age fallback default
-    2h, env-overridable via CCCI_JANITOR_MAX_AGE."""
-    import os
+# A lock held longer than 2x the 60-min hard deadline can only be a leaked run (the deadline
+# bounds every healthy run). Flag it for a human — NEVER steal a held lock.
+LONG_HELD_LOCK_SECONDS = 2 * lifetime.HARD_DEADLINE_SECONDS

-    if max_age_seconds is None:
-        max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200"))
+
+def _probe_and_reap(domain: str) -> None:
+    """Probe one run app's lock; reap iff nobody holds it (kernel-guaranteed orphan).
+
+    Reaping happens WHILE HOLDING the probe lock, closing the janitor-vs-new-run race: a new run
+    of the same domain blocks in acquire_app_lock until the reap finishes, so a fresh app never
+    coexists with a half-reaped one. The lockfile is unlinked before release (still holding the
+    lock); a waiter that blocked on the unlinked inode re-checks identity and retries. Two racing
+    janitors arbitrate on the same flock: one reaps, the other sees 'held' and leaves —
+    teardown_app(verify=False) is idempotent either way."""
+    path = _app_lock_path(domain)
+    try:
+        # PEP 446: non-inheritable fd, same as acquire_app_lock.
+        f = open(path, "a")  # noqa: SIM115 — closed in the finally below, lock released with it
+    except OSError as e:
+        print(f"!! janitor: cannot open lockfile {path} ({e}) — skipping {domain}", flush=True)
+        return
+    try:
+        try:
+            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except BlockingIOError:
+            # Held -> live run. Never steal; flag if it has been held implausibly long.
+            try:
+                held_for = time.time() - os.stat(path).st_mtime
+            except OSError:
+                held_for = 0
+            if held_for > LONG_HELD_LOCK_SECONDS:
+                print(
+                    f"!! lock for {domain} held >{LONG_HELD_LOCK_SECONDS // 60}min — possible "
+                    "leaked run; inspect with lslocks",
+                    flush=True,
+                )
+            else:
+                print(
+                    f"  janitor: {domain} lock held — live concurrent run, leaving it", flush=True
+                )
+            return
+        # Acquired — but only the inode the PATH names counts (another janitor may have reaped
+        # and unlinked this inode while we raced; a lock on an unlinked inode protects nothing
+        # and unlinking the path now would delete a NEWER run's lockfile).
+        try:
+            if os.fstat(f.fileno()).st_ino != os.stat(path).st_ino:
+                return
+        except FileNotFoundError:
+            return
+        # Orphan: no live owner (the kernel released the lock when the owner died). Reap while
+        # holding the probe lock, then unlink the lockfile before releasing.
+        print(f"  janitor: {domain} lock acquirable — orphan, reaping", flush=True)
+        with contextlib.suppress(Exception):
+            teardown_app(domain, verify=False)
+        with contextlib.suppress(OSError):
+            os.unlink(path)
+    finally:
+        f.close()
+
+
+def janitor() -> None:
+    """Reap orphaned run apps from crashed/rebooted runs; the kernel flock is the only liveness
+    oracle. For every candidate run app, probe its app-domain lock (LOCK_NB):
+
+      acquirable -> nobody holds it -> orphan -> reap under the probe lock + unlink lockfile
+      held       -> live concurrent run -> leave it (warn if held >2x the hard deadline)
+
+    Candidate discovery is unchanged: `abra app ls` + a docker-service sweep (catches stacks
+    whose .env is already gone), both matched against RUN_APP_RE — warm/canonical apps never
+    match and are never probed. Post-reboot, /run/lock (tmpfs) is empty, so every surviving app
+    probes as an orphan and is reaped immediately (no age threshold). Stale lockfiles with no
+    app behind them are unlinked on sight. Degrades safely: an unreadable lockfile/dir is
+    skipped with a log line, never a crash. Reaps via docker primitives so it works even when
+    the .env is gone (A2/A3)."""
    seen = set()
    for app in abra.app_ls():
        name = app.get("appName") or app.get("domain") or ""
@ -750,18 +803,22 @@ def janitor(max_age_seconds: int | None = None) -> None:
            seen.add(f"{m.group(1)}.ci.commoninternet.net")

    for name in seen:
-        owner = _run_owner_state(name)
-        if owner == "alive":
-            print(f"  janitor: {name} is a live concurrent run — leaving it", flush=True)
-            continue
-        if owner == "unknown":
-            # No registry entry (manual run on pre-registry code, or post-reboot): only the age
-            # threshold protects it, as before.
-            stack = _stack_name(name)
-            age = _stack_age_seconds(stack)
-            if age is not None and age < max_age_seconds:
-                continue  # young and of unknown provenance; leave it
-        # owner == "dead" (a crashed/killed run's definite orphan) or old enough -> reap
-        with contextlib.suppress(Exception):
-            teardown_app(name, verify=False)
-        unregister_run_app(name)
+        _probe_and_reap(name)
+
+    # Tidy /run/lock: a clean run's leftover lockfile is unheld and appless — unlink it (under
+    # its own probe lock, with the same identity check as above).
+    with contextlib.suppress(OSError):
+        for path in glob.glob(os.path.join(_app_lock_dir(), "cc-ci-app-*.lock")):
+            domain = os.path.basename(path)[len("cc-ci-app-") : -len(".lock")]
+            if domain in seen:
+                continue  # handled (or deliberately left) above
+            with contextlib.suppress(OSError):
+                f = open(path, "a")  # noqa: SIM115 — closed below, lock released with it
+                try:
+                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino:
+                        os.unlink(path)
+                except (BlockingIOError, FileNotFoundError):
+                    pass  # held (live run pre-deploy) or already gone — leave it
+                finally:
+                    f.close()