From b492f995bdfc4be2248938d8b74ed0da15147fb1 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@git.autonomic.zone>
Date: Wed, 10 Jun 2026 04:04:28 +0000
Subject: [PATCH 1/6] =?UTF-8?q?feat(harness):=20P1=20lock-lifetime=20harde?=
 =?UTF-8?q?ning=20=E2=80=94=20PDEATHSIG=20+=20SIGTERM/SIGALRM=20teardown?=
 =?UTF-8?q?=20funnel=20+=2060-min=20hard=20deadline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- new harness/lifetime.py: install_lifetime_guards() arms PR_SET_PDEATHSIG(SIGTERM) (with
  post-prctl ppid==1 orphan refusal), a SIGTERM handler raising SystemExit through the run's
  finally: teardown funnel (exit 143), and signal.alarm(3600) funnelling SIGALRM the same way
  with a distinct deadline log line (exit 142). Re-entrant signals during teardown are logged
  and ignored (begin_teardown guard) so a second signal can't abort the running cleanup.
- run_recipe_ci.main(): guards installed first thing, before any abra call/lock; both teardown
  finally: blocks (cold + quick) mark begin_teardown().
- .drone.yml recipe-ci step: harness runs under setsid in its own process group; a trap forwards
  the step shell's TERM/EXIT to the whole group so drone cancel reaches the harness instead of
  leaking it (docs/concurrency.md §8.1).
- PEP 446 note on the recipe-lock open(): the fd is non-inheritable, children never carry it.
---
 .drone.yml                  | 11 ++++-
 runner/harness/lifecycle.py |  1 +
 runner/harness/lifetime.py  | 95 +++++++++++++++++++++++++++++++++++++
 runner/run_recipe_ci.py     |  9 ++++
 4 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 runner/harness/lifetime.py

diff --git a/.drone.yml b/.drone.yml
index 5c4c88b..d46adcc 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -70,4 +70,13 @@ steps:
       # build's custom params. CCCI_QUICK=1 makes run_recipe_ci take the opt-in fast lane (WC7);
       # absent => full cold (default). run_quick ignores STAGES (always upgrade+custom).
       - 'echo "recipe-ci: RECIPE=$RECIPE REF=$REF PR=$PR SRC=$SRC stages=$STAGES quick=${CCCI_QUICK:-0}"'
-      - cc-ci-run runner/run_recipe_ci.py
+      # P1 lock-lifetime hardening: run the harness in its own session/process group (setsid) and
+      # forward a drone cancel (TERM to this step shell) to the WHOLE group, so the harness's
+      # SIGTERM handler runs its teardown funnel instead of being leaked (the exec runner kills
+      # only the step shell, not the tree). PDEATHSIG inside the harness backstops the case where
+      # this shell dies without the trap firing. `wait` propagates the harness exit code.
+      - |
+        setsid cc-ci-run runner/run_recipe_ci.py &
+        PID=$!
+        trap 'kill -TERM -- "-$PID" 2>/dev/null' TERM EXIT
+        wait "$PID"
diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index 5c44c81..2346948 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -48,6 +48,7 @@ def acquire_recipe_lock(recipe: str):
     recipe is in flight. Returns the open lock file — the CALLER must keep a reference for the
     whole run; the lock is released only when the process exits and the fd closes."""
     path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock")
+    # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
     f = open(path, "w")  # noqa: SIM115 — deliberately held for the lifetime of the run
     try:
         fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
diff --git a/runner/harness/lifetime.py b/runner/harness/lifetime.py
new file mode 100644
index 0000000..4ad7f60
--- /dev/null
+++ b/runner/harness/lifetime.py
@@ -0,0 +1,95 @@
+"""Run-lifetime hardening (concurrency restructure P1).
+
+The concurrency model's invariant chain is:
+
+    lock lifetime ⊆ harness process lifetime ⊆ drone step lifetime ⊆ 60-min hard deadline
+
+Locks are kernel flocks released on process exit, so the only thing that needs managing is the
+PROCESS lifetime. Three guards, installed at run startup (before any abra call) by
+`install_lifetime_guards()`:
+
+  1. `PR_SET_PDEATHSIG(SIGTERM)`: if the parent (the drone step shell) dies — cancel, runner
+     crash, host shutdown of the step — the kernel delivers SIGTERM to the harness, so a dead
+     build can never leak a running harness that holds locks. Paired with a ppid==1 re-check
+     AFTER the prctl: a parent that died BEFORE the prctl took effect would never trigger the
+     death signal, so a harness that finds itself already reparented refuses to run.
+  2. SIGTERM handler: raise SystemExit so the run's `finally:` teardown funnel executes and the
+     process exits non-zero. Re-entrant deliveries during teardown are logged and IGNORED so a
+     second signal can't abort the cleanup the first one asked for (`begin_teardown()` guards
+     this; the run's own `finally:` blocks also call it so a signal landing mid-normal-teardown
+     can't abort that either).
+  3. `signal.alarm(3600)`: self-imposed hard deadline. SIGALRM funnels into the same teardown
+     path with a distinct log line. Teardown time after the deadline is not alarm-bounded —
+     interrupting a teardown buys nothing; the janitor (flock probe) is the backstop if a
+     teardown wedges and the process is killed harder.
+"""
+
+from __future__ import annotations
+
+import ctypes
+import os
+import signal
+import sys
+
+HARD_DEADLINE_SECONDS = 60 * 60
+
+_PR_SET_PDEATHSIG = 1  # linux/prctl.h
+
+_state = {"tearing_down": False}
+
+
+def begin_teardown() -> None:
+    """Mark the teardown funnel as running. From here on SIGTERM/SIGALRM must NOT raise — it
+    would abort the very cleanup it asks for — so the handlers log and return instead. Called by
+    the handlers themselves before raising, and at the top of the run's `finally:` blocks."""
+    _state["tearing_down"] = True
+
+
+def _funnel_handler(log_line: str, exit_code: int):
+    """A signal handler that routes into the teardown funnel exactly once: log, then raise
+    SystemExit (propagates through the run's try/finally → teardown executes → non-zero exit).
+    While teardown is already running, further signals are logged and swallowed."""
+
+    def handler(signum: int, frame) -> None:  # noqa: ARG001
+        print(log_line, flush=True)
+        if _state["tearing_down"]:
+            print(
+                f"== signal {signum} during teardown — ignored (teardown continues, "
+                "exit stays non-zero) ==",
+                flush=True,
+            )
+            return
+        begin_teardown()
+        raise SystemExit(exit_code)
+
+    return handler
+
+
+def install_lifetime_guards(deadline_seconds: int = HARD_DEADLINE_SECONDS) -> None:
+    """Install all three lifetime guards (see module docstring). Must run at harness startup,
+    before any abra call and before any lock is taken."""
+    libc = ctypes.CDLL("libc.so.6", use_errno=True)
+    if libc.prctl(_PR_SET_PDEATHSIG, signal.SIGTERM, 0, 0, 0) != 0:
+        err = ctypes.get_errno()
+        raise OSError(err, f"prctl(PR_SET_PDEATHSIG, SIGTERM) failed: {os.strerror(err)}")
+    # The prctl is armed now — but only fires for a parent death AFTER this point. If the parent
+    # already died, we are reparented (ppid 1) and would never get the signal: refuse to run, an
+    # orphaned harness would hold locks/apps with nothing managing its lifetime.
+    if os.getppid() == 1:
+        sys.exit("parent died before prctl(PR_SET_PDEATHSIG) — refusing to run orphaned")
+    signal.signal(
+        signal.SIGTERM,
+        _funnel_handler(
+            "== SIGTERM received (drone cancel / parent death) — tearing down ==",
+            128 + signal.SIGTERM,
+        ),
+    )
+    minutes = deadline_seconds // 60
+    signal.signal(
+        signal.SIGALRM,
+        _funnel_handler(
+            f"== run exceeded {minutes}-minute hard deadline — tearing down ==",
+            128 + signal.SIGALRM,
+        ),
+    )
+    signal.alarm(deadline_seconds)
diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py
index c7e55da..5884e63 100644
--- a/runner/run_recipe_ci.py
+++ b/runner/run_recipe_ci.py
@@ -47,6 +47,7 @@ from harness import (  # noqa: E402
     discovery,
     generic,
     lifecycle,
+    lifetime,
     naming,
     warm,
     warmsnap,
@@ -658,6 +659,8 @@ def run_quick(
             results["upgrade"] = "fail"
             results["custom"] = "skip"
     finally:
+        # Teardown funnel running: further SIGTERM/SIGALRM are logged + ignored (lifetime.py).
+        lifetime.begin_teardown()
         # F2-11 skip count (read before deciding pass/fail)
         requires_deps_skipped = 0
         try:
@@ -821,6 +824,9 @@ def promote_canonical(recipe: str, head_ref: str | None) -> None:
 
 
 def main() -> int:
+    # P1 lock-lifetime hardening: PDEATHSIG + SIGTERM/SIGALRM teardown funnel + 60-min hard
+    # deadline, armed before ANY abra call or lock acquisition (see harness/lifetime.py).
+    lifetime.install_lifetime_guards()
     recipe = os.environ.get("RECIPE")
     if not recipe:
         print("RECIPE env is required", file=sys.stderr)
@@ -1123,6 +1129,9 @@ def main() -> int:
                 if op in stages:
                     results[op] = "skip"
     finally:
+        # From here the teardown funnel runs: a SIGTERM/SIGALRM landing now is logged + ignored
+        # (lifetime.py) so a second signal can't abort the cleanup the first one asked for.
+        lifetime.begin_teardown()
         # Teardown the recipe under test FIRST, then deps in reverse declaration order.
         # Parent verify=False (Phase 1d): keep as-is so a parent residual doesn't mask a tier
         # failure. Dep teardown uses verify=True via teardown_deps (F2-5 fix); failures are

From b302f3ab635c82dba9f2e60f3a6e964921ea07e4 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@git.autonomic.zone>
Date: Wed, 10 Jun 2026 04:11:31 +0000
Subject: [PATCH 2/6] =?UTF-8?q?feat(harness):=20P2=20flock-probe=20janitor?=
 =?UTF-8?q?=20=E2=80=94=20the=20kernel=20flock=20IS=20the=20liveness=20ora?=
 =?UTF-8?q?cle?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- acquire_app_lock(domain): exclusive flock on /run/lock/cc-ci-app-<domain>.lock, taken in
  deploy_app exactly where register_run_app was (BEFORE app creation); blocks with a log line
  when another run of the same domain is in flight (double-!testme serialisation). The file
  object is retained in module-level _held_app_locks so GC can never close the fd and silently
  release the lock. mtime is touched at acquisition (lock age for the long-held flag).
- janitor(): probes each candidate's lock (discovery unchanged: abra app ls + docker-service
  sweep vs RUN_APP_RE). Acquirable -> orphan -> teardown_app(verify=False) WHILE HOLDING the
  probe lock (a new same-domain run blocks until the reap finishes), then unlink before release.
  Held -> live run -> leave it; held >120min (2x hard deadline) -> warn, never steal. Stale
  unheld lockfiles with no app are unlinked on sight. Unreadable lockfile -> skip + log.
- unlink/recreate race guard (both sides): after ANY acquisition, verify the locked fd still is
  the inode the path names (fstat vs stat); a waiter that won a just-unlinked inode retries on
  the live path, and a probe that won one skips (unlinking now would hit a newer run's file).
- deleted: register_run_app, unregister_run_app, _run_owner_state, _registry_path,
  ACTIVE_RUN_DIR, CCCI_JANITOR_MAX_AGE + age fallback, _stack_age_seconds, pid-reuse guard.
  teardown_app no longer unregisters (release is process exit). janitor() takes no args now.
- post-reboot: /run/lock is tmpfs -> lockfiles gone -> probe trivially acquires -> immediate
  reap (improvement over the old 2h age fallback).
---
 .drone.yml                  |   9 +-
 runner/harness/lifecycle.py | 253 ++++++++++++++++++++++--------------
 2 files changed, 160 insertions(+), 102 deletions(-)

diff --git a/.drone.yml b/.drone.yml
index d46adcc..d356171 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -37,10 +37,11 @@ steps:
 #
 # Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix) +
 # concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by
-# the harness, not by serialisation: same-recipe runs serialise on a per-recipe flock
-# (lifecycle.acquire_recipe_lock — the shared ~/.abra/recipes/<recipe> checkout is the conflict),
-# and every run registers its app domain + pid in /run/cc-ci-active so the run-start janitor only
-# reaps orphans whose owning run is DEAD (alive → never touched; unknown → age fallback, default 2h).
+# the harness, not by serialisation: every run holds an exclusive flock on its app domain
+# (/run/lock/cc-ci-app-<domain>.lock) for its whole process lifetime, the run-start janitor probes
+# that lock to reap only orphans (held lock = live run, never touched), and same-recipe runs
+# serialise on a per-recipe flock for the shared ~/.abra/recipes/<recipe> checkout
+# (lifecycle.acquire_recipe_lock — removed by P3's per-run ABRA_DIR). See docs/concurrency.md.
 kind: pipeline
 type: exec
 name: recipe-ci
diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index 2346948..210a011 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -7,8 +7,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer
 from __future__ import annotations
 
 import contextlib
-import datetime
 import fcntl
+import glob
 import json
 import os
 import re
@@ -18,7 +18,7 @@ import subprocess
 import time
 import urllib.request
 
-from . import abra
+from . import abra, lifetime
 
 GATEWAY_IP = "143.244.213.108"  # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)
 # A run app domain is "<recipe[:4]>-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the
@@ -31,22 +31,22 @@ class TeardownError(RuntimeError):
 
 
 # --- Concurrent-run safety (capacity=2) -------------------------------------------------------
-# Two cooperating mechanisms, both process-lifetime-scoped so SIGKILL can't leak a stale lock:
-#  1. Per-recipe flock: ~/.abra/recipes/<recipe> is ONE shared working tree that fetch_recipe
-#     rm-rf's/reclones and the upgrade tier git-checkouts mid-run. Concurrent runs of the SAME
-#     recipe would corrupt each other's deploy tree (observed: immich builds 229/230 deployed a
-#     tree missing its config), so they serialise on an exclusive flock; different recipes run in
-#     parallel. The kernel drops a flock when the holder dies, however it dies.
-#  2. Active-run registry: each run registers its app domain + pid before creating the app, so the
-#     janitor can tell a live concurrent run from a crashed run's orphan (see janitor()).
+# ONE mechanism, process-lifetime-scoped so SIGKILL can't leak a stale claim: every run holds an
+# exclusive kernel flock on its app DOMAIN (/run/lock/cc-ci-app-<domain>.lock) for the whole run.
+# A held lock implies a live owner — the kernel releases a flock when the holding process dies,
+# however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held →
+# leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never
+# steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each
+# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR)
+# serialise on this app lock. See docs/concurrency.md.
 RECIPE_LOCK_DIR = "/run/lock"
-ACTIVE_RUN_DIR = "/run/cc-ci-active"
 
 
 def acquire_recipe_lock(recipe: str):
-    """Take the per-recipe exclusive lock; blocks (with a log line) if another run of the same
-    recipe is in flight. Returns the open lock file — the CALLER must keep a reference for the
-    whole run; the lock is released only when the process exits and the fd closes."""
+    """Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes
+    checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and
+    with it this lock, structurally unnecessary); until then the caller keeps the returned file
+    alive for the whole run and release is implicit at process exit."""
     path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock")
     # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
     f = open(path, "w")  # noqa: SIM115 — deliberately held for the lifetime of the run
@@ -63,39 +63,55 @@ def acquire_recipe_lock(recipe: str):
     return f
 
 
-def _registry_path(domain: str) -> str:
-    return os.path.join(ACTIVE_RUN_DIR, domain)
+# Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the
+# caller drops the returned file object, GC would close the fd and silently release the lock —
+# this list is the lock's owner of record. Never cleared; release is process exit.
+_held_app_locks: list = []
 
 
-def register_run_app(domain: str) -> None:
-    """Record this process as the live owner of a run app (called BEFORE the app is created, so a
-    concurrent run's janitor can never observe the app without its registration)."""
-    with contextlib.suppress(OSError):
-        os.makedirs(ACTIVE_RUN_DIR, exist_ok=True)
-        with open(_registry_path(domain), "w") as f:
-            f.write(str(os.getpid()))
+def _app_lock_dir() -> str:
+    """The app-domain lockfile dir. /run/lock (tmpfs: a reboot clears locks AND lockfiles, so
+    post-reboot apps probe as orphans and are reaped immediately). Env-overridable so the
+    tests/concurrency suite (and its helper subprocesses) can use a sandbox dir."""
+    return os.environ.get("CCCI_APP_LOCK_DIR", "/run/lock")
 
 
-def unregister_run_app(domain: str) -> None:
-    with contextlib.suppress(OSError):
-        os.remove(_registry_path(domain))
+def _app_lock_path(domain: str) -> str:
+    return os.path.join(_app_lock_dir(), f"cc-ci-app-{domain}.lock")
 
 
-def _run_owner_state(domain: str) -> str:
-    """'alive' if the registered owner is a live run_recipe_ci process, 'dead' if registered but
-    gone (definite orphan), 'unknown' if never registered (pre-registry code or post-reboot)."""
-    try:
-        with open(_registry_path(domain)) as f:
-            pid = int(f.read().strip())
-    except (OSError, ValueError):
-        return "unknown"
-    try:
-        with open(f"/proc/{pid}/cmdline", "rb") as f:
-            cmdline = f.read().decode(errors="replace").replace("\0", " ")
-    except OSError:
-        return "dead"
-    # Guard against pid reuse: the owner must still look like a harness run.
-    return "alive" if "run_recipe_ci" in cmdline else "dead"
+def acquire_app_lock(domain: str):
+    """Take the per-app-domain exclusive lock; blocks (with a log line) if another run of the
+    same domain is in flight (double-!testme serialisation). Returns the open lock file, which is
+    ALSO retained in _held_app_locks so the flock lives exactly as long as the process.
+
+    Unlink/recreate race guard: the janitor unlinks a reaped orphan's lockfile while holding its
+    flock, so a waiter blocked on the OLD inode can win a lock no later opener can observe (a new
+    open() at the path creates a FRESH inode). After every acquisition, verify the locked fd is
+    still the file at the path (st_ino match); if not, drop it and retry on the live path."""
+    path = _app_lock_path(domain)
+    waited = False
+    while True:
+        # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
+        f = open(path, "a")  # noqa: SIM115 — deliberately held for the rest of the process
+        try:
+            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except BlockingIOError:
+            if not waited:
+                print(f"== app lock: another run of {domain} is in flight — waiting ==", flush=True)
+                waited = True
+            fcntl.flock(f, fcntl.LOCK_EX)
+        try:
+            if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino:
+                break  # we hold the lock on the inode the path names — done
+        except FileNotFoundError:
+            pass
+        f.close()  # locked a stale (unlinked) inode — retry on the live path
+    os.utime(f.fileno())  # mtime = acquisition time = lock age (janitor's long-held flag)
+    _held_app_locks.append(f)
+    if waited:
+        print(f"== app lock: acquired {path} ==", flush=True)
+    return f
 
 
 def _docker_names(kind: str, stack: str) -> list[str]:
@@ -117,31 +133,6 @@ def _residual(domain: str) -> dict:
     }
 
 
-def _stack_age_seconds(stack: str) -> float | None:
-    """Age of the stack's oldest service, or None if not present."""
-    svcs = _docker_names("service", stack)
-    if not svcs:
-        return None
-    oldest = None
-    for s in svcs:
-        p = subprocess.run(
-            ["docker", "service", "inspect", s, "--format", "{{.CreatedAt}}"],
-            capture_output=True,
-            text=True,
-        )
-        ts = p.stdout.strip()
-        try:
-            # docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> take the leading 19 chars
-            dt = datetime.datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace(
-                tzinfo=datetime.UTC
-            )
-        except ValueError:
-            continue
-        age = (datetime.datetime.now(datetime.UTC) - dt).total_seconds()
-        oldest = age if oldest is None else max(oldest, age)
-    return oldest
-
-
 def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]:
     """Per-recipe extra .env keys, applied at every deploy (install + upgrade's old_app) so a recipe
     with multi-domain / config needs is enrolled with NO shared-harness change (D5/M6.5). A recipe
@@ -279,9 +270,10 @@ def deploy_app(
     past the 900s default. abra's INTERNAL TIMEOUT (recipe's TIMEOUT env, default 300s) is set via
     EXTRA_ENV; this is the Python subprocess wrapper's timeout so abra doesn't get SIGKILLed mid-deploy."""
     _record_deploy()
-    # Register BEFORE the app exists: a concurrent run's janitor must never see this app without
-    # its live-owner registration (it would reap an in-flight deploy).
-    register_run_app(domain)
+    # Lock BEFORE the app exists: a concurrent run's janitor must never see this app without a
+    # held app lock (it would probe it as an orphan and reap an in-flight deploy). Also the
+    # double-!testme serialisation point: a second run of the same domain blocks here.
+    acquire_app_lock(domain)
     abra.app_config_remove(domain)  # clear any stale .env from a prior crashed run
     abra.app_new(recipe, domain, version=version, secrets=secrets)
     # A pinned version must actually deploy that version: check the recipe out to the tag so the
@@ -720,23 +712,84 @@ def teardown_app(domain: str, verify: bool = True) -> None:
         residual = _residual(domain)
         if any(residual.values()):
             raise TeardownError(f"teardown left residual for {domain}: {residual}")
-    # The app is gone — drop its active-run registration (janitor() also clears it when reaping).
-    unregister_run_app(domain)
+    # No unregistration step: the app lock releases implicitly at process exit. The clean run's
+    # leftover lockfile (unheld) is unlinked on sight by the next janitor's stale-lockfile sweep.
 
 
-def janitor(max_age_seconds: int | None = None) -> None:
-    """Reap orphaned run apps from crashed/rebooted runs. Matches the real naming scheme. Safe under
-    CONCURRENT runs (capacity=2): every harness run registers its app in the active-run registry
-    (register_run_app), so the janitor distinguishes the three cases instead of using age alone:
-      - registered + owner run_recipe_ci process ALIVE  -> in-flight concurrent run: never reap
-      - registered + owner DEAD (crashed/SIGKILLed run) -> definite orphan: reap immediately
-      - no registry entry (pre-registry code, reboot)   -> fall back to the age threshold
-    Reaps via docker primitives so it works even when the .env is gone (A2/A3). Age fallback default
-    2h, env-overridable via CCCI_JANITOR_MAX_AGE."""
-    import os
+# A lock held longer than 2x the 60-min hard deadline can only be a leaked run (the deadline
+# bounds every healthy run). Flag it for a human — NEVER steal a held lock.
+LONG_HELD_LOCK_SECONDS = 2 * lifetime.HARD_DEADLINE_SECONDS
 
-    if max_age_seconds is None:
-        max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200"))
+
+def _probe_and_reap(domain: str) -> None:
+    """Probe one run app's lock; reap iff nobody holds it (kernel-guaranteed orphan).
+
+    Reaping happens WHILE HOLDING the probe lock, closing the janitor-vs-new-run race: a new run
+    of the same domain blocks in acquire_app_lock until the reap finishes, so a fresh app never
+    coexists with a half-reaped one. The lockfile is unlinked before release (still holding the
+    lock); a waiter that blocked on the unlinked inode re-checks identity and retries. Two racing
+    janitors arbitrate on the same flock: one reaps, the other sees 'held' and leaves —
+    teardown_app(verify=False) is idempotent either way."""
+    path = _app_lock_path(domain)
+    try:
+        # PEP 446: non-inheritable fd, same as acquire_app_lock.
+        f = open(path, "a")  # noqa: SIM115 — closed in the finally below, lock released with it
+    except OSError as e:
+        print(f"!! janitor: cannot open lockfile {path} ({e}) — skipping {domain}", flush=True)
+        return
+    try:
+        try:
+            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except BlockingIOError:
+            # Held -> live run. Never steal; flag if it has been held implausibly long.
+            try:
+                held_for = time.time() - os.stat(path).st_mtime
+            except OSError:
+                held_for = 0
+            if held_for > LONG_HELD_LOCK_SECONDS:
+                print(
+                    f"!! lock for {domain} held >{LONG_HELD_LOCK_SECONDS // 60}min — possible "
+                    "leaked run; inspect with lslocks",
+                    flush=True,
+                )
+            else:
+                print(
+                    f"  janitor: {domain} lock held — live concurrent run, leaving it", flush=True
+                )
+            return
+        # Acquired — but only the inode the PATH names counts (another janitor may have reaped
+        # and unlinked this inode while we raced; a lock on an unlinked inode protects nothing
+        # and unlinking the path now would delete a NEWER run's lockfile).
+        try:
+            if os.fstat(f.fileno()).st_ino != os.stat(path).st_ino:
+                return
+        except FileNotFoundError:
+            return
+        # Orphan: no live owner (the kernel released the lock when the owner died). Reap while
+        # holding the probe lock, then unlink the lockfile before releasing.
+        print(f"  janitor: {domain} lock acquirable — orphan, reaping", flush=True)
+        with contextlib.suppress(Exception):
+            teardown_app(domain, verify=False)
+        with contextlib.suppress(OSError):
+            os.unlink(path)
+    finally:
+        f.close()
+
+
+def janitor() -> None:
+    """Reap orphaned run apps from crashed/rebooted runs; the kernel flock is the only liveness
+    oracle. For every candidate run app, probe its app-domain lock (LOCK_NB):
+
+      acquirable -> nobody holds it -> orphan -> reap under the probe lock + unlink lockfile
+      held       -> live concurrent run -> leave it (warn if held >2x the hard deadline)
+
+    Candidate discovery is unchanged: `abra app ls` + a docker-service sweep (catches stacks
+    whose .env is already gone), both matched against RUN_APP_RE — warm/canonical apps never
+    match and are never probed. Post-reboot, /run/lock (tmpfs) is empty, so every surviving app
+    probes as an orphan and is reaped immediately (no age threshold). Stale lockfiles with no
+    app behind them are unlinked on sight. Degrades safely: an unreadable lockfile/dir is
+    skipped with a log line, never a crash. Reaps via docker primitives so it works even when
+    the .env is gone (A2/A3)."""
     seen = set()
     for app in abra.app_ls():
         name = app.get("appName") or app.get("domain") or ""
@@ -750,18 +803,22 @@ def janitor(max_age_seconds: int | None = None) -> None:
             seen.add(f"{m.group(1)}.ci.commoninternet.net")
 
     for name in seen:
-        owner = _run_owner_state(name)
-        if owner == "alive":
-            print(f"  janitor: {name} is a live concurrent run — leaving it", flush=True)
-            continue
-        if owner == "unknown":
-            # No registry entry (manual run on pre-registry code, or post-reboot): only the age
-            # threshold protects it, as before.
-            stack = _stack_name(name)
-            age = _stack_age_seconds(stack)
-            if age is not None and age < max_age_seconds:
-                continue  # young and of unknown provenance; leave it
-        # owner == "dead" (a crashed/killed run's definite orphan) or old enough -> reap
-        with contextlib.suppress(Exception):
-            teardown_app(name, verify=False)
-        unregister_run_app(name)
+        _probe_and_reap(name)
+
+    # Tidy /run/lock: a clean run's leftover lockfile is unheld and appless — unlink it (under
+    # its own probe lock, with the same identity check as above).
+    with contextlib.suppress(OSError):
+        for path in glob.glob(os.path.join(_app_lock_dir(), "cc-ci-app-*.lock")):
+            domain = os.path.basename(path)[len("cc-ci-app-") : -len(".lock")]
+            if domain in seen:
+                continue  # handled (or deliberately left) above
+            with contextlib.suppress(OSError):
+                f = open(path, "a")  # noqa: SIM115 — closed below, lock released with it
+                try:
+                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino:
+                        os.unlink(path)
+                except (BlockingIOError, FileNotFoundError):
+                    pass  # held (live run pre-deploy) or already gone — leave it
+                finally:
+                    f.close()

From 17ebdf39acf19448c8fb8b7d15b356fb9ef06610 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@git.autonomic.zone>
Date: Wed, 10 Jun 2026 04:18:33 +0000
Subject: [PATCH 3/6] =?UTF-8?q?feat(harness):=20P3=20per-run=20ABRA=5FDIR?=
 =?UTF-8?q?=20=E2=80=94=20structural=20recipe-tree=20isolation,=20recipe?=
 =?UTF-8?q?=20flock=20deleted?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- run_recipe_ci.setup_run_abra_dir(): builds <runs_dir>/<run-id>/abra with servers/ and
  catalogue/ symlinked to the canonical ~/.abra (app .env files keep landing in the shared
  canonical path, so janitor discovery and env-based teardown are unchanged; per-domain
  filenames + the P2 app-domain lock prevent write conflicts) and a FRESH empty recipes/ —
  each run clones + checkouts its own recipe trees. Exported as $ABRA_DIR (honored by the
  abra CLI, verified on-host) before ANY abra call. Manual runs get manual-<pid> isolation.
- fetch_recipe(): plain clone into $ABRA_DIR/recipes/<recipe> — no shared-tree rm-rf, no lock.
  CCCI_SKIP_FETCH=1 now copies the canonically-staged clone into the per-run tree (same staging
  workflow, run reads staged state).
- abra.abra_dir()/recipe_dir(): single resolution rule ($ABRA_DIR else ~/.abra), used by
  recipe_checkout, has_lightweight_version_tags, recipe_head_commit, recipe_versions,
  generic._recipe_dir, lifecycle.prepull_images, snapshot_recipe_tests, and
  warm_reconcile._recipe_dir (which keeps the canonical default for its own systemd runs but
  follows the per-run tree when imported by promote_canonical inside a run).
- deleted: lifecycle.acquire_recipe_lock, RECIPE_LOCK_DIR, the main() call site and the
  must-lock-before-fetch ordering rule.
- tests/{ghost,discourse}/install_steps.sh: RECIPE_DIR resolves ${ABRA_DIR:-$HOME/.abra} so the
  compose.ccci.yml overlay lands in the tree the run actually deploys from (mechanical path fix
  required by per-run trees; no assertion/gate touched — see DECISIONS.md).
- .drone.yml comments updated (HOME=/root rationale now via the servers symlink).
---
 .drone.yml                       | 14 +++----
 runner/harness/abra.py           | 32 ++++++++------
 runner/harness/generic.py        |  2 +-
 runner/harness/lifecycle.py      | 34 +++------------
 runner/run_recipe_ci.py          | 72 +++++++++++++++++++++++++-------
 runner/warm_reconcile.py         |  8 +++-
 tests/discourse/install_steps.sh |  4 +-
 tests/ghost/install_steps.sh     |  4 +-
 8 files changed, 103 insertions(+), 67 deletions(-)

diff --git a/.drone.yml b/.drone.yml
index d356171..9908c8b 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -39,9 +39,8 @@ steps:
 # concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by
 # the harness, not by serialisation: every run holds an exclusive flock on its app domain
 # (/run/lock/cc-ci-app-<domain>.lock) for its whole process lifetime, the run-start janitor probes
-# that lock to reap only orphans (held lock = live run, never touched), and same-recipe runs
-# serialise on a per-recipe flock for the shared ~/.abra/recipes/<recipe> checkout
-# (lifecycle.acquire_recipe_lock — removed by P3's per-run ABRA_DIR). See docs/concurrency.md.
+# that lock to reap only orphans (held lock = live run, never touched), and recipe working trees
+# are per-run ($ABRA_DIR/recipes — no shared checkout, no recipe lock). See docs/concurrency.md.
 kind: pipeline
 type: exec
 name: recipe-ci
@@ -61,10 +60,11 @@ steps:
   - name: ci
     environment:
       STAGES: install,upgrade,backup,restore,custom
-      # The exec runner points HOME at a per-build workspace; force it to /root so abra finds its
-      # server config + recipes under /root/.abra (as the manual M4/M5 runs did). Safe with
-      # capacity=2: app names are unique per (recipe,pr,ref) and same-recipe runs serialise on the
-      # per-recipe flock, so concurrent builds never touch the same recipe checkout or app.
+      # The exec runner points HOME at a per-build workspace; force it to /root so abra's server
+      # config is found via the per-run ABRA_DIR's servers/ symlink -> /root/.abra/servers.
+      # Recipe trees are PER-RUN ($ABRA_DIR/recipes, exported by run_recipe_ci before any abra
+      # call), so concurrent builds never share a recipe checkout; app .env files are per-domain
+      # in the shared canonical servers/ path, guarded by the app-domain flock.
       HOME: /root
     commands:
       # RECIPE/REF/PR/SRC (+ CCCI_QUICK for `!testme --quick`) are injected as env vars from the
diff --git a/runner/harness/abra.py b/runner/harness/abra.py
index 1c92066..b9d6454 100644
--- a/runner/harness/abra.py
+++ b/runner/harness/abra.py
@@ -10,6 +10,7 @@ Bakes in the known abra gotchas (re-verify per installed abra version, currently
 from __future__ import annotations
 
 import json
+import os
 import subprocess
 
 ABRA = "abra"
@@ -19,6 +20,20 @@ class AbraError(RuntimeError):
     pass
 
 
+def abra_dir() -> str:
+    """abra's state dir, resolved the same way the abra CLI resolves it: $ABRA_DIR if set, else
+    ~/.abra. Inside a CI run, run_recipe_ci exports a PER-RUN $ABRA_DIR (fresh recipes/, shared
+    servers/+catalogue/ symlinks) before any abra call, so every helper here and every abra
+    subprocess agree on the same tree; outside a run (warm_reconcile's systemd timer, manual use)
+    both fall back to the canonical /root/.abra."""
+    return os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra")
+
+
+def recipe_dir(recipe: str) -> str:
+    """The current ABRA_DIR's working tree for a recipe (per-run inside a CI run)."""
+    return os.path.join(abra_dir(), "recipes", recipe)
+
+
 def _run_pty(
     args: list[str], timeout: int = 900, check: bool = True
 ) -> subprocess.CompletedProcess:
@@ -77,9 +92,7 @@ def recipe_checkout(recipe: str, version: str) -> None:
     a chaos (`-C`) deploy ignores ENV VERSION and uses the current checkout — together that silently
     deployed LATEST for a 'previous-version' base, making the upgrade a no-op (Adversary F1d-2). With
     this checkout + a non-chaos deploy, a pinned deploy genuinely deploys that version."""
-    import os
-
-    path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
+    path = recipe_dir(recipe)
     # -f (force): the version-pinning checkout must yield the EXACT ref tree. Without it, a cc-ci
     # install_steps-provided overlay (e.g. discourse's compose.ccci.yml, copied into the pinned base)
     # is an UNTRACKED file that collides with the same path TRACKED in a later ref, and
@@ -100,9 +113,7 @@ def has_lightweight_version_tags(recipe: str) -> bool:
     'reference not found'.) The caller (deploy_app) uses this to fall back to a chaos base deploy
     (which skips lint and deploys the explicitly-checked-out pinned version — see lifecycle.deploy_app).
     Read-only: just `git tag` + `cat-file -t`; no fetch/mutation, so it can't trigger abra's revert."""
-    import os
-
-    path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
+    path = recipe_dir(recipe)
     tags = subprocess.run(
         ["git", "-C", path, "tag", "-l"], capture_output=True, text=True
     ).stdout.split()
@@ -231,9 +242,7 @@ def recipe_head_commit(recipe: str) -> str | None:
     """The current HEAD commit of the recipe checkout — captured right after fetch (the PR head, or
     the catalogue current) so the upgrade tier can re-checkout it for the chaos redeploy after the
     prev-tag base deploy reset the working tree (HC1)."""
-    import os
-
-    path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
+    path = recipe_dir(recipe)
     proc = subprocess.run(["git", "-C", path, "rev-parse", "HEAD"], capture_output=True, text=True)
     out = proc.stdout.strip()
     return out or None
@@ -241,10 +250,7 @@ def recipe_head_commit(recipe: str) -> str | None:
 
 def recipe_versions(recipe: str) -> list[str]:
     """Published versions of a recipe, oldest→newest (from the recipe git tags)."""
-    import os
-    import subprocess
-
-    path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
+    path = recipe_dir(recipe)
     proc = subprocess.run(
         ["git", "-C", path, "tag", "--sort=creatordate"], capture_output=True, text=True
     )
diff --git a/runner/harness/generic.py b/runner/harness/generic.py
index f42b896..d468dbf 100644
--- a/runner/harness/generic.py
+++ b/runner/harness/generic.py
@@ -25,7 +25,7 @@ _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)
 
 
 def _recipe_dir(recipe: str) -> str:
-    return os.path.expanduser(f"~/.abra/recipes/{recipe}")
+    return abra.recipe_dir(recipe)  # the per-run tree inside a CI run ($ABRA_DIR)
 
 
 def backup_capable(recipe: str, meta: dict | None = None) -> bool:
diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index 210a011..10a1d4b 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -37,31 +37,9 @@ class TeardownError(RuntimeError):
 # however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held →
 # leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never
 # steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each
-# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR)
-# serialise on this app lock. See docs/concurrency.md.
-RECIPE_LOCK_DIR = "/run/lock"
-
-
-def acquire_recipe_lock(recipe: str):
-    """Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes
-    checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and
-    with it this lock, structurally unnecessary); until then the caller keeps the returned file
-    alive for the whole run and release is implicit at process exit."""
-    path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock")
-    # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
-    f = open(path, "w")  # noqa: SIM115 — deliberately held for the lifetime of the run
-    try:
-        fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
-    except BlockingIOError:
-        print(
-            f"== recipe lock: another {recipe} run is in flight — waiting for {path} "
-            "(shared ~/.abra/recipes checkout) ==",
-            flush=True,
-        )
-        fcntl.flock(f, fcntl.LOCK_EX)
-        print(f"== recipe lock: acquired {path} ==", flush=True)
-    return f
-
+# run deploys from its own per-run ABRA_DIR — there is no shared recipe tree and no recipe lock),
+# and same-domain runs (double-!testme of one PR) serialise on this app lock.
+# See docs/concurrency.md.
 
 # Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the
 # caller drops the returned file object, GC would close the fd and silently release the lock —
@@ -209,9 +187,9 @@ def prepull_images(recipe: str, domain: str) -> None:
     app-INIT time (slow-init apps like collabora/immich still need their recipe healthcheck/READY_PROBE).
     Best-effort on resolution failure (skip + let the deploy pull as usual); HARD-fails on a real
     pull error (don't mask it)."""
-    import os
-
-    recipe_dir = os.path.expanduser(f"~/.abra/recipes/{recipe}")
+    recipe_dir = abra.recipe_dir(recipe)  # per-run tree inside a CI run
+    # The app .env lives in the CANONICAL servers path (the per-run ABRA_DIR's servers/ is a
+    # symlink to it, so abra and this path agree on the same file).
     env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
     if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path):
         print(f"  prepull: recipe dir or .env missing for {recipe} — skipping", flush=True)
diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py
index 5884e63..9f8f539 100644
--- a/runner/run_recipe_ci.py
+++ b/runner/run_recipe_ci.py
@@ -138,18 +138,62 @@ def _gitea_token() -> str | None:
     return tok or None
 
 
+def setup_run_abra_dir() -> str:
+    """P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees.
+
+    `<runs_dir>/<run-id>/abra/` with:
+      servers/   -> symlink to the canonical ~/.abra/servers. App .env files land in the shared
+                    canonical path, so janitor discovery (`abra app ls`) and env-based teardown
+                    work unchanged from any process; per-domain filenames + the app-domain lock
+                    prevent write conflicts.
+      catalogue/ -> symlink to the canonical ~/.abra/catalogue (read-mostly).
+      recipes/   fresh + empty — THE isolation that matters: each run clones and git-checkouts
+                 its own recipe trees, so concurrent runs (same recipe included) can never
+                 corrupt each other's deploy tree. Replaces the per-recipe flock.
+    Exported as $ABRA_DIR — honored by the abra CLI and by every harness path helper
+    (abra.abra_dir()) — BEFORE any abra call. Rides along the existing run-dir retention."""
+    canonical = os.path.expanduser("~/.abra")
+    rid = results_mod.run_id()
+    if rid == "manual":
+        rid = f"manual-{os.getpid()}"  # two concurrent hand-runs must not share a tree
+    run_abra_dir = os.path.join(results_mod.runs_dir(), rid, "abra")
+    os.makedirs(os.path.join(run_abra_dir, "recipes"), exist_ok=True)
+    for shared in ("servers", "catalogue"):
+        link = os.path.join(run_abra_dir, shared)
+        if not os.path.islink(link):
+            os.symlink(os.path.join(canonical, shared), link)
+    os.environ["ABRA_DIR"] = run_abra_dir
+    print(
+        f"== per-run ABRA_DIR: {run_abra_dir} (servers/catalogue -> canonical; fresh recipes/) ==",
+        flush=True,
+    )
+    return run_abra_dir
+
+
 def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None:
-    """Make the recipe available at the code under test. If SRC+REF point at the mirror PR,
+    """Make the recipe available at the code under test in THIS RUN's recipe tree
+    ($ABRA_DIR/recipes/<recipe>): a plain clone — no locking needed, no rm-rf of any shared
+    state (the rm below only clears this run's own leftovers, e.g. a janitor-triggered
+    `abra app ls` auto-clone or a Drone build-number reuse). If SRC+REF point at the mirror PR,
     clone it at that ref; otherwise fetch the catalogue copy. Private mirror repos need the bot
     token — passed via a per-command http.extraHeader (not persisted in .git/config, not printed)."""
-    recipes_dir = os.path.expanduser("~/.abra/recipes")
-    os.makedirs(recipes_dir, exist_ok=True)
-    dest = os.path.join(recipes_dir, recipe)
-    # CCCI_SKIP_FETCH=1: use the local recipe clone as-is (lets a test/Adversary stage a fake/broken
-    # ref — e.g. a simulated broken PR head for the --quick rollback proof — without it being clobbered
-    # by a re-fetch). Never set in production CI.
+    dest = abra.recipe_dir(recipe)
+    os.makedirs(os.path.dirname(dest), exist_ok=True)
+    # CCCI_SKIP_FETCH=1: use the locally STAGED recipe clone as-is (lets a test/Adversary stage a
+    # fake/broken ref — e.g. a simulated broken PR head for the --quick rollback proof — without it
+    # being clobbered by a re-fetch). Staging happens in the canonical ~/.abra/recipes/<recipe>;
+    # copy it into the per-run tree so the rest of the run reads the staged state. Never set in
+    # production CI.
     if os.environ.get("CCCI_SKIP_FETCH") == "1":
-        print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True)
+        canonical = os.path.expanduser(f"~/.abra/recipes/{recipe}")
+        subprocess.run(["rm", "-rf", dest], check=False)
+        if os.path.isdir(canonical):
+            shutil.copytree(canonical, dest, symlinks=True)
+        print(
+            f"[fetch] CCCI_SKIP_FETCH=1 — using staged {recipe} clone as-is "
+            f"(copied {canonical} -> per-run tree)",
+            flush=True,
+        )
         return
     if src and ref:
         url = f"https://git.autonomic.zone/{src}.git"
@@ -178,7 +222,7 @@ def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None:
 def snapshot_recipe_tests(recipe: str) -> str | None:
     """Copy the recipe-shipped tests/ to a stable temp dir, immune to abra re-checking-out the
     recipe to a version tag during the run. Returns the snapshot path, or None if no tests/."""
-    src = os.path.expanduser(f"~/.abra/recipes/{recipe}/tests")
+    src = os.path.join(abra.recipe_dir(recipe), "tests")
     if not os.path.isdir(src):
         return None
     has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile(
@@ -841,12 +885,10 @@ def main() -> int:
     print(
         f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}"
     )
-    # Concurrent-run safety: runs of the SAME recipe serialise on a per-recipe flock — they share
-    # ONE ~/.abra/recipes/<recipe> working tree which fetch_recipe (below) rm-rf's/reclones and the
-    # upgrade tier git-checkouts mid-run. Must be taken BEFORE fetch_recipe. Different recipes run
-    # in parallel (capacity=2). The reference must stay alive for the whole run: the kernel drops
-    # the flock when the fd closes (including on any crash/SIGKILL — no stale-lock failure mode).
-    _recipe_lock = lifecycle.acquire_recipe_lock(recipe)  # noqa: F841
+    # Concurrent-run safety is structural: this run's recipe trees live in its own ABRA_DIR
+    # (exported here, before ANY abra call), so no recipe-tree lock exists; same-DOMAIN runs
+    # serialise on the app-domain flock taken in deploy_app (see docs/concurrency.md).
+    setup_run_abra_dir()
     fetch_recipe(recipe, ref, src)
     # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test
     # (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout
diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py
index d9e58b7..d41fef0 100644
--- a/runner/warm_reconcile.py
+++ b/runner/warm_reconcile.py
@@ -199,7 +199,13 @@ def _run(cmd, timeout=120, check=False):
 
 
 def _recipe_dir(recipe: str) -> str:
-    return os.path.expanduser(f"~/.abra/recipes/{recipe}")
+    # Resolve like the abra CLI does: $ABRA_DIR (the per-run tree when imported by a CI run,
+    # e.g. promote_canonical) else the canonical ~/.abra (this module's own systemd-timer runs,
+    # which set no ABRA_DIR). Keeps fetch_recipe (an `abra` subprocess) and the git readers
+    # below pointed at the SAME tree in both contexts.
+    return os.path.join(
+        os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra"), "recipes", recipe
+    )
 
 
 def recipe_tags(recipe: str) -> list[str]:
diff --git a/tests/discourse/install_steps.sh b/tests/discourse/install_steps.sh
index 330a5cc..930e663 100755
--- a/tests/discourse/install_steps.sh
+++ b/tests/discourse/install_steps.sh
@@ -15,7 +15,9 @@ set -euo pipefail
 
 : "${CCCI_RECIPE:?missing CCCI_RECIPE}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}"
+# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else
+# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from.
+RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}"
 
 if [ ! -d "$RECIPE_DIR" ]; then
   echo "  discourse install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2
diff --git a/tests/ghost/install_steps.sh b/tests/ghost/install_steps.sh
index ef10674..2c2dc50 100755
--- a/tests/ghost/install_steps.sh
+++ b/tests/ghost/install_steps.sh
@@ -15,7 +15,9 @@ set -euo pipefail
 
 : "${CCCI_RECIPE:?missing CCCI_RECIPE}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}"
+# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else
+# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from.
+RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}"
 
 if [ ! -d "$RECIPE_DIR" ]; then
   echo "  ghost install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2

From 91d3cc7e992b93dfce33d9cb603161ce66c4c10d Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@git.autonomic.zone>
Date: Wed, 10 Jun 2026 04:19:35 +0000
Subject: [PATCH 4/6] =?UTF-8?q?chore(ci):=20P4=20config=20cleanup=20?=
 =?UTF-8?q?=E2=80=94=20DRONE=5FRUNNER=5FCAPACITY=20is=20the=20single=20con?=
 =?UTF-8?q?currency=20knob?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove concurrency.limit from the recipe-ci pipeline (.drone.yml): it duplicated
DRONE_RUNNER_CAPACITY (nix/modules/drone-runner.nix maxTests) and the two had to be kept in
step by hand (docs/concurrency.md §8.6). maxTests comment updated to state it is the single
knob and to describe the new safety model.
---
 .drone.yml                   |  4 ++--
 nix/modules/drone-runner.nix | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.drone.yml b/.drone.yml
index 9908c8b..d7c3674 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -53,8 +53,8 @@ trigger:
   event:
     - custom
 
-concurrency:
-  limit: 2
+# NB deliberately NO `concurrency.limit` here: DRONE_RUNNER_CAPACITY (nix/modules/drone-runner.nix
+# maxTests) is the single concurrency knob (P4 — two knobs in two files drifted).
 
 steps:
   - name: ci
diff --git a/nix/modules/drone-runner.nix b/nix/modules/drone-runner.nix
index b6f91b4..3c14f01 100644
--- a/nix/modules/drone-runner.nix
+++ b/nix/modules/drone-runner.nix
@@ -8,18 +8,18 @@
 { pkgs, config, lib, ... }:
 let
   # MAX_TESTS (plan §4.2/§4.3 resource safety): max CI builds the exec runner runs at once. Drone
-  # queues the rest in its native pending-build queue (no custom queue). THE concurrency cap that
-  # bounds how many test apps can be live at once.
+  # queues the rest in its native pending-build queue (no custom queue). THE SINGLE concurrency
+  # knob — nothing else caps recipe-ci parallelism (the .drone.yml concurrency.limit was removed:
+  # one knob, one place). Bounds how many test apps can be live at once.
   #
   # Raised to 2 (operator request 2026-06-09) so two recipes can be tested in parallel (e.g. immich
   # and plausible under active development at once). Verified safe on the current node (Hetzner cpx22,
   # ~7.6 GiB / 4 vCPU — NOTE: smaller than the original 28 GiB this was written for): a full immich CI
   # stack measured ~1 GiB (server+ML+pg+redis) with multiple GiB free, so two concurrent recipes fit.
-  # The concurrency PRECONDITION holds: the run-start janitor is age-based (default 2h) + run-app-name
-  # scoped, so it never reaps a concurrent in-flight run (harness.lifecycle.janitor). TRADE-OFF: with
-  # capacity>1 a SIGKILL'd build (no teardown) leaves an orphan the run-start sweep can't reap
-  # immediately (it might be a live run) — bounded instead by the 2h janitor + the /upgrade-all
-  # start/end reap + sweep-orphans. Revert to "1" if OOM / disk-I/O contention is observed under load.
+  # Concurrent-run safety is the harness's job at ANY capacity (docs/concurrency.md): per-run
+  # ABRA_DIR recipe trees, per-app-domain flocks, and a flock-probe janitor that reaps a crashed
+  # build's orphan immediately (held lock = live run, never touched). Revert to "1" if OOM /
+  # disk-I/O contention is observed under load.
   maxTests = "2";
 in
 {

From 84d90fb6552a71dbd44adb23a7767aac8289f270 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@git.autonomic.zone>
Date: Wed, 10 Jun 2026 04:29:36 +0000
Subject: [PATCH 5/6] =?UTF-8?q?test(concurrency):=20real-kernel=20suite=20?=
 =?UTF-8?q?for=20the=20restructured=20model=20=E2=80=94=2020=20tests,=2019?=
 =?UTF-8?q?=20plan=20cases?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tests/concurrency/ — NOT in the default `pytest tests/unit` gate; run explicitly with
`pytest tests/concurrency -q`. flock/prctl/alarm are never mocked: helper subprocesses
(helpers.py) hold real locks and install the real lifetime guards; locks live in a per-test
tmp dir via CCCI_APP_LOCK_DIR; every helper (and recorded grandchild) is reaped by fixture
cleanup.

- test_locks.py (cases 1-4): SIGKILL auto-release; LOCK_NB held/unheld semantics; PEP 446
  fd-not-inherited (holder's child survives, lock still releases); same-domain second acquire
  blocks until first holder exits.
- test_janitor.py (cases 5-12): orphan reaped once + lockfile unlinked; live holder never
  reaped + logged; new-run acquire blocks until a slow reap completes (reap-under-probe-lock);
  two overlapping janitors -> exactly one reaps (flock arbitration); reboot sim (no lockfile)
  reaps immediately with no age wait; >120min-held lock flagged 'possible leaked run' and NOT
  stolen; warm/canonical names never probed (no lockfile even created); directory-as-lockfile
  and missing lock dir degrade to skip+log, never crash.
- test_lifetime.py (cases 13-16): PDEATHSIG (wrapper parent SIGKILL'd -> guarded child TERM'd,
  teardown marker, lock released); already-orphaned helper REFUSES to run (ppid race); 2s
  deadline alarm -> teardown + exit 142 + lock released; SIGTERM -> teardown + exit 143 +
  lock released.
- test_abra_dir.py (cases 17-19 + 18b): per-run dir built + $ABRA_DIR exported before the
  first abra call (recording stub abra on PATH); two CONCURRENT same-recipe fetch+checkout
  flows into different ABRA_DIRs -> divergent correct trees, canonical staged clone untouched;
  .env written through the servers/ symlink lands in the canonical path (env_get/env_set
  agree); manual runs get pid-suffixed dirs.

On cc-ci: pytest tests/concurrency -q -> 20 passed; tests/unit -> 138 passed; lint PASS.
---
 tests/concurrency/concutil.py      | 108 +++++++++++++++++
 tests/concurrency/conftest.py      |  34 ++++++
 tests/concurrency/helpers.py       | 120 ++++++++++++++++++
 tests/concurrency/test_abra_dir.py | 175 ++++++++++++++++++++++++++
 tests/concurrency/test_janitor.py  | 189 +++++++++++++++++++++++++++++
 tests/concurrency/test_lifetime.py |  82 +++++++++++++
 tests/concurrency/test_locks.py    |  85 +++++++++++++
 7 files changed, 793 insertions(+)
 create mode 100644 tests/concurrency/concutil.py
 create mode 100644 tests/concurrency/conftest.py
 create mode 100644 tests/concurrency/helpers.py
 create mode 100644 tests/concurrency/test_abra_dir.py
 create mode 100644 tests/concurrency/test_janitor.py
 create mode 100644 tests/concurrency/test_lifetime.py
 create mode 100644 tests/concurrency/test_locks.py

diff --git a/tests/concurrency/concutil.py b/tests/concurrency/concutil.py
new file mode 100644
index 0000000..374e747
--- /dev/null
+++ b/tests/concurrency/concutil.py
@@ -0,0 +1,108 @@
+"""Shared utilities for the real-kernel concurrency suite (imported by the test modules; the
+fixtures in conftest.py wrap these). No flock mocking anywhere — probes use real LOCK_NB."""
+
+from __future__ import annotations
+
+import contextlib
+import fcntl
+import os
+import signal
+import subprocess
+import sys
+import time
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from harness import lifecycle  # noqa: E402
+
+HELPERS = os.path.join(os.path.dirname(__file__), "helpers.py")
+DOMAIN = "test-abc123.ci.commoninternet.net"  # matches RUN_APP_RE
+
+
+class HelperPool:
+    """Spawns helpers.py subprocesses and GUARANTEES their cleanup (incl. recorded grandchild
+    pids from `hold-with-child`/`wrapper` markers) — no leaked children in the test VM."""
+
+    def __init__(self, out_dir: str):
+        self.out_dir = out_dir
+        self.procs: list[subprocess.Popen] = []
+        self.extra_pids: list[int] = []
+        self._n = 0
+
+    def spawn(self, *args: str, env_extra: dict | None = None) -> tuple[subprocess.Popen, str]:
+        """Start `helpers.py <args...>`; returns (proc, marker_file)."""
+        self._n += 1
+        out = os.path.join(self.out_dir, f"helper-{self._n}.out")
+        env = dict(os.environ, CCCI_HELPER_OUT=out, **(env_extra or {}))
+        p = subprocess.Popen(  # noqa: S603
+            [sys.executable, HELPERS, *args],
+            env=env,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.STDOUT,
+        )
+        self.procs.append(p)
+        return p, out
+
+    def track_pid(self, pid: int) -> None:
+        self.extra_pids.append(pid)
+
+    def cleanup(self) -> None:
+        for p in self.procs:
+            if p.poll() is None:
+                p.kill()
+            with contextlib.suppress(subprocess.TimeoutExpired):
+                p.wait(timeout=10)
+        for pid in self.extra_pids:
+            with contextlib.suppress(OSError):
+                os.kill(pid, signal.SIGKILL)
+
+
+def wait_marker(out: str, token: str, timeout: float = 15.0) -> str | None:
+    """Poll a helper's marker file for a line containing `token`; returns the line or None."""
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        try:
+            with open(out) as f:
+                for line in f:
+                    if token in line:
+                        return line.strip()
+        except OSError:
+            pass
+        time.sleep(0.1)
+    return None
+
+
+def lock_state(domain: str) -> str:
+    """'held' | 'free' | 'absent' for the domain's lockfile, probed with a REAL LOCK_NB."""
+    path = lifecycle._app_lock_path(domain)  # noqa: SLF001
+    if not os.path.exists(path):
+        return "absent"
+    with open(path, "a") as f:
+        try:
+            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            return "free"
+        except BlockingIOError:
+            return "held"
+
+
+def wait_lock_state(domain: str, want: str, timeout: float = 10.0) -> str:
+    """Poll until lock_state(domain) == want (kernel release on process death is fast, but give
+    the scheduler room). Returns the final observed state."""
+    deadline = time.time() + timeout
+    state = lock_state(domain)
+    while state != want and time.time() < deadline:
+        time.sleep(0.1)
+        state = lock_state(domain)
+    return state
+
+
+def pid_alive(pid: int) -> bool:
+    return os.path.exists(f"/proc/{pid}")
+
+
+def wait_pid_gone(pid: int, timeout: float = 15.0) -> bool:
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        if not pid_alive(pid):
+            return True
+        time.sleep(0.1)
+    return False
diff --git a/tests/concurrency/conftest.py b/tests/concurrency/conftest.py
new file mode 100644
index 0000000..ab7be03
--- /dev/null
+++ b/tests/concurrency/conftest.py
@@ -0,0 +1,34 @@
+"""Fixtures for the real-kernel concurrency suite (concurrency-restructure plan, 19 cases).
+
+NOT part of the default `pytest tests/unit` gate — run explicitly with `pytest tests/concurrency
+-q` (docs/concurrency.md). Locks live in a per-test tmp dir (CCCI_APP_LOCK_DIR); helper
+subprocesses hold REAL flocks / install the REAL prctl+signal guards and are always reaped in
+fixture finalizers (no leaked children in the test VM).
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+from concutil import HelperPool  # noqa: E402
+
+
+@pytest.fixture
+def lock_dir(tmp_path, monkeypatch):
+    """Sandbox lock dir, exported so BOTH this process's lifecycle calls and helper subprocesses
+    (which inherit os.environ) resolve their lockfiles here — never /run/lock."""
+    d = tmp_path / "locks"
+    d.mkdir()
+    monkeypatch.setenv("CCCI_APP_LOCK_DIR", str(d))
+    return str(d)
+
+
+@pytest.fixture
+def pool(tmp_path):
+    hp = HelperPool(str(tmp_path))
+    yield hp
+    hp.cleanup()
diff --git a/tests/concurrency/helpers.py b/tests/concurrency/helpers.py
new file mode 100644
index 0000000..559801f
--- /dev/null
+++ b/tests/concurrency/helpers.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""Subprocess helpers for tests/concurrency — REAL kernel locks and the REAL lifetime guards in
+separate processes (flock/prctl are never mocked; tests assert on actual kernel behavior).
+
+Invoked as:  python3 helpers.py <command> <args...>
+
+Env contract (set by the spawning test):
+  CCCI_APP_LOCK_DIR  sandbox lock dir (never /run/lock in tests)
+  CCCI_HELPER_OUT    marker file this helper APPENDS progress lines to (ACQUIRED/READY/...)
+
+Commands:
+  hold <domain>                 acquire the app lock, mark `ACQUIRED <ts>`, sleep forever
+  hold-with-child <domain>      acquire the lock, spawn a plain sleeping subprocess child, mark
+                                `ACQUIRED <ts>` + `CHILD <pid>` (PEP 446: the child must NOT
+                                inherit the lock fd), sleep forever
+  guarded <domain> <deadline>   install the REAL lifetime guards (alarm=<deadline>s), acquire the
+                                lock, mark `READY`; when the teardown funnel runs (`finally:`),
+                                mark `TEARDOWN` before exiting
+  wrapper <domain>              spawn `guarded <domain> 3600` as MY child, mark `WRAPPED <pid>`,
+                                sleep — the test kills me to prove PDEATHSIG TERMs the child
+  orphan-probe                  wait (bounded) until reparented (ppid==1), then install the
+                                guards; mark `REFUSED` if they exit (expected) or `GUARDS_OK`
+  fetch-checkout <recipe> <ref> run run_recipe_ci.fetch_recipe (the test sets CCCI_SKIP_FETCH=1
+                                + a per-"run" ABRA_DIR), git-checkout <ref>, mark
+                                `RESULT <head> <data.txt content>`
+"""
+
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+import time
+
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "runner"))
+from harness import abra, lifecycle, lifetime  # noqa: E402
+
+OUT = os.environ.get("CCCI_HELPER_OUT")
+
+
+def mark(line: str) -> None:
+    if OUT:
+        with open(OUT, "a") as f:
+            f.write(line + "\n")
+            f.flush()
+    print(line, flush=True)
+
+
+def cmd_hold(domain: str) -> None:
+    lifecycle.acquire_app_lock(domain)
+    mark(f"ACQUIRED {time.time()}")
+    time.sleep(3600)
+
+
+def cmd_hold_with_child(domain: str) -> None:
+    lifecycle.acquire_app_lock(domain)
+    child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(3600)"])
+    mark(f"ACQUIRED {time.time()}")
+    mark(f"CHILD {child.pid}")
+    time.sleep(3600)
+
+
+def cmd_guarded(domain: str, deadline: str) -> None:
+    lifetime.install_lifetime_guards(deadline_seconds=int(deadline))
+    lifecycle.acquire_app_lock(domain)
+    mark("READY")
+    try:
+        time.sleep(3600)
+    finally:
+        mark("TEARDOWN")
+
+
+def cmd_wrapper(domain: str) -> None:
+    p = subprocess.Popen(  # noqa: S603
+        [sys.executable, os.path.abspath(__file__), "guarded", domain, "3600"],
+        env=os.environ.copy(),
+    )
+    mark(f"WRAPPED {p.pid}")
+    time.sleep(3600)
+
+
+def cmd_orphan_probe() -> None:
+    # Our spawner exits immediately after fork; wait (bounded) until we are reparented so the
+    # prctl is installed with the parent ALREADY dead — the exact race the ppid check closes.
+    for _ in range(200):
+        if os.getppid() == 1:
+            break
+        time.sleep(0.05)
+    else:
+        mark("NEVER_REPARENTED")  # e.g. a subreaper environment — test will fail visibly
+        return
+    try:
+        lifetime.install_lifetime_guards()
+    except SystemExit:
+        mark("REFUSED")
+        raise
+    mark("GUARDS_OK")
+
+
+def cmd_fetch_checkout(recipe: str, ref: str) -> None:
+    import run_recipe_ci
+
+    run_recipe_ci.fetch_recipe(recipe, None, None)
+    abra.recipe_checkout(recipe, ref)
+    head = abra.recipe_head_commit(recipe)
+    with open(os.path.join(abra.recipe_dir(recipe), "data.txt")) as f:
+        content = f.read().strip()
+    mark(f"RESULT {head} {content}")
+
+
+if __name__ == "__main__":
+    cmd, *args = sys.argv[1:]
+    {
+        "hold": cmd_hold,
+        "hold-with-child": cmd_hold_with_child,
+        "guarded": cmd_guarded,
+        "wrapper": cmd_wrapper,
+        "orphan-probe": cmd_orphan_probe,
+        "fetch-checkout": cmd_fetch_checkout,
+    }[cmd](*args)
diff --git a/tests/concurrency/test_abra_dir.py b/tests/concurrency/test_abra_dir.py
new file mode 100644
index 0000000..9867fe1
--- /dev/null
+++ b/tests/concurrency/test_abra_dir.py
@@ -0,0 +1,175 @@
+"""Per-run ABRA_DIR isolation (concurrency-restructure plan, cases 17-19). Real directories,
+real symlinks, real git — abra itself is replaced by a recording stub where a CLI call is
+involved (case 17), because these cases test OUR dir/env plumbing, not abra."""
+
+from __future__ import annotations
+
+import os
+import stat
+import subprocess
+import sys
+
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+import run_recipe_ci  # noqa: E402
+from concutil import wait_marker  # noqa: E402
+from harness import abra  # noqa: E402
+
+RECIPE = "fakerecipe"
+
+
+def _git(cwd, *args):
+    subprocess.run(
+        ["git", "-c", "user.email=t@t", "-c", "user.name=t", *args],
+        cwd=cwd,
+        check=True,
+        capture_output=True,
+    )
+
+
+def _make_fake_home(tmp_path):
+    """A fake $HOME with a canonical ~/.abra: servers/default + catalogue dirs, and a recipe git
+    repo with two tags whose data.txt differs (v1 -> 'one', v2 -> 'two', HEAD at v2)."""
+    home = tmp_path / "home"
+    (home / ".abra" / "servers" / "default").mkdir(parents=True)
+    (home / ".abra" / "catalogue").mkdir(parents=True)
+    repo = home / ".abra" / "recipes" / RECIPE
+    repo.mkdir(parents=True)
+    _git(repo, "init", "-q")
+    (repo / "data.txt").write_text("one\n")
+    _git(repo, "add", "data.txt")
+    _git(repo, "commit", "-qm", "v1")
+    _git(repo, "tag", "v1")
+    (repo / "data.txt").write_text("two\n")
+    _git(repo, "add", "data.txt")
+    _git(repo, "commit", "-qm", "v2")
+    _git(repo, "tag", "v2")
+    return home
+
+
+def test_17_per_run_dir_built_and_exported_before_abra(tmp_path, monkeypatch):
+    """Case 17: setup_run_abra_dir builds the per-run dir correctly (servers/catalogue symlinks
+    resolve to the canonical tree, recipes/ empty + writable) and $ABRA_DIR is exported before
+    the first abra call — proven by a stub `abra` on PATH that records the env it saw."""
+    home = _make_fake_home(tmp_path)
+    monkeypatch.setenv("HOME", str(home))
+    monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs"))
+    monkeypatch.setenv("DRONE_BUILD_NUMBER", "777")
+    monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten")  # so monkeypatch restores it
+
+    d = run_recipe_ci.setup_run_abra_dir()
+    assert d == str(tmp_path / "runs" / "777" / "abra")
+    assert os.environ["ABRA_DIR"] == d
+    assert os.readlink(os.path.join(d, "servers")) == str(home / ".abra" / "servers")
+    assert os.readlink(os.path.join(d, "catalogue")) == str(home / ".abra" / "catalogue")
+    # symlinks RESOLVE (targets exist) and recipes/ is empty + writable
+    assert os.path.isdir(os.path.join(d, "servers", "default"))
+    assert os.path.isdir(os.path.join(d, "catalogue"))
+    assert os.listdir(os.path.join(d, "recipes")) == []
+    probe = os.path.join(d, "recipes", ".write-probe")
+    open(probe, "w").close()
+    os.remove(probe)
+    # idempotent re-entry (Drone build-number retry): must not raise on existing symlinks
+    assert run_recipe_ci.setup_run_abra_dir() == d
+
+    # stub abra records $ABRA_DIR at call time; fetch_recipe's catalogue branch invokes it
+    stub_dir = tmp_path / "bin"
+    stub_dir.mkdir()
+    log = tmp_path / "abra-env.log"
+    stub = stub_dir / "abra"
+    stub.write_text(f'#!/bin/sh\necho "$ABRA_DIR" >> {log}\nexit 0\n')
+    stub.chmod(stub.stat().st_mode | stat.S_IEXEC)
+    monkeypatch.setenv("PATH", f"{stub_dir}{os.pathsep}{os.environ['PATH']}")
+    monkeypatch.delenv("CCCI_SKIP_FETCH", raising=False)
+    run_recipe_ci.fetch_recipe(RECIPE, None, None)
+    assert log.read_text().strip() == d, "abra was called without the per-run ABRA_DIR exported"
+
+
+def test_18_concurrent_same_recipe_fetch_no_cross_talk(tmp_path, monkeypatch, pool):
+    """Case 18: two CONCURRENT fetch+checkout flows of the SAME recipe into different ABRA_DIRs
+    produce two correct, divergent trees (v1 vs v2) — the old shared-tree corruption scenario,
+    now structurally safe with no lock. The canonical staged clone is untouched."""
+    home = _make_fake_home(tmp_path)
+    canonical_repo = home / ".abra" / "recipes" / RECIPE
+    head_before = subprocess.run(
+        ["git", "-C", canonical_repo, "rev-parse", "HEAD"], capture_output=True, text=True
+    ).stdout.strip()
+
+    runs = {}
+    for name, ref in (("runA", "v1"), ("runB", "v2")):
+        abra_dir = tmp_path / name / "abra"
+        abra_dir.mkdir(parents=True)
+        _, out = pool.spawn(
+            "fetch-checkout",
+            RECIPE,
+            ref,
+            env_extra={
+                "HOME": str(home),
+                "ABRA_DIR": str(abra_dir),
+                "CCCI_SKIP_FETCH": "1",
+            },
+        )
+        runs[name] = (out, ref, abra_dir)
+
+    expect = {"v1": "one", "v2": "two"}
+    for name, (out, ref, abra_dir) in runs.items():
+        line = wait_marker(out, "RESULT", timeout=30)
+        assert line, f"{name} never produced a RESULT"
+        _, head, content = line.split()
+        assert content == expect[ref], f"{name}@{ref}: tree content {content!r}"
+        tree = abra_dir / "recipes" / RECIPE
+        assert (tree / "data.txt").read_text().strip() == expect[ref]
+        assert (
+            head
+            == subprocess.run(
+                ["git", "-C", tree, "rev-parse", "HEAD"], capture_output=True, text=True
+            ).stdout.strip()
+        )
+
+    # the two trees genuinely diverge AND the canonical staged clone is untouched
+    a = (runs["runA"][2] / "recipes" / RECIPE / "data.txt").read_text()
+    b = (runs["runB"][2] / "recipes" / RECIPE / "data.txt").read_text()
+    assert a != b
+    head_after = subprocess.run(
+        ["git", "-C", canonical_repo, "rev-parse", "HEAD"], capture_output=True, text=True
+    ).stdout.strip()
+    assert head_after == head_before, "canonical clone must not be touched by per-run fetches"
+
+
+def test_19_env_written_through_servers_symlink_lands_canonical(tmp_path, monkeypatch):
+    """Case 19: an app .env written through the per-run servers/ symlink (what abra does under
+    $ABRA_DIR) lands in the CANONICAL shared path — so janitor discovery and every
+    expanduser('~/.abra/servers/...') reader keep working unchanged."""
+    home = _make_fake_home(tmp_path)
+    monkeypatch.setenv("HOME", str(home))
+    monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs"))
+    monkeypatch.setenv("DRONE_BUILD_NUMBER", "778")
+    monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten")
+    d = run_recipe_ci.setup_run_abra_dir()
+
+    domain = "test-abc123.ci.commoninternet.net"
+    via_symlink = os.path.join(d, "servers", "default", f"{domain}.env")
+    with open(via_symlink, "w") as f:
+        f.write("TYPE=fakerecipe:1.0.0\nDOMAIN=placeholder\n")
+
+    canonical = home / ".abra" / "servers" / "default" / f"{domain}.env"
+    assert canonical.is_file(), ".env written via the symlink must land in the canonical path"
+    # the canonical-path readers/writers (abra.env_get/env_set use ~/.abra) see the same file
+    assert abra.env_get(domain, "TYPE") == "fakerecipe:1.0.0"
+    abra.env_set(domain, "DOMAIN", domain)
+    with open(via_symlink) as f:
+        assert f"DOMAIN={domain}" in f.read()
+
+
+def test_18b_run_id_manual_fallback_is_per_process(tmp_path, monkeypatch):
+    """Companion to case 18: two concurrent MANUAL runs (no DRONE_BUILD_NUMBER) must not share an
+    abra dir either — the manual fallback is pid-suffixed."""
+    home = _make_fake_home(tmp_path)
+    monkeypatch.setenv("HOME", str(home))
+    monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs"))
+    monkeypatch.delenv("DRONE_BUILD_NUMBER", raising=False)
+    monkeypatch.delenv("CCCI_APP_DOMAIN", raising=False)
+    monkeypatch.delenv("CCCI_RUN_ID", raising=False)
+    monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten")
+    d = run_recipe_ci.setup_run_abra_dir()
+    assert f"manual-{os.getpid()}" in d
diff --git a/tests/concurrency/test_janitor.py b/tests/concurrency/test_janitor.py
new file mode 100644
index 0000000..bdf61b9
--- /dev/null
+++ b/tests/concurrency/test_janitor.py
@@ -0,0 +1,189 @@
+"""Janitor / flock-probe semantics (concurrency-restructure plan, cases 5-12).
+
+The janitor runs IN-PROCESS with its discovery monkeypatched (candidates injected via a stubbed
+abra.app_ls + empty docker sweep) and teardown_app stubbed to record calls — but the LOCKS are
+real kernel flocks, held by real helper subprocesses where a live owner is needed."""
+
+from __future__ import annotations
+
+import os
+import sys
+import threading
+import time
+
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from concutil import DOMAIN, lock_state, wait_marker  # noqa: E402
+from harness import lifecycle  # noqa: E402
+
+
+def _inject_candidates(monkeypatch, domains):
+    """Point janitor discovery at exactly `domains`: abra lists them, docker sweep is empty.
+    teardown_app is stubbed to a recorder; returns the calls list."""
+    calls = []
+    monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": d} for d in domains])
+    monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: [])
+    monkeypatch.setattr(lifecycle, "teardown_app", lambda d, verify=True: calls.append(d))
+    return calls
+
+
+def test_5_orphan_reaped_lockfile_unlinked(lock_dir, pool, monkeypatch):
+    """Case 5: an orphan (lockfile exists, no holder — its run was SIGKILL'd) is reaped exactly
+    once and its lockfile unlinked."""
+    p, out = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out, "ACQUIRED")
+    p.kill()
+    p.wait(timeout=10)
+    calls = _inject_candidates(monkeypatch, [DOMAIN])
+    lifecycle.janitor()
+    assert calls == [DOMAIN], f"teardown calls: {calls} (expected exactly one)"
+    assert lock_state(DOMAIN) == "absent", "reaped orphan's lockfile must be unlinked"
+
+
+def test_6_live_run_never_reaped(lock_dir, pool, monkeypatch, capsys):
+    """Case 6: a held lock (live helper) is never reaped and is logged as live."""
+    p, out = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out, "ACQUIRED")
+    calls = _inject_candidates(monkeypatch, [DOMAIN])
+    lifecycle.janitor()
+    assert calls == []
+    assert "live concurrent run" in capsys.readouterr().out
+    assert lock_state(DOMAIN) == "held"
+
+
+def test_7_new_run_blocks_until_reap_finishes(lock_dir, pool, monkeypatch):
+    """Case 7: the janitor reaps WHILE HOLDING the probe lock, so a new run of the same domain
+    blocks in acquire_app_lock until the reap completes — no window where a fresh app coexists
+    with a half-reaped one."""
+    # Make an orphan.
+    p, out = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out, "ACQUIRED")
+    p.kill()
+    p.wait(timeout=10)
+
+    state = {"teardown_end": None, "acquirer_out": None}
+
+    def slow_teardown(domain, verify=True):
+        # While the janitor holds the probe lock mid-reap, a new run starts acquiring.
+        _, aout = pool.spawn("hold", DOMAIN)
+        state["acquirer_out"] = aout
+        time.sleep(2.0)
+        state["teardown_end"] = time.time()
+
+    monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": DOMAIN}])
+    monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: [])
+    monkeypatch.setattr(lifecycle, "teardown_app", slow_teardown)
+    lifecycle.janitor()
+
+    line = wait_marker(state["acquirer_out"], "ACQUIRED", timeout=15)
+    assert line, "new run never acquired after the reap"
+    acquired_ts = float(line.split()[1])
+    assert (
+        acquired_ts >= state["teardown_end"]
+    ), f"new run acquired at {acquired_ts} BEFORE the reap finished at {state['teardown_end']}"
+    # The new run must hold a lock the next probe can SEE (fresh inode at the path).
+    assert lock_state(DOMAIN) == "held"
+
+
+def test_8_two_janitors_exactly_one_reaps(lock_dir, pool, monkeypatch):
+    """Case 8: two concurrent janitors arbitrate on the probe flock — exactly one reaps (the
+    other sees 'held' and leaves). Teardown is slowed so the runs genuinely overlap."""
+    p, out = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out, "ACQUIRED")
+    p.kill()
+    p.wait(timeout=10)
+
+    calls = []
+    calls_lock = threading.Lock()
+
+    def slow_teardown(domain, verify=True):
+        with calls_lock:
+            calls.append(domain)
+        time.sleep(2.0)
+
+    monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": DOMAIN}])
+    monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: [])
+    monkeypatch.setattr(lifecycle, "teardown_app", slow_teardown)
+
+    barrier = threading.Barrier(2)
+
+    def run_janitor():
+        barrier.wait()
+        lifecycle.janitor()
+
+    t1, t2 = threading.Thread(target=run_janitor), threading.Thread(target=run_janitor)
+    t1.start(), t2.start()
+    t1.join(timeout=30), t2.join(timeout=30)
+    assert calls == [DOMAIN], f"expected exactly one reap, got {calls}"
+    assert lock_state(DOMAIN) == "absent"
+
+
+def test_9_reboot_lockfile_absent_reaped_immediately(lock_dir, monkeypatch):
+    """Case 9: post-reboot simulation — the app exists but its lockfile is gone (/run/lock is
+    tmpfs). The probe trivially acquires -> immediate reap, NO age threshold (improvement over
+    the old 2h fallback)."""
+    assert lock_state(DOMAIN) == "absent"
+    calls = _inject_candidates(monkeypatch, [DOMAIN])
+    t0 = time.time()
+    lifecycle.janitor()
+    assert calls == [DOMAIN]
+    assert time.time() - t0 < 5, "reap must be immediate (no age wait)"
+
+
+def test_10_long_held_lock_flagged_never_stolen(lock_dir, pool, monkeypatch, capsys):
+    """Case 10: a lock held with mtime older than 120min is flagged as a possible leaked run —
+    and NOT reaped (never steal a held lock)."""
+    p, out = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out, "ACQUIRED")
+    path = lifecycle._app_lock_path(DOMAIN)  # noqa: SLF001
+    backdate = time.time() - (130 * 60)
+    os.utime(path, (backdate, backdate))
+    calls = _inject_candidates(monkeypatch, [DOMAIN])
+    lifecycle.janitor()
+    assert calls == []
+    out_text = capsys.readouterr().out
+    assert "possible leaked run" in out_text and "lslocks" in out_text
+    assert lock_state(DOMAIN) == "held"
+
+
+def test_11_warm_canonical_names_never_probed(lock_dir, monkeypatch):
+    """Case 11: RUN_APP_RE allowlist — warm/canonical-shaped names never become candidates, so
+    they are never probed (no lockfile is even created for them) and never reaped."""
+    warmish = [
+        "warm-keycloak.ci.commoninternet.net",
+        "keycloak.ci.commoninternet.net",
+        "warm-hedgedoc.ci.commoninternet.net",
+        "drone.ci.commoninternet.net",
+    ]
+    calls = []
+    monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": d} for d in warmish])
+    monkeypatch.setattr(
+        lifecycle,
+        "_docker_names",
+        lambda kind, stack: ["warm-keycloak_ci_commoninternet_net_app"]
+        if kind == "service"
+        else [],
+    )
+    monkeypatch.setattr(lifecycle, "teardown_app", lambda d, verify=True: calls.append(d))
+    lifecycle.janitor()
+    assert calls == []
+    lockdir = os.environ["CCCI_APP_LOCK_DIR"]
+    assert [
+        f for f in os.listdir(lockdir) if f.startswith("cc-ci-app-")
+    ] == [], "janitor must not create lockfiles for non-run-app names"
+
+
+def test_12_degrades_safely_on_bad_lockfile_and_missing_dir(lock_dir, monkeypatch, capsys):
+    """Case 12: a garbled/unopenable lockfile (here: a DIRECTORY at the lockfile path) is skipped
+    with a log line; a missing lock dir doesn't crash the janitor either. Never a crash."""
+    path = lifecycle._app_lock_path(DOMAIN)  # noqa: SLF001
+    os.makedirs(path)  # open(path, "a") -> IsADirectoryError (an OSError)
+    calls = _inject_candidates(monkeypatch, [DOMAIN])
+    lifecycle.janitor()  # must not raise
+    assert calls == []
+    assert "skipping" in capsys.readouterr().out
+
+    os.rmdir(path)
+    monkeypatch.setenv("CCCI_APP_LOCK_DIR", os.path.join(os.environ["CCCI_APP_LOCK_DIR"], "gone"))
+    lifecycle.janitor()  # missing dir: probe open fails -> skip; tidy glob -> empty. No crash.
+    assert calls == []
diff --git a/tests/concurrency/test_lifetime.py b/tests/concurrency/test_lifetime.py
new file mode 100644
index 0000000..1b23c69
--- /dev/null
+++ b/tests/concurrency/test_lifetime.py
@@ -0,0 +1,82 @@
+"""Lifetime hardening (concurrency-restructure plan, cases 13-16): the REAL prctl/signal/alarm
+guards installed by helper subprocesses; tests assert teardown ran, exit was non-zero, and the
+lock was released."""
+
+from __future__ import annotations
+
+import os
+import signal
+import sys
+
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from concutil import (  # noqa: E402
+    DOMAIN,
+    wait_lock_state,
+    wait_marker,
+    wait_pid_gone,
+)
+
+
+def test_13_pdeathsig_parent_kill_terms_harness(lock_dir, pool):
+    """Case 13: wrapper-parent spawns a guarded harness-child; the parent is SIGKILL'd (the
+    harness gets no courtesy signal) -> the kernel's PDEATHSIG TERMs the child, its teardown
+    funnel runs, it exits, and the lock is released."""
+    p, out = pool.spawn("wrapper", DOMAIN)
+    line = wait_marker(out, "WRAPPED")
+    assert line, "wrapper never spawned its child"
+    child_pid = int(line.split()[1])
+    pool.track_pid(child_pid)
+    assert wait_marker(out, "READY"), "guarded child never got ready"
+
+    p.kill()  # parent dies WITHOUT signalling the child — only PDEATHSIG can save us
+    p.wait(timeout=10)
+    assert wait_pid_gone(child_pid), "guarded child must exit on parent death (PDEATHSIG)"
+    assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run"
+    assert wait_lock_state(DOMAIN, "free") == "free"
+
+
+def test_14_already_orphaned_helper_refuses_to_run(lock_dir, pool):
+    """Case 14 (ppid race): a helper whose parent died BEFORE the prctl was armed (it starts
+    already reparented to pid 1) must refuse to run — PDEATHSIG would never fire for it."""
+    # Spawn an intermediate parent that forks orphan-probe and exits immediately.
+    import subprocess
+
+    out = os.path.join(pool.out_dir, "orphan.out")
+    intermediate = (
+        "import subprocess, sys, os; "
+        "subprocess.Popen([sys.executable, os.environ['CCCI_HELPERS'], 'orphan-probe']); "
+    )
+    env = dict(
+        os.environ,
+        CCCI_HELPER_OUT=out,
+        CCCI_HELPERS=os.path.join(os.path.dirname(__file__), "helpers.py"),
+    )
+    subprocess.run([sys.executable, "-c", intermediate], env=env, timeout=15, check=True)
+    line = wait_marker(out, "REFUSED", timeout=20)
+    assert line, "orphaned helper did not refuse to run (or never reparented to pid 1)"
+
+
+def test_15_deadline_alarm_fires_teardown_and_releases(lock_dir, pool):
+    """Case 15: the self-deadline (alarm). A guarded helper with a 2s deadline tears down via
+    the funnel (finally: ran), exits NON-zero, and its lock is released."""
+    p, out = pool.spawn("guarded", DOMAIN, "2")
+    assert wait_marker(out, "READY")
+    rc = p.wait(timeout=20)
+    assert rc != 0, f"deadline exit must be non-zero (got {rc})"
+    assert rc == 128 + signal.SIGALRM, f"expected 142 (128+SIGALRM), got {rc}"
+    assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run on deadline"
+    assert wait_lock_state(DOMAIN, "free") == "free"
+
+
+def test_16_sigterm_runs_teardown_funnel_and_releases(lock_dir, pool):
+    """Case 16: SIGTERM (drone cancel path) -> the finally: teardown funnel runs, exit is
+    non-zero, lock released."""
+    p, out = pool.spawn("guarded", DOMAIN, "3600")
+    assert wait_marker(out, "READY")
+    p.send_signal(signal.SIGTERM)
+    rc = p.wait(timeout=20)
+    assert rc != 0, f"SIGTERM exit must be non-zero (got {rc})"
+    assert rc == 128 + signal.SIGTERM, f"expected 143 (128+SIGTERM), got {rc}"
+    assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run on SIGTERM"
+    assert wait_lock_state(DOMAIN, "free") == "free"
diff --git a/tests/concurrency/test_locks.py b/tests/concurrency/test_locks.py
new file mode 100644
index 0000000..8059dbb
--- /dev/null
+++ b/tests/concurrency/test_locks.py
@@ -0,0 +1,85 @@
+"""Lock fundamentals (concurrency-restructure plan, cases 1-4). Real kernel flocks held by real
+subprocesses — nothing mocked."""
+
+from __future__ import annotations
+
+import fcntl
+import os
+import sys
+import time
+
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
+from concutil import (  # noqa: E402
+    DOMAIN,
+    lock_state,
+    wait_lock_state,
+    wait_marker,
+)
+from harness import lifecycle  # noqa: E402
+
+
+def test_1_sigkill_releases_lock(lock_dir, pool):
+    """Case 1: acquire -> holder SIGKILL'd -> lock immediately acquirable (kernel auto-release).
+    The exact property the old pidfile registry approximated with /proc checks."""
+    p, out = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out, "ACQUIRED"), "holder never acquired"
+    assert lock_state(DOMAIN) == "held"
+    p.kill()
+    p.wait(timeout=10)
+    assert wait_lock_state(DOMAIN, "free") == "free"
+
+
+def test_2_nb_probe_held_vs_unheld(lock_dir, pool):
+    """Case 2: LOCK_NB probe raises BlockingIOError against a held lock; succeeds when unheld."""
+    p, out = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out, "ACQUIRED")
+    path = lifecycle._app_lock_path(DOMAIN)  # noqa: SLF001
+    with open(path, "a") as f:
+        try:
+            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            raise AssertionError("LOCK_NB succeeded against a held lock")
+        except BlockingIOError:
+            pass
+    p.kill()
+    p.wait(timeout=10)
+    assert wait_lock_state(DOMAIN, "free") == "free"
+    with open(path, "a") as f:
+        fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)  # must not raise now
+
+
+def test_3_lock_fd_not_inherited_by_children(lock_dir, pool):
+    """Case 3 (PEP 446): the holder spawns a subprocess child, the holder dies, the child lives —
+    and the lock is STILL released (the child never inherited the lock fd). This is what makes
+    'held lock == live HARNESS owner' sound even though runs spawn abra/docker/pytest children."""
+    p, out = pool.spawn("hold-with-child", DOMAIN)
+    assert wait_marker(out, "ACQUIRED")
+    child_line = wait_marker(out, "CHILD")
+    assert child_line, "holder never reported its child pid"
+    child_pid = int(child_line.split()[1])
+    pool.track_pid(child_pid)
+    p.kill()
+    p.wait(timeout=10)
+    assert os.path.exists(f"/proc/{child_pid}"), "child should outlive the holder"
+    assert (
+        wait_lock_state(DOMAIN, "free") == "free"
+    ), "lock must release on holder death even with a live child (PEP 446 non-inheritable fd)"
+
+
+def test_4_second_acquire_blocks_until_first_exits(lock_dir, pool):
+    """Case 4: a second same-domain acquire blocks until the first holder exits — the
+    double-!testme serialisation property."""
+    p1, out1 = pool.spawn("hold", DOMAIN)
+    assert wait_marker(out1, "ACQUIRED")
+    p2, out2 = pool.spawn("hold", DOMAIN)
+    # p2 must NOT acquire while p1 holds.
+    time.sleep(1.5)
+    assert wait_marker(out2, "ACQUIRED", timeout=0.1) is None, "second acquire did not block"
+    t_kill = time.time()
+    p1.kill()
+    p1.wait(timeout=10)
+    line = wait_marker(out2, "ACQUIRED", timeout=15)
+    assert line, "second acquire never completed after first holder exited"
+    acquired_ts = float(line.split()[1])
+    assert acquired_ts >= t_kill - 0.05, "second holder acquired before the first exited"
+    assert lock_state(DOMAIN) == "held"

From d3fe9e26bb0fbaedb37383539ba3973bc1c80aff Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@git.autonomic.zone>
Date: Wed, 10 Jun 2026 04:32:54 +0000
Subject: [PATCH 6/6] =?UTF-8?q?docs:=20P5=20concurrency=20spec=20rewrite?=
 =?UTF-8?q?=20=E2=80=94=20one=20lock,=20one=20structural=20isolation,=20th?=
 =?UTF-8?q?e=20invariant=20chain?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rewritten to the restructured model: lifetime-hardening guards (PDEATHSIG/SIGTERM/SIGALRM +
setsid/trap), per-run ABRA_DIR isolation (same-recipe runs now parallel), per-app-domain flock
(double-!testme serialisation), flock-probe janitor decision table (incl. the inode-identity
race rows), updated failure-mode table (cancel now tears down via the harness's own funnel;
reboot reaps immediately; 60-min deadline bounds everything), single-knob config table, how to
run tests/concurrency, fresh file/symbol index + deleted-symbol list for grep verification.
Also drops the last stale concurrency.limit mention from the .drone.yml header comment.
---
 .drone.yml          |   4 +-
 docs/concurrency.md | 311 ++++++++++++++++++++++++++------------------
 2 files changed, 188 insertions(+), 127 deletions(-)

diff --git a/.drone.yml b/.drone.yml
index d7c3674..cc709f3 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -35,8 +35,8 @@ steps:
 # the comment-bridge). Deploys the recipe at the PR head, runs install/upgrade/backup + any
 # recipe-local tests via the shared harness, then guarantees teardown (plan §4.2/§4.3).
 #
-# Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix) +
-# concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by
+# Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix, the
+# single concurrency knob) allows two recipe runs in parallel. Concurrent-run safety is enforced by
 # the harness, not by serialisation: every run holds an exclusive flock on its app domain
 # (/run/lock/cc-ci-app-<domain>.lock) for its whole process lifetime, the run-start janitor probes
 # that lock to reap only orphans (held lock = live run, never touched), and recipe working trees
diff --git a/docs/concurrency.md b/docs/concurrency.md
index 64d4496..06858f1 100644
--- a/docs/concurrency.md
+++ b/docs/concurrency.md
@@ -1,167 +1,228 @@
 # Concurrency: how parallel recipe CI runs stay safe
 
-Spec of the concurrent-run system as of 2026-06-10 (commits `c0df77d`, `68ef0f8`, `e6d55b5`).
-Written for review/restructuring — it documents what IS, including known limitations.
+Spec of the concurrent-run system after the 2026-06-10 restructure (branch
+`restructure/concurrency`; plan: cc-ci-plan `concurrency-restructure-full-plan.md`). The previous
+registry + per-recipe-flock model is documented in this file's git history (`5b65c6c`).
 
 ## 1. Goal and design summary
 
-Two recipe CI builds may run **at the same time** on the single cc-ci host (e.g. immich and
-plausible under active development at once). Safety is enforced by the **harness**, not by
-serialising everything:
+Two recipe CI builds may run **at the same time** on the single cc-ci host. Safety is enforced by
+the **harness**, not by serialising everything, and rests on ONE locking mechanism plus ONE
+structural isolation:
 
 | Rule | Mechanism |
 |---|---|
 | Different recipes run in parallel | nothing blocks them (isolation, §3) |
-| Same-recipe runs serialise | per-recipe `flock` (§4) |
-| A starting run never reaps a live concurrent run's app | active-run registry + three-way janitor (§5) |
-| A crashed run's leftovers still get reaped | registry owner-dead detection, age fallback (§5) |
+| Same-RECIPE runs run in parallel too | per-run `ABRA_DIR` recipe trees (§4) — no shared tree, no lock |
+| Same-DOMAIN runs (double-`!testme` of one PR) serialise | per-app-domain `flock` (§5) |
+| A starting run never reaps a live concurrent run's app | janitor probes the app lock; held = live (§6) |
+| A crashed/canceled/rebooted run's leftovers get reaped | lock auto-released by the kernel → probe acquires → reap (§6) |
 
-There is **no daemon and no shared state service**: both mechanisms are kernel/file primitives
-under `/run`, scoped to the harness process lifetime, so a `SIGKILL`'d run can never leak a stale
-lock or a stale "I'm alive" claim.
+The invariant chain that makes "held lock = live owner" sound:
 
-## 2. Configuration knobs
+```
+lock lifetime ⊆ harness process lifetime ⊆ drone step lifetime ⊆ 60-min hard deadline
+```
 
-| Knob | Where | Current | Meaning |
-|---|---|---|---|
-| `DRONE_RUNNER_CAPACITY` (aka `MAX_TESTS`) | `nix/modules/drone-runner.nix` (`maxTests` let-binding) | `2` | **THE cap.** Max builds the exec runner executes at once; Drone queues the rest in its native pending queue. Change requires `nixos-rebuild switch` on cc-ci. |
-| `concurrency.limit` | `.drone.yml`, `recipe-ci` pipeline | `2` | Server-side cap on concurrent `recipe-ci` pipelines. Kept equal to capacity; redundant belt (the push pipeline shares runner capacity too, so lint builds can interleave). |
-| `CCCI_JANITOR_MAX_AGE` | env, read in `lifecycle.janitor()` | unset → `7200`s | **Age fallback only** — applies solely to apps with no registry entry (§5 case 3). The capacity=1-era `"0"` override in `.drone.yml` is GONE; do not reintroduce it (it made a starting build reap in-flight runs). |
-| `RECIPE_LOCK_DIR` | `lifecycle.py` constant | `/run/lock` | Where per-recipe lock files live. |
-| `ACTIVE_RUN_DIR` | `lifecycle.py` constant | `/run/cc-ci-active` | Where active-run pidfiles live. |
+- **lock ⊆ process**: locks are kernel flocks on fds the process holds (and PEP 446 makes those
+  fds non-inheritable, so abra/docker/pytest children never carry them). The kernel releases them
+  on process death, however it dies. There is no unlock code path and no stale-lock failure mode.
+- **process ⊆ step**: `PR_SET_PDEATHSIG(SIGTERM)` + the `.drone.yml` setsid/trap wrap (§2) — a
+  dead or canceled build cannot leak a running harness.
+- **step ⊆ 60 min**: `signal.alarm(3600)` self-deadline (§2).
 
-Memory budget rationale for capacity=2 (Hetzner cpx22, ~7.6 GiB): a full immich stack measured
-~1 GiB; two concurrent recipes fit. Revert `maxTests` to `"1"` if OOM/disk-I/O contention appears.
+Never steal a held lock; manage the holder's lifetime. There is **no daemon and no shared state
+service** — everything is kernel/file primitives under `/run/lock` and per-run directories.
+
+## 2. Mechanism 0: run-lifetime hardening (`runner/harness/lifetime.py`)
+
+`run_recipe_ci.main()` calls `lifetime.install_lifetime_guards()` before ANY abra call or lock
+acquisition:
+
+1. **`PR_SET_PDEATHSIG(SIGTERM)`** (ctypes prctl, return code checked): if the parent — the drone
+   step shell — dies, the kernel TERMs the harness. A post-prctl `ppid == 1` re-check closes the
+   start race: a harness whose parent died *before* the prctl armed would never get the signal,
+   so it refuses to run orphaned.
+2. **SIGTERM handler**: logs, then raises `SystemExit(143)` so the run's `finally:` teardown
+   funnel executes and the process exits non-zero. Re-entrant signals during teardown are logged
+   and IGNORED (`lifetime.begin_teardown()`, also set at the top of the run's `finally:` blocks)
+   so a second signal can't abort the cleanup the first one asked for.
+3. **`signal.alarm(3600)` hard deadline**: SIGALRM funnels into the same teardown path with a
+   distinct log line (`== run exceeded 60-minute hard deadline — tearing down ==`), exit 142.
+   Recipes keep their own smaller per-tier timeouts; this bounds the whole run. Teardown time
+   after the deadline is deliberately not alarm-bounded — the janitor is the backstop if a
+   teardown wedges and the process is killed harder.
+
+The `.drone.yml` recipe-ci step runs the harness as `setsid cc-ci-run … &` with a
+`trap 'kill -TERM -- "-$PID"' TERM EXIT; wait "$PID"` — a drone **cancel** (TERM to the step
+shell) is forwarded to the harness's whole process group instead of leaking it (the exec runner
+only kills the step shell). PDEATHSIG backstops the no-trap paths.
 
 ## 3. Isolation model: what is shared, what is per-run
 
 Per-run (no conflict possible):
 
-- **App + stack + volumes + secrets.** The run app domain is deterministic and unique per
-  (recipe, pr, ref): `naming.app_domain()` → `<recipe[:4]>-<sha1(recipe|pr|ref)[:6]>.ci.commoninternet.net`.
-  Everything abra creates is namespaced by it. Run apps are recognised by
-  `RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`; canonical/warm apps
-  (e.g. `warm-keycloak...`) deliberately do NOT match, so the janitor never touches them.
-- **Drone build workspace.** The exec runner gives each build its own clone under
-  `/var/lib/drone-runner/drone-<id>/` — harness code and test files are per-build.
-- **Run artifacts.** `/var/lib/cc-ci-runs/<build-number>/`.
+- **App + stack + volumes + secrets.** Run app domain = `naming.app_domain()` →
+  `<recipe[:4]>-<sha1(recipe|pr|ref)[:6]>.ci.commoninternet.net`, unique per (recipe, pr, ref);
+  everything abra creates is namespaced by it. Run apps are recognised by
+  `RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`; warm/canonical apps
+  (e.g. `warm-keycloak...`) deliberately do NOT match → the janitor never probes them.
+- **Recipe working trees** — `$ABRA_DIR/recipes/<recipe>`, per run (§4). NEW in the restructure.
+- **Drone build workspace** (`/var/lib/drone-runner/drone-<id>/`) and **run artifacts**
+  (`/var/lib/cc-ci-runs/<run-id>/`).
 
-Shared (the two hazards the mechanisms exist for):
+Shared (by design, conflict-free):
 
-- **`~/.abra/recipes/<recipe>`** — ONE working tree per recipe (abra's own layout). The harness
-  `fetch_recipe()` `rm -rf`'s + reclones it at run start, and the upgrade tier `git checkout`s it
-  mid-run for the chaos redeploy. Two same-recipe runs would corrupt each other's deploy tree
-  (observed: immich builds 229/230 deployed a tree missing its config). → per-recipe flock (§4).
-- **`HOME=/root`** — forced in `.drone.yml` so abra finds its server config under `/root/.abra`.
-  Safe *given* the above: app names are unique and same-recipe runs serialise, so no two builds
-  touch the same recipe checkout or app env file.
+- **`/root/.abra/servers`** — app `.env` files, one per domain. The per-run `ABRA_DIR` symlinks
+  `servers/` here, so .env files land in the canonical path: janitor discovery (`abra app ls`)
+  and out-of-run tooling see every app. Per-domain filenames + the app-domain lock prevent write
+  conflicts.
+- **`/root/.abra/catalogue`** — read-mostly, symlinked into each per-run dir.
+- **`HOME=/root`** (forced in `.drone.yml`) — safe: nothing recipe-mutable lives under `~/.abra`
+  for a run anymore except through the two symlinks above.
 
-## 4. Mechanism 1: per-recipe flock
+## 4. Mechanism 1: per-run `ABRA_DIR` (replaces the per-recipe flock)
 
-Code: `lifecycle.acquire_recipe_lock(recipe)`; taken in `run_recipe_ci.main()` **before**
-`fetch_recipe()` (the first shared-tree mutation).
+`run_recipe_ci.setup_run_abra_dir()` — called first thing in `main()`, before any abra call —
+builds `<runs_dir>/<run-id>/abra/` (run-id = Drone build number; `manual-<pid>` for hand runs):
 
-- Lock file: `/run/lock/cc-ci-recipe-<recipe>.lock`, exclusive `fcntl.flock`.
-- Non-blocking attempt first; on `BlockingIOError` it logs
-  `== recipe lock: another <recipe> run is in flight — waiting ... ==` and blocks. The waiting
-  run is visibly "stuck" in its drone log at that line — that is by design.
-- The open file object is returned and kept alive (`_recipe_lock = ...  # noqa: F841`) for the
-  **whole process lifetime**. Release is implicit: the kernel drops a flock when the fd closes —
-  including on crash or SIGKILL. **There is no stale-lock failure mode and no unlock code path.**
-- Scope: serialises only runs of the SAME recipe. Different recipes never contend.
+```
+abra/
+  servers/    -> /root/.abra/servers     (symlink; canonical shared .env path)
+  catalogue/  -> /root/.abra/catalogue   (symlink; read-mostly)
+  recipes/    fresh, empty               (THE isolation that matters)
+```
 
-## 5. Mechanism 2: active-run registry + three-way janitor
+and exports it as `$ABRA_DIR` — honored by the abra CLI itself and by every harness path helper
+(`abra.abra_dir()` / `abra.recipe_dir()`; `generic._recipe_dir`, `prepull_images`,
+`snapshot_recipe_tests`, `warm_reconcile._recipe_dir` all route through the same rule:
+`$ABRA_DIR` if set, else `~/.abra`).
 
-Why: every run starts with `lifecycle.janitor()` (called from both the cold path and the
-warm/quick path in `run_recipe_ci.py`) to reap orphans left by crashed/SIGKILL'd runs (whose
-`finally:` teardown never ran). Under capacity=2 "any run app that isn't mine" may be a LIVE
-concurrent run — age alone can't tell. The registry makes ownership explicit.
+- `fetch_recipe()` is now a plain clone into `$ABRA_DIR/recipes/<recipe>` (PR-head clone+checkout
+  or `abra recipe fetch`); the upgrade tier's mid-run `git checkout`s happen in the run's own
+  tree. Two same-recipe runs can no longer corrupt each other — structurally, with no lock. The
+  old observed failure (immich builds 229/230 deploying a tree missing its config) is impossible.
+- `CCCI_SKIP_FETCH=1` (test/Adversary staging) copies the canonically-staged
+  `~/.abra/recipes/<recipe>` clone into the per-run tree.
+- Out-of-run flows (warm_reconcile's systemd timer, manual abra) set no `ABRA_DIR` and keep using
+  the canonical `/root/.abra` unchanged. In-run flows that touch canonical state on purpose
+  (warm/canonical .env files) go through `servers/` and are unaffected.
+- The per-run dir rides along the existing `/var/lib/cc-ci-runs/<run-id>/` retention. abra
+  auto-clones any recipe it needs to resolve (e.g. during `app ls`) into the per-run `recipes/` —
+  a few seconds of git per run, gone with the run dir.
 
-Registry protocol (all in `lifecycle.py`):
+## 5. Mechanism 2: per-app-domain flock (`lifecycle.acquire_app_lock`)
 
-1. `register_run_app(domain)` — writes `/run/cc-ci-active/<domain>` containing the harness pid.
-   Called inside `deploy_app()` **before** the app is created, so no window exists where a
-   concurrent janitor can see the app without its registration.
-2. `unregister_run_app(domain)` — removes the pidfile. Called at the end of `teardown_app()`
-   (every exit path funnels through teardown) and by the janitor after reaping.
-3. `_run_owner_state(domain)` — classifies the owner:
-   - reads the pid; missing/garbled file → `"unknown"`
-   - `/proc/<pid>/cmdline` gone → `"dead"`
-   - cmdline must contain `run_recipe_ci` → `"alive"`, else `"dead"` (**pid-reuse guard**: a
-     recycled pid won't look like a harness run)
+- Lock file: `/run/lock/cc-ci-app-<domain>.lock` (dir overridable via `CCCI_APP_LOCK_DIR` for the
+  test suite), exclusive `fcntl.flock`, taken in `deploy_app()` **before the app is created** — a
+  concurrent janitor can never see a run app without its held lock.
+- Blocks (with a log line: `== app lock: another run of <domain> is in flight — waiting ==`) when
+  another run of the SAME domain is in flight — the double-`!testme` serialisation point; the
+  waiting run is visibly parked at that line in its drone log, by design.
+- The returned file object is ALSO retained in module-level `_held_app_locks` — if a caller
+  dropped it, GC would close the fd and silently release the lock.
+- mtime is touched at acquisition: lock age feeds the janitor's long-held flag (§6).
+- **Unlink/recreate race guard**: the janitor unlinks reaped lockfiles, so after EVERY
+  acquisition the locked fd is verified to still be the inode the path names
+  (`fstat().st_ino == stat().st_ino`); a waiter that won a just-unlinked inode closes it and
+  retries on the live path. (A lock on an unlinked inode protects nothing: a later opener gets a
+  fresh inode and would acquire "the same" lock.)
+- Release is implicit: process exit (any kind). `teardown_app()` does NOT release or unlink —
+  a clean run's leftover lockfile is unheld and is unlinked on sight by the next janitor sweep.
 
-Janitor decision table (`lifecycle.janitor()`):
+## 6. The flock-probe janitor (`lifecycle.janitor`)
 
-| Owner state | Meaning | Action |
+Runs at every run start (cold + quick paths) and in the warm/upgrade sweeps. Candidate discovery
+is unchanged from the old model: `abra app ls` + a docker-service sweep (catches stacks whose
+`.env` is already gone), both matched against `RUN_APP_RE` — warm/canonical apps never match and
+are never probed.
+
+Decision table (per candidate domain, `_probe_and_reap`):
+
+| Probe (`LOCK_EX\|LOCK_NB`) | Meaning | Action |
 |---|---|---|
-| `alive` | live concurrent run | **never reap** (logs "is a live concurrent run — leaving it") |
-| `dead` | crashed run's definite orphan | reap immediately (`teardown_app(verify=False)`) + unregister |
-| `unknown` | pre-registry app, or post-reboot (`/run` is tmpfs) | age fallback: reap only if stack age ≥ `CCCI_JANITOR_MAX_AGE` (default 2h) |
+| acquires (+ inode identity OK) | nobody holds it → owner died (kernel-guaranteed) | **reap**: `teardown_app(verify=False)` WHILE HOLDING the probe lock, then unlink the lockfile, then release |
+| acquires, inode stale | another janitor reaped + unlinked while we raced | skip (reap already done; unlinking now would hit a newer run's file) |
+| `BlockingIOError` (held) | live concurrent run | leave it; if lockfile mtime > 120 min (2× the hard deadline): `!! lock for <domain> held >120min — possible leaked run; inspect with lslocks` — flag, **never steal** |
+| `open()` fails (`OSError`) | garbled/unopenable lockfile | skip + log, never crash |
 
-Candidate discovery is unchanged from before: `abra app ls` matches against `RUN_APP_RE`, plus a
-docker-service sweep that reconstructs domains for stacks whose `.env` was already deleted.
-
-## 6. Where convergence fits (adjacent, landed with this work)
-
-Parallel runs surfaced two swarm-convergence bugs that look like concurrency bugs but aren't —
-documented here because any restructuring must keep them fixed (`services_converged()` in
-`lifecycle.py`):
-
-- **N/N replicas ≠ converged** during a stop-first rolling update: the update is *registered*
-  instantly but the OLD task still shows 1/1 until swarm cycles it (build 238: backupbot exec'd a
-  pre-hook into a container killed seconds later → 409 → empty snapshot). `services_converged()`
-  therefore also inspects each service's `UpdateStatus.State`.
-- **`paused` persists forever**: swarm's default `update-failure-action: pause` sets it on one
-  task flicker and it never clears, even at N/N healthy (build 241 hung 22 min). Only `updating`
-  and `rollback_started` block convergence; `paused`/`rollback_paused`/`completed` are settled —
-  the HTTP-health and tier assertions still gate actual app correctness.
-- `backup_app()` additionally waits (bounded, 300s) for `services_converged()` before
-  `abra app backup create`, as defence in depth for the backupbot race.
+- Reaping under the probe lock closes the janitor-vs-new-run race: a new run of that domain
+  blocks in `acquire_app_lock` until the reap finishes — no window where a fresh app coexists
+  with a half-reaped one.
+- Two racing janitors arbitrate on the flock: one reaps, the other sees "held" and leaves; reaps
+  are idempotent (`teardown_app(verify=False)` tolerates half-gone stacks).
+- After the candidates, a tidy sweep unlinks stale **unheld** `cc-ci-app-*.lock` files with no
+  app behind them (under their own probe lock + identity check), keeping `/run/lock` clean.
+- **Post-reboot**: `/run/lock` is tmpfs → lockfiles gone → every surviving app probes as an
+  orphan → reaped immediately. (Improvement over the old 2-hour age fallback; there IS no age
+  logic anymore.)
 
 ## 7. Failure-mode guarantees
 
 | Event | Outcome |
 |---|---|
-| Run crashes / SIGKILL mid-run | flock auto-released by kernel; pidfile remains but owner is `dead` → next janitor (any run's start) reaps app + pidfile |
-| Drone build canceled via API | **known gap**: cancel kills the step's `sh` wrapper but can LEAK the python harness child — it keeps running (holding lock + registry) until killed by hand. See §8. |
-| Host reboot | `/run` is tmpfs → locks and registry vanish (correct: no processes survived either). All surviving apps become `unknown` → 2h age fallback governs. |
-| Two same-recipe `!testme`s | second blocks on the flock at run start (before touching the shared tree), runs after the first finishes |
-| Janitor vs. app being created | impossible to mis-reap: registration happens before app creation, and an `alive` owner is never reaped |
-| Pid reuse after crash | cmdline check (`run_recipe_ci`) classifies as `dead`, orphan still reaped |
+| Run crashes / SIGKILL mid-run | flock auto-released by kernel → next janitor probe reaps app + lockfile |
+| Drone build canceled via API | step trap TERMs the harness process group → SIGTERM funnel runs the run's own teardown (exit 143); if anything still leaks, PDEATHSIG + janitor reap (the old "cancel leaks the harness" gap is CLOSED) |
+| Run exceeds 60 min | SIGALRM → distinct log line → own teardown → exit 142 |
+| Host reboot | locks and lockfiles vanish (tmpfs, correct: no owners survived) → all surviving run apps reaped at the next run start, immediately |
+| Two same-recipe `!testme`s (different PRs) | run in parallel — separate domains, separate per-run recipe trees |
+| Double-`!testme` (same PR → same domain) | second blocks on the app lock before creating anything, visibly in its drone log, runs after the first finishes |
+| Janitor vs. app being created | impossible to mis-reap: the lock is held before `app new`, and a held lock is never touched |
+| Janitor unlink vs. blocked waiter | inode identity re-check on every acquisition → waiter retries on the live path |
+| Lock held implausibly long (>120 min) | flagged loudly for a human (`lslocks`), never stolen |
 
-## 8. Known limitations / restructuring candidates
+## 8. Where convergence fits (adjacent; unchanged by the restructure)
 
-1. **Drone cancel leaks the harness process.** The exec runner kills the step shell, not the
-   process tree; the leaked python continues deploying/holding the lock. Fix ideas: run the step
-   under `setsid` + a trap that kills the process group, or have the harness watch
-   `DRONE_BUILD_STATUS`/parent-death (`PR_SET_PDEATHSIG`).
-2. **Head-of-line blocking on same-recipe serialisation.** A run waiting on the recipe flock
-   still occupies one of the 2 runner slots, so two builds of the SAME recipe temporarily starve
-   all other recipes. Alternatives: a Drone-level per-recipe fan-out (one pipeline `concurrency`
-   group per recipe is not natively expressible), or detect-and-requeue in the harness.
-3. **The lock protects harness runs only.** Manual `abra`/`git` activity on
-   `~/.abra/recipes/<recipe>` (operator or another agent) bypasses the flock — the
-   "park the checkout, then hands off" discipline is still required. A restructure could make the
-   harness deploy from a per-run copy of the recipe tree instead of the shared checkout
-   (eliminates the lock entirely, at the cost of diverging from abra's expected layout).
-4. **`HOME=/root` is still shared.** Safe today by argument (§3), not by enforcement. Per-build
-   `ABRA_DIR` with a shared read-only server config would make isolation structural.
-5. **Registry is advisory.** Nothing stops a non-harness actor from creating run-app-shaped
-   stacks the janitor will eventually age-reap; conversely the janitor trusts pidfiles it can
-   parse. Acceptable on a single-purpose CI host.
-6. **Capacity is configured in two places** (`drone-runner.nix` + `.drone.yml`) that must be
-   kept in step by hand.
+Two swarm-convergence behaviors in `services_converged()` look like concurrency bugs but aren't —
+any future work must keep them fixed:
 
-## 9. File / symbol index
+- **N/N replicas ≠ converged** during a stop-first rolling update — `UpdateStatus.State` is also
+  inspected (build 238: backupbot exec'd into a container killed seconds later).
+- **`paused` persists forever** (swarm's default `update-failure-action`) — only `updating` and
+  `rollback_started` block convergence; `paused`/`rollback_paused` are settled (build 241).
+- `backup_app()` additionally waits (bounded 300s) for convergence before `backup create`.
+
+## 9. Configuration knobs
+
+| Knob | Where | Current | Meaning |
+|---|---|---|---|
+| `DRONE_RUNNER_CAPACITY` (aka `MAX_TESTS`) | `nix/modules/drone-runner.nix` (`maxTests`) | `2` | **THE single concurrency knob.** Max builds the exec runner executes at once; Drone queues the rest. (The `.drone.yml` `concurrency.limit` duplicate was removed.) Change requires `nixos-rebuild switch`. |
+| `CCCI_APP_LOCK_DIR` | env, read at call time | unset → `/run/lock` | App-domain lockfile dir override — used by `tests/concurrency` to sandbox locks. Never set in production. |
+| hard deadline | `lifetime.HARD_DEADLINE_SECONDS` | 3600 s | the whole-run alarm; long-held flag threshold is 2× this (`LONG_HELD_LOCK_SECONDS`) |
+
+## 10. Testing: `tests/concurrency/`
+
+Real-kernel suite (19 planned cases + companions): helper subprocesses hold REAL flocks and
+install the REAL prctl/signal/alarm guards — flock itself is never mocked; the janitor runs with
+injected candidates + stubbed teardown but probes real locks. **Not part of the default
+`pytest tests/unit` gate** (it spawns processes and sleeps); run it explicitly:
+
+```
+cc-ci-run -m pytest tests/concurrency -q
+```
+
+Covers: kernel auto-release on SIGKILL; LOCK_NB probe semantics; PEP 446 fd non-inheritance;
+same-domain serialisation; orphan reap + unlink; live-run protection; reap-under-probe-lock
+blocking; two-janitor arbitration; reboot-immediate reap; long-held flag; RUN_APP_RE allowlist;
+degrade-on-garbage; PDEATHSIG; ppid start race; deadline + SIGTERM funnels; per-run ABRA_DIR
+construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality.
+
+## 11. File / symbol index
 
 | What | Where |
 |---|---|
-| `maxTests` / `DRONE_RUNNER_CAPACITY` | `nix/modules/drone-runner.nix` |
-| `concurrency.limit`, `HOME=/root`, env | `.drone.yml` (`recipe-ci` pipeline) |
-| Lock + registry constants & helpers | `runner/harness/lifecycle.py` (top, after `TeardownError`) |
-| `acquire_recipe_lock` call site | `runner/run_recipe_ci.py` `main()`, before `fetch_recipe()` |
-| `register_run_app` call site | `lifecycle.deploy_app()` (before app creation) |
-| `unregister_run_app` call sites | `lifecycle.teardown_app()`, `lifecycle.janitor()` |
-| Janitor + decision table | `lifecycle.janitor()`, `_run_owner_state()` |
-| Run-app naming | `runner/harness/naming.py` (`app_domain`), `RUN_APP_RE` in `lifecycle.py` |
-| Convergence (adjacent) | `lifecycle.services_converged()`, `lifecycle.backup_app()` |
+| lifetime guards (PDEATHSIG, signal funnels, deadline) | `runner/harness/lifetime.py`; installed in `run_recipe_ci.main()` |
+| setsid/trap cancel forwarding | `.drone.yml` (`recipe-ci` step) |
+| `acquire_app_lock`, `_held_app_locks`, `_app_lock_path` | `runner/harness/lifecycle.py` |
+| `acquire_app_lock` call site | `lifecycle.deploy_app()` (before app creation) |
+| janitor + probe (`janitor`, `_probe_and_reap`, `LONG_HELD_LOCK_SECONDS`) | `runner/harness/lifecycle.py` |
+| per-run ABRA_DIR (`setup_run_abra_dir`, `fetch_recipe`) | `runner/run_recipe_ci.py` |
+| path resolution (`abra_dir`, `recipe_dir`) | `runner/harness/abra.py` (used by `generic`, `lifecycle.prepull_images`, `warm_reconcile`) |
+| run-app naming | `runner/harness/naming.py` (`app_domain`), `RUN_APP_RE` in `lifecycle.py` |
+| capacity knob | `nix/modules/drone-runner.nix` (`maxTests`) |
+| convergence (adjacent) | `lifecycle.services_converged()`, `lifecycle.backup_app()` |
+| the test suite | `tests/concurrency/` (`helpers.py` subprocess entrypoints, `concutil.py` probes) |
+
+Deleted in the restructure (grep should find NOTHING): `register_run_app`, `unregister_run_app`,
+`_run_owner_state`, `ACTIVE_RUN_DIR`, `CCCI_JANITOR_MAX_AGE`, `_stack_age_seconds`,
+`acquire_recipe_lock`, `RECIPE_LOCK_DIR`.