From b492f995bdfc4be2248938d8b74ed0da15147fb1 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Wed, 10 Jun 2026 04:04:28 +0000 Subject: [PATCH] =?UTF-8?q?feat(harness):=20P1=20lock-lifetime=20hardening?= =?UTF-8?q?=20=E2=80=94=20PDEATHSIG=20+=20SIGTERM/SIGALRM=20teardown=20fun?= =?UTF-8?q?nel=20+=2060-min=20hard=20deadline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - new harness/lifetime.py: install_lifetime_guards() arms PR_SET_PDEATHSIG(SIGTERM) (with post-prctl ppid==1 orphan refusal), a SIGTERM handler raising SystemExit through the run's finally: teardown funnel (exit 143), and signal.alarm(3600) funnelling SIGALRM the same way with a distinct deadline log line (exit 142). Re-entrant signals during teardown are logged and ignored (begin_teardown guard) so a second signal can't abort the running cleanup. - run_recipe_ci.main(): guards installed first thing, before any abra call/lock; both teardown finally: blocks (cold + quick) mark begin_teardown(). - .drone.yml recipe-ci step: harness runs under setsid in its own process group; a trap forwards the step shell's TERM/EXIT to the whole group so drone cancel reaches the harness instead of leaking it (docs/concurrency.md §8.1). - PEP 446 note on the recipe-lock open(): the fd is non-inheritable, children never carry it. --- .drone.yml | 11 ++++- runner/harness/lifecycle.py | 1 + runner/harness/lifetime.py | 95 +++++++++++++++++++++++++++++++++++++ runner/run_recipe_ci.py | 9 ++++ 4 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 runner/harness/lifetime.py diff --git a/.drone.yml b/.drone.yml index 5c4c88b..d46adcc 100644 --- a/.drone.yml +++ b/.drone.yml @@ -70,4 +70,13 @@ steps: # build's custom params. CCCI_QUICK=1 makes run_recipe_ci take the opt-in fast lane (WC7); # absent => full cold (default). run_quick ignores STAGES (always upgrade+custom). - 'echo "recipe-ci: RECIPE=$RECIPE REF=$REF PR=$PR SRC=$SRC stages=$STAGES quick=${CCCI_QUICK:-0}"' - - cc-ci-run runner/run_recipe_ci.py + # P1 lock-lifetime hardening: run the harness in its own session/process group (setsid) and + # forward a drone cancel (TERM to this step shell) to the WHOLE group, so the harness's + # SIGTERM handler runs its teardown funnel instead of being leaked (the exec runner kills + # only the step shell, not the tree). PDEATHSIG inside the harness backstops the case where + # this shell dies without the trap firing. `wait` propagates the harness exit code. + - | + setsid cc-ci-run runner/run_recipe_ci.py & + PID=$! + trap 'kill -TERM -- "-$PID" 2>/dev/null' TERM EXIT + wait "$PID" diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 5c44c81..2346948 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -48,6 +48,7 @@ def acquire_recipe_lock(recipe: str): recipe is in flight. Returns the open lock file — the CALLER must keep a reference for the whole run; the lock is released only when the process exits and the fd closes.""" path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock") + # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. f = open(path, "w") # noqa: SIM115 — deliberately held for the lifetime of the run try: fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) diff --git a/runner/harness/lifetime.py b/runner/harness/lifetime.py new file mode 100644 index 0000000..4ad7f60 --- /dev/null +++ b/runner/harness/lifetime.py @@ -0,0 +1,95 @@ +"""Run-lifetime hardening (concurrency restructure P1). + +The concurrency model's invariant chain is: + + lock lifetime ⊆ harness process lifetime ⊆ drone step lifetime ⊆ 60-min hard deadline + +Locks are kernel flocks released on process exit, so the only thing that needs managing is the +PROCESS lifetime. Three guards, installed at run startup (before any abra call) by +`install_lifetime_guards()`: + + 1. `PR_SET_PDEATHSIG(SIGTERM)`: if the parent (the drone step shell) dies — cancel, runner + crash, host shutdown of the step — the kernel delivers SIGTERM to the harness, so a dead + build can never leak a running harness that holds locks. Paired with a ppid==1 re-check + AFTER the prctl: a parent that died BEFORE the prctl took effect would never trigger the + death signal, so a harness that finds itself already reparented refuses to run. + 2. SIGTERM handler: raise SystemExit so the run's `finally:` teardown funnel executes and the + process exits non-zero. Re-entrant deliveries during teardown are logged and IGNORED so a + second signal can't abort the cleanup the first one asked for (`begin_teardown()` guards + this; the run's own `finally:` blocks also call it so a signal landing mid-normal-teardown + can't abort that either). + 3. `signal.alarm(3600)`: self-imposed hard deadline. SIGALRM funnels into the same teardown + path with a distinct log line. Teardown time after the deadline is not alarm-bounded — + interrupting a teardown buys nothing; the janitor (flock probe) is the backstop if a + teardown wedges and the process is killed harder. +""" + +from __future__ import annotations + +import ctypes +import os +import signal +import sys + +HARD_DEADLINE_SECONDS = 60 * 60 + +_PR_SET_PDEATHSIG = 1 # linux/prctl.h + +_state = {"tearing_down": False} + + +def begin_teardown() -> None: + """Mark the teardown funnel as running. From here on SIGTERM/SIGALRM must NOT raise — it + would abort the very cleanup it asks for — so the handlers log and return instead. Called by + the handlers themselves before raising, and at the top of the run's `finally:` blocks.""" + _state["tearing_down"] = True + + +def _funnel_handler(log_line: str, exit_code: int): + """A signal handler that routes into the teardown funnel exactly once: log, then raise + SystemExit (propagates through the run's try/finally → teardown executes → non-zero exit). + While teardown is already running, further signals are logged and swallowed.""" + + def handler(signum: int, frame) -> None: # noqa: ARG001 + print(log_line, flush=True) + if _state["tearing_down"]: + print( + f"== signal {signum} during teardown — ignored (teardown continues, " + "exit stays non-zero) ==", + flush=True, + ) + return + begin_teardown() + raise SystemExit(exit_code) + + return handler + + +def install_lifetime_guards(deadline_seconds: int = HARD_DEADLINE_SECONDS) -> None: + """Install all three lifetime guards (see module docstring). Must run at harness startup, + before any abra call and before any lock is taken.""" + libc = ctypes.CDLL("libc.so.6", use_errno=True) + if libc.prctl(_PR_SET_PDEATHSIG, signal.SIGTERM, 0, 0, 0) != 0: + err = ctypes.get_errno() + raise OSError(err, f"prctl(PR_SET_PDEATHSIG, SIGTERM) failed: {os.strerror(err)}") + # The prctl is armed now — but only fires for a parent death AFTER this point. If the parent + # already died, we are reparented (ppid 1) and would never get the signal: refuse to run, an + # orphaned harness would hold locks/apps with nothing managing its lifetime. + if os.getppid() == 1: + sys.exit("parent died before prctl(PR_SET_PDEATHSIG) — refusing to run orphaned") + signal.signal( + signal.SIGTERM, + _funnel_handler( + "== SIGTERM received (drone cancel / parent death) — tearing down ==", + 128 + signal.SIGTERM, + ), + ) + minutes = deadline_seconds // 60 + signal.signal( + signal.SIGALRM, + _funnel_handler( + f"== run exceeded {minutes}-minute hard deadline — tearing down ==", + 128 + signal.SIGALRM, + ), + ) + signal.alarm(deadline_seconds) diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index c7e55da..5884e63 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -47,6 +47,7 @@ from harness import ( # noqa: E402 discovery, generic, lifecycle, + lifetime, naming, warm, warmsnap, @@ -658,6 +659,8 @@ def run_quick( results["upgrade"] = "fail" results["custom"] = "skip" finally: + # Teardown funnel running: further SIGTERM/SIGALRM are logged + ignored (lifetime.py). + lifetime.begin_teardown() # F2-11 skip count (read before deciding pass/fail) requires_deps_skipped = 0 try: @@ -821,6 +824,9 @@ def promote_canonical(recipe: str, head_ref: str | None) -> None: def main() -> int: + # P1 lock-lifetime hardening: PDEATHSIG + SIGTERM/SIGALRM teardown funnel + 60-min hard + # deadline, armed before ANY abra call or lock acquisition (see harness/lifetime.py). + lifetime.install_lifetime_guards() recipe = os.environ.get("RECIPE") if not recipe: print("RECIPE env is required", file=sys.stderr) @@ -1123,6 +1129,9 @@ def main() -> int: if op in stages: results[op] = "skip" finally: + # From here the teardown funnel runs: a SIGTERM/SIGALRM landing now is logged + ignored + # (lifetime.py) so a second signal can't abort the cleanup the first one asked for. + lifetime.begin_teardown() # Teardown the recipe under test FIRST, then deps in reverse declaration order. # Parent verify=False (Phase 1d): keep as-is so a parent residual doesn't mask a tier # failure. Dep teardown uses verify=True via teardown_deps (F2-5 fix); failures are