diff --git a/.drone.yml b/.drone.yml index d356171..9908c8b 100644 --- a/.drone.yml +++ b/.drone.yml @@ -39,9 +39,8 @@ steps: # concurrency.limit=2 below allow two recipe runs in parallel. Concurrent-run safety is enforced by # the harness, not by serialisation: every run holds an exclusive flock on its app domain # (/run/lock/cc-ci-app-<domain>.lock) for its whole process lifetime, the run-start janitor probes -# that lock to reap only orphans (held lock = live run, never touched), and same-recipe runs -# serialise on a per-recipe flock for the shared ~/.abra/recipes/<recipe> checkout -# (lifecycle.acquire_recipe_lock — removed by P3's per-run ABRA_DIR). See docs/concurrency.md. +# that lock to reap only orphans (held lock = live run, never touched), and recipe working trees +# are per-run ($ABRA_DIR/recipes — no shared checkout, no recipe lock). See docs/concurrency.md. kind: pipeline type: exec name: recipe-ci @@ -61,10 +60,11 @@ steps: - name: ci environment: STAGES: install,upgrade,backup,restore,custom - # The exec runner points HOME at a per-build workspace; force it to /root so abra finds its - # server config + recipes under /root/.abra (as the manual M4/M5 runs did). Safe with - # capacity=2: app names are unique per (recipe,pr,ref) and same-recipe runs serialise on the - # per-recipe flock, so concurrent builds never touch the same recipe checkout or app. + # The exec runner points HOME at a per-build workspace; force it to /root so abra's server + # config is found via the per-run ABRA_DIR's servers/ symlink -> /root/.abra/servers. + # Recipe trees are PER-RUN ($ABRA_DIR/recipes, exported by run_recipe_ci before any abra + # call), so concurrent builds never share a recipe checkout; app .env files are per-domain + # in the shared canonical servers/ path, guarded by the app-domain flock. 
HOME: /root commands: # RECIPE/REF/PR/SRC (+ CCCI_QUICK for `!testme --quick`) are injected as env vars from the diff --git a/runner/harness/abra.py b/runner/harness/abra.py index 1c92066..b9d6454 100644 --- a/runner/harness/abra.py +++ b/runner/harness/abra.py @@ -10,6 +10,7 @@ Bakes in the known abra gotchas (re-verify per installed abra version, currently from __future__ import annotations import json +import os import subprocess ABRA = "abra" @@ -19,6 +20,20 @@ class AbraError(RuntimeError): pass +def abra_dir() -> str: + """abra's state dir, resolved the same way the abra CLI resolves it: $ABRA_DIR if set, else + ~/.abra. Inside a CI run, run_recipe_ci exports a PER-RUN $ABRA_DIR (fresh recipes/, shared + servers/+catalogue/ symlinks) before any abra call, so every helper here and every abra + subprocess agree on the same tree; outside a run (warm_reconcile's systemd timer, manual use) + both fall back to the canonical /root/.abra.""" + return os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra") + + +def recipe_dir(recipe: str) -> str: + """The current ABRA_DIR's working tree for a recipe (per-run inside a CI run).""" + return os.path.join(abra_dir(), "recipes", recipe) + + def _run_pty( args: list[str], timeout: int = 900, check: bool = True ) -> subprocess.CompletedProcess: @@ -77,9 +92,7 @@ def recipe_checkout(recipe: str, version: str) -> None: a chaos (`-C`) deploy ignores ENV VERSION and uses the current checkout — together that silently deployed LATEST for a 'previous-version' base, making the upgrade a no-op (Adversary F1d-2). With this checkout + a non-chaos deploy, a pinned deploy genuinely deploys that version.""" - import os - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) # -f (force): the version-pinning checkout must yield the EXACT ref tree. Without it, a cc-ci # install_steps-provided overlay (e.g. 
discourse's compose.ccci.yml, copied into the pinned base) # is an UNTRACKED file that collides with the same path TRACKED in a later ref, and @@ -100,9 +113,7 @@ def has_lightweight_version_tags(recipe: str) -> bool: 'reference not found'.) The caller (deploy_app) uses this to fall back to a chaos base deploy (which skips lint and deploys the explicitly-checked-out pinned version — see lifecycle.deploy_app). Read-only: just `git tag` + `cat-file -t`; no fetch/mutation, so it can't trigger abra's revert.""" - import os - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) tags = subprocess.run( ["git", "-C", path, "tag", "-l"], capture_output=True, text=True ).stdout.split() @@ -231,9 +242,7 @@ def recipe_head_commit(recipe: str) -> str | None: """The current HEAD commit of the recipe checkout — captured right after fetch (the PR head, or the catalogue current) so the upgrade tier can re-checkout it for the chaos redeploy after the prev-tag base deploy reset the working tree (HC1).""" - import os - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) proc = subprocess.run(["git", "-C", path, "rev-parse", "HEAD"], capture_output=True, text=True) out = proc.stdout.strip() return out or None @@ -241,10 +250,7 @@ def recipe_head_commit(recipe: str) -> str | None: def recipe_versions(recipe: str) -> list[str]: """Published versions of a recipe, oldest→newest (from the recipe git tags).""" - import os - import subprocess - - path = os.path.expanduser(f"~/.abra/recipes/{recipe}") + path = recipe_dir(recipe) proc = subprocess.run( ["git", "-C", path, "tag", "--sort=creatordate"], capture_output=True, text=True ) diff --git a/runner/harness/generic.py b/runner/harness/generic.py index f42b896..d468dbf 100644 --- a/runner/harness/generic.py +++ b/runner/harness/generic.py @@ -25,7 +25,7 @@ _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE) def _recipe_dir(recipe: str) -> str: - 
return os.path.expanduser(f"~/.abra/recipes/{recipe}") + return abra.recipe_dir(recipe) # the per-run tree inside a CI run ($ABRA_DIR) def backup_capable(recipe: str, meta: dict | None = None) -> bool: diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py index 210a011..10a1d4b 100644 --- a/runner/harness/lifecycle.py +++ b/runner/harness/lifecycle.py @@ -37,31 +37,9 @@ class TeardownError(RuntimeError): # however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held → # leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never # steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each -# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR) -# serialise on this app lock. See docs/concurrency.md. -RECIPE_LOCK_DIR = "/run/lock" - - -def acquire_recipe_lock(recipe: str): - """Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes - checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and - with it this lock, structurally unnecessary); until then the caller keeps the returned file - alive for the whole run and release is implicit at process exit.""" - path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock") - # PEP 446: the fd is non-inheritable, so subprocess children never carry the lock. 
- f = open(path, "w") # noqa: SIM115 — deliberately held for the lifetime of the run - try: - fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) - except BlockingIOError: - print( - f"== recipe lock: another {recipe} run is in flight — waiting for {path} " - "(shared ~/.abra/recipes checkout) ==", - flush=True, - ) - fcntl.flock(f, fcntl.LOCK_EX) - print(f"== recipe lock: acquired {path} ==", flush=True) - return f - +# run deploys from its own per-run ABRA_DIR — there is no shared recipe tree and no recipe lock), +# and same-domain runs (double-!testme of one PR) serialise on this app lock. +# See docs/concurrency.md. # Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the # caller drops the returned file object, GC would close the fd and silently release the lock — @@ -209,9 +187,9 @@ def prepull_images(recipe: str, domain: str) -> None: app-INIT time (slow-init apps like collabora/immich still need their recipe healthcheck/READY_PROBE). Best-effort on resolution failure (skip + let the deploy pull as usual); HARD-fails on a real pull error (don't mask it).""" - import os - - recipe_dir = os.path.expanduser(f"~/.abra/recipes/{recipe}") + recipe_dir = abra.recipe_dir(recipe) # per-run tree inside a CI run + # The app .env lives in the CANONICAL servers path (the per-run ABRA_DIR's servers/ is a + # symlink to it, so abra and this path agree on the same file). env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env") if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path): print(f" prepull: recipe dir or .env missing for {recipe} — skipping", flush=True) diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index 5884e63..9f8f539 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -138,18 +138,62 @@ def _gitea_token() -> str | None: return tok or None +def setup_run_abra_dir() -> str: + """P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees. 
+ + `<runs_dir>/<run_id>/abra/` with: + servers/ -> symlink to the canonical ~/.abra/servers. App .env files land in the shared + canonical path, so janitor discovery (`abra app ls`) and env-based teardown + work unchanged from any process; per-domain filenames + the app-domain lock + prevent write conflicts. + catalogue/ -> symlink to the canonical ~/.abra/catalogue (read-mostly). + recipes/ fresh + empty — THE isolation that matters: each run clones and git-checkouts + its own recipe trees, so concurrent runs (same recipe included) can never + corrupt each other's deploy tree. Replaces the per-recipe flock. + Exported as $ABRA_DIR — honored by the abra CLI and by every harness path helper + (abra.abra_dir()) — BEFORE any abra call. Rides along the existing run-dir retention.""" + canonical = os.path.expanduser("~/.abra") + rid = results_mod.run_id() + if rid == "manual": + rid = f"manual-{os.getpid()}" # two concurrent hand-runs must not share a tree + run_abra_dir = os.path.join(results_mod.runs_dir(), rid, "abra") + os.makedirs(os.path.join(run_abra_dir, "recipes"), exist_ok=True) + for shared in ("servers", "catalogue"): + link = os.path.join(run_abra_dir, shared) + if not os.path.islink(link): + os.symlink(os.path.join(canonical, shared), link) + os.environ["ABRA_DIR"] = run_abra_dir + print( + f"== per-run ABRA_DIR: {run_abra_dir} (servers/catalogue -> canonical; fresh recipes/) ==", + flush=True, + ) + return run_abra_dir + + def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None: - """Make the recipe available at the code under test. If SRC+REF point at the mirror PR, + """Make the recipe available at the code under test in THIS RUN's recipe tree + ($ABRA_DIR/recipes/<recipe>): a plain clone — no locking needed, no rm-rf of any shared + state (the rm below only clears this run's own leftovers, e.g. a janitor-triggered + `abra app ls` auto-clone or a Drone build-number reuse). 
If SRC+REF point at the mirror PR, clone it at that ref; otherwise fetch the catalogue copy. Private mirror repos need the bot token — passed via a per-command http.extraHeader (not persisted in .git/config, not printed).""" - recipes_dir = os.path.expanduser("~/.abra/recipes") - os.makedirs(recipes_dir, exist_ok=True) - dest = os.path.join(recipes_dir, recipe) - # CCCI_SKIP_FETCH=1: use the local recipe clone as-is (lets a test/Adversary stage a fake/broken - # ref — e.g. a simulated broken PR head for the --quick rollback proof — without it being clobbered - # by a re-fetch). Never set in production CI. + dest = abra.recipe_dir(recipe) + os.makedirs(os.path.dirname(dest), exist_ok=True) + # CCCI_SKIP_FETCH=1: use the locally STAGED recipe clone as-is (lets a test/Adversary stage a + # fake/broken ref — e.g. a simulated broken PR head for the --quick rollback proof — without it + # being clobbered by a re-fetch). Staging happens in the canonical ~/.abra/recipes/<recipe>; + # copy it into the per-run tree so the rest of the run reads the staged state. Never set in + # production CI. if os.environ.get("CCCI_SKIP_FETCH") == "1": - print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True) + canonical = os.path.expanduser(f"~/.abra/recipes/{recipe}") + subprocess.run(["rm", "-rf", dest], check=False) + if os.path.isdir(canonical): + shutil.copytree(canonical, dest, symlinks=True) + print( + f"[fetch] CCCI_SKIP_FETCH=1 — using staged {recipe} clone as-is " + f"(copied {canonical} -> per-run tree)", + flush=True, + ) return if src and ref: url = f"https://git.autonomic.zone/{src}.git" @@ -178,7 +222,7 @@ def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None: def snapshot_recipe_tests(recipe: str) -> str | None: """Copy the recipe-shipped tests/ to a stable temp dir, immune to abra re-checking-out the recipe to a version tag during the run. 
Returns the snapshot path, or None if no tests/.""" - src = os.path.expanduser(f"~/.abra/recipes/{recipe}/tests") + src = os.path.join(abra.recipe_dir(recipe), "tests") if not os.path.isdir(src): return None has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile( @@ -841,12 +885,10 @@ def main() -> int: print( f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}" ) - # Concurrent-run safety: runs of the SAME recipe serialise on a per-recipe flock — they share - # ONE ~/.abra/recipes/ working tree which fetch_recipe (below) rm-rf's/reclones and the - # upgrade tier git-checkouts mid-run. Must be taken BEFORE fetch_recipe. Different recipes run - # in parallel (capacity=2). The reference must stay alive for the whole run: the kernel drops - # the flock when the fd closes (including on any crash/SIGKILL — no stale-lock failure mode). - _recipe_lock = lifecycle.acquire_recipe_lock(recipe) # noqa: F841 + # Concurrent-run safety is structural: this run's recipe trees live in its own ABRA_DIR + # (exported here, before ANY abra call), so no recipe-tree lock exists; same-DOMAIN runs + # serialise on the app-domain flock taken in deploy_app (see docs/concurrency.md). + setup_run_abra_dir() fetch_recipe(recipe, ref, src) # The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test # (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py index d9e58b7..d41fef0 100644 --- a/runner/warm_reconcile.py +++ b/runner/warm_reconcile.py @@ -199,7 +199,13 @@ def _run(cmd, timeout=120, check=False): def _recipe_dir(recipe: str) -> str: - return os.path.expanduser(f"~/.abra/recipes/{recipe}") + # Resolve like the abra CLI does: $ABRA_DIR (the per-run tree when imported by a CI run, + # e.g. 
promote_canonical) else the canonical ~/.abra (this module's own systemd-timer runs, + # which set no ABRA_DIR). Keeps fetch_recipe (an `abra` subprocess) and the git readers + # below pointed at the SAME tree in both contexts. + return os.path.join( + os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra"), "recipes", recipe + ) def recipe_tags(recipe: str) -> list[str]: diff --git a/tests/discourse/install_steps.sh b/tests/discourse/install_steps.sh index 330a5cc..930e663 100755 --- a/tests/discourse/install_steps.sh +++ b/tests/discourse/install_steps.sh @@ -15,7 +15,9 @@ set -euo pipefail : "${CCCI_RECIPE:?missing CCCI_RECIPE}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}" +# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else +# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from. +RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}" if [ ! -d "$RECIPE_DIR" ]; then echo " discourse install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2 diff --git a/tests/ghost/install_steps.sh b/tests/ghost/install_steps.sh index ef10674..2c2dc50 100755 --- a/tests/ghost/install_steps.sh +++ b/tests/ghost/install_steps.sh @@ -15,7 +15,9 @@ set -euo pipefail : "${CCCI_RECIPE:?missing CCCI_RECIPE}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}" +# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else +# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from. +RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}" if [ ! -d "$RECIPE_DIR" ]; then echo " ghost install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2