feat(harness): P3 per-run ABRA_DIR — structural recipe-tree isolation, recipe flock deleted
All checks were successful
continuous-integration/drone/push Build is passing

- run_recipe_ci.setup_run_abra_dir(): builds <runs_dir>/<run-id>/abra with servers/ and
  catalogue/ symlinked to the canonical ~/.abra (app .env files keep landing in the shared
  canonical path, so janitor discovery and env-based teardown are unchanged; per-domain
  filenames + the P2 app-domain lock prevent write conflicts) and a FRESH empty recipes/ —
  each run clones + checkouts its own recipe trees. Exported as $ABRA_DIR (honored by the
  abra CLI, verified on-host) before ANY abra call. Manual runs get manual-<pid> isolation.
- fetch_recipe(): plain clone into $ABRA_DIR/recipes/<recipe> — no shared-tree rm-rf, no lock.
  CCCI_SKIP_FETCH=1 now copies the canonically-staged clone into the per-run tree (same staging
  workflow, run reads staged state).
- abra.abra_dir()/recipe_dir(): single resolution rule ($ABRA_DIR else ~/.abra), used by
  recipe_checkout, has_lightweight_version_tags, recipe_head_commit, recipe_versions,
  generic._recipe_dir, lifecycle.prepull_images, snapshot_recipe_tests, and
  warm_reconcile._recipe_dir (which keeps the canonical default for its own systemd runs but
  follows the per-run tree when imported by promote_canonical inside a run).
- deleted: lifecycle.acquire_recipe_lock, RECIPE_LOCK_DIR, the main() call site and the
  must-lock-before-fetch ordering rule.
- tests/{ghost,discourse}/install_steps.sh: RECIPE_DIR resolves ${ABRA_DIR:-$HOME/.abra} so the
  compose.ccci.yml overlay lands in the tree the run actually deploys from (mechanical path fix
  required by per-run trees; no assertion/gate touched — see DECISIONS.md).
- .drone.yml comments updated (HOME=/root rationale now via the servers symlink).
This commit is contained in:
autonomic-bot
2026-06-10 04:18:33 +00:00
parent b302f3ab63
commit 17ebdf39ac
8 changed files with 103 additions and 67 deletions

View File

@ -10,6 +10,7 @@ Bakes in the known abra gotchas (re-verify per installed abra version, currently
from __future__ import annotations
import json
import os
import subprocess
ABRA = "abra"
@ -19,6 +20,20 @@ class AbraError(RuntimeError):
pass
def abra_dir() -> str:
"""abra's state dir, resolved the same way the abra CLI resolves it: $ABRA_DIR if set, else
~/.abra. Inside a CI run, run_recipe_ci exports a PER-RUN $ABRA_DIR (fresh recipes/, shared
servers/+catalogue/ symlinks) before any abra call, so every helper here and every abra
subprocess agree on the same tree; outside a run (warm_reconcile's systemd timer, manual use)
both fall back to the canonical /root/.abra."""
return os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra")
def recipe_dir(recipe: str) -> str:
"""The current ABRA_DIR's working tree for a recipe (per-run inside a CI run)."""
return os.path.join(abra_dir(), "recipes", recipe)
def _run_pty(
args: list[str], timeout: int = 900, check: bool = True
) -> subprocess.CompletedProcess:
@ -77,9 +92,7 @@ def recipe_checkout(recipe: str, version: str) -> None:
a chaos (`-C`) deploy ignores ENV VERSION and uses the current checkout — together that silently
deployed LATEST for a 'previous-version' base, making the upgrade a no-op (Adversary F1d-2). With
this checkout + a non-chaos deploy, a pinned deploy genuinely deploys that version."""
import os
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
path = recipe_dir(recipe)
# -f (force): the version-pinning checkout must yield the EXACT ref tree. Without it, a cc-ci
# install_steps-provided overlay (e.g. discourse's compose.ccci.yml, copied into the pinned base)
# is an UNTRACKED file that collides with the same path TRACKED in a later ref, and
@ -100,9 +113,7 @@ def has_lightweight_version_tags(recipe: str) -> bool:
'reference not found'.) The caller (deploy_app) uses this to fall back to a chaos base deploy
(which skips lint and deploys the explicitly-checked-out pinned version — see lifecycle.deploy_app).
Read-only: just `git tag` + `cat-file -t`; no fetch/mutation, so it can't trigger abra's revert."""
import os
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
path = recipe_dir(recipe)
tags = subprocess.run(
["git", "-C", path, "tag", "-l"], capture_output=True, text=True
).stdout.split()
@ -231,9 +242,7 @@ def recipe_head_commit(recipe: str) -> str | None:
"""The current HEAD commit of the recipe checkout — captured right after fetch (the PR head, or
the catalogue current) so the upgrade tier can re-checkout it for the chaos redeploy after the
prev-tag base deploy reset the working tree (HC1)."""
import os
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
path = recipe_dir(recipe)
proc = subprocess.run(["git", "-C", path, "rev-parse", "HEAD"], capture_output=True, text=True)
out = proc.stdout.strip()
return out or None
@ -241,10 +250,7 @@ def recipe_head_commit(recipe: str) -> str | None:
def recipe_versions(recipe: str) -> list[str]:
"""Published versions of a recipe, oldest→newest (from the recipe git tags)."""
import os
import subprocess
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
path = recipe_dir(recipe)
proc = subprocess.run(
["git", "-C", path, "tag", "--sort=creatordate"], capture_output=True, text=True
)

View File

@ -25,7 +25,7 @@ _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)
def _recipe_dir(recipe: str) -> str:
return os.path.expanduser(f"~/.abra/recipes/{recipe}")
return abra.recipe_dir(recipe) # the per-run tree inside a CI run ($ABRA_DIR)
def backup_capable(recipe: str, meta: dict | None = None) -> bool:

View File

@ -37,31 +37,9 @@ class TeardownError(RuntimeError):
# however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held →
# leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never
# steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each
# run deploys from its own per-run ABRA_DIR), and same-domain runs (double-!testme of one PR)
# serialise on this app lock. See docs/concurrency.md.
RECIPE_LOCK_DIR = "/run/lock"
def acquire_recipe_lock(recipe: str):
"""Per-recipe exclusive lock serialising same-recipe runs on the shared ~/.abra/recipes
checkout. P3 of the restructure deletes this (per-run ABRA_DIR makes the shared tree, and
with it this lock, structurally unnecessary); until then the caller keeps the returned file
alive for the whole run and release is implicit at process exit."""
path = os.path.join(RECIPE_LOCK_DIR, f"cc-ci-recipe-{recipe}.lock")
# PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
f = open(path, "w") # noqa: SIM115 — deliberately held for the lifetime of the run
try:
fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
print(
f"== recipe lock: another {recipe} run is in flight — waiting for {path} "
"(shared ~/.abra/recipes checkout) ==",
flush=True,
)
fcntl.flock(f, fcntl.LOCK_EX)
print(f"== recipe lock: acquired {path} ==", flush=True)
return f
# run deploys from its own per-run ABRA_DIR — there is no shared recipe tree and no recipe lock),
# and same-domain runs (double-!testme of one PR) serialise on this app lock.
# See docs/concurrency.md.
# Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the
# caller drops the returned file object, GC would close the fd and silently release the lock —
@ -209,9 +187,9 @@ def prepull_images(recipe: str, domain: str) -> None:
app-INIT time (slow-init apps like collabora/immich still need their recipe healthcheck/READY_PROBE).
Best-effort on resolution failure (skip + let the deploy pull as usual); HARD-fails on a real
pull error (don't mask it)."""
import os
recipe_dir = os.path.expanduser(f"~/.abra/recipes/{recipe}")
recipe_dir = abra.recipe_dir(recipe) # per-run tree inside a CI run
# The app .env lives in the CANONICAL servers path (the per-run ABRA_DIR's servers/ is a
# symlink to it, so abra and this path agree on the same file).
env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path):
print(f" prepull: recipe dir or .env missing for {recipe} — skipping", flush=True)