diff --git a/docs/concurrency.md b/docs/concurrency.md index 06858f1..ae5b8af 100644 --- a/docs/concurrency.md +++ b/docs/concurrency.md @@ -70,6 +70,13 @@ Per-run (no conflict possible): - **Recipe working trees** — `$ABRA_DIR/recipes/`, per run (§4). NEW in the restructure. - **Drone build workspace** (`/var/lib/drone-runner/drone-/`) and **run artifacts** (`/var/lib/cc-ci-runs//`). +- **Run-scoped state files** (`/tmp/ccci-{deploys,opstate,deps,depskip}--…`) — + keyed by run id + harness pid via `run_recipe_ci._run_state_path()`, NEVER by app domain. + A second run of the same domain executes its `main()` preamble before blocking at the app + lock (§5), so domain-keyed files would be reset/removed underneath the live first run + (live finding, M2(c) double-`!testme`: false DG4.1 deploy-count in run 1, countfile + `FileNotFoundError` in run 2). Tier/hook children get the exact paths via the + `CCCI_*_FILE` env vars; removed on normal run exit. Shared (by design, conflict-free): @@ -205,7 +212,8 @@ Covers: kernel auto-release on SIGKILL; LOCK_NB probe semantics; PEP 446 fd non- same-domain serialisation; orphan reap + unlink; live-run protection; reap-under-probe-lock blocking; two-janitor arbitration; reboot-immediate reap; long-held flag; RUN_APP_RE allowlist; degrade-on-garbage; PDEATHSIG; ppid start race; deadline + SIGTERM funnels; per-run ABRA_DIR -construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality. +construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality; +run-keyed (never domain-keyed) run-scoped state files (M2(c) regression, `test_run_state.py`). ## 11. File / symbol index diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py index 9f8f539..77891cf 100644 --- a/runner/run_recipe_ci.py +++ b/runner/run_recipe_ci.py @@ -138,6 +138,17 @@ def _gitea_token() -> str | None: return tok or None +def _run_state_path(name: str) -> str: + """Run-scoped state file in the tempdir, keyed by run id + harness pid — NEVER by app domain. + A second run of the SAME domain overlaps this process (its main() preamble executes before it + blocks at the app lock inside deploy_app), so domain-keyed files get reset/removed under the + live run: M2(c) double-!testme produced a false DG4.1 deploy-count=2 in run 1 and a countfile + FileNotFoundError crash in run 2. Children never re-derive these paths — they receive them + via the CCCI_*_FILE env vars, so the key only has to be unique per harness process.""" + rid = results_mod.run_id() + return os.path.join(tempfile.gettempdir(), f"ccci-{name}-{rid}-{os.getpid()}") + + def setup_run_abra_dir() -> str: """P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees. @@ -619,15 +630,15 @@ def run_quick( flush=True, ) - statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json") + statefile = _run_state_path("opstate") + ".json" with open(statefile, "w") as f: json.dump({}, f) os.environ["CCCI_OP_STATE_FILE"] = statefile - depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json") + depsfile = _run_state_path("deps") + ".json" with open(depsfile, "w") as f: json.dump({}, f) os.environ["CCCI_DEPS_FILE"] = depsfile - skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt") + skipfile = _run_state_path("depskip") + ".txt" with contextlib.suppress(OSError): os.remove(skipfile) os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile @@ -927,7 +938,7 @@ def main() -> int: hook = discovery.install_steps(recipe, repo_local) # Deploy-count guard (DG4.1): exactly one deploy_app() per run. - countfile = os.path.join(tempfile.gettempdir(), f"ccci-deploys-{domain}") + countfile = _run_state_path("deploys") with open(countfile, "w") as f: f.write("0") os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile @@ -943,7 +954,7 @@ def main() -> int: # Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup # snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state(). - statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json") + statefile = _run_state_path("opstate") + ".json" with open(statefile, "w") as f: json.dump({}, f) os.environ["CCCI_OP_STATE_FILE"] = statefile @@ -954,12 +965,12 @@ def main() -> int: # cannot break the generic-tier signal. The `setup_custom_tests` step deploys each dep + runs # `tests//setup_custom_tests.sh` to wire OIDC env via in-place redeploy. # `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable). - depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json") + depsfile = _run_state_path("deps") + ".json" with open(depsfile, "w") as f: json.dump({}, f) os.environ["CCCI_DEPS_FILE"] = depsfile # F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here. - skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt") + skipfile = _run_state_path("depskip") + ".txt" with contextlib.suppress(OSError): os.remove(skipfile) os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile diff --git a/tests/concurrency/helpers.py b/tests/concurrency/helpers.py index 559801f..8e5dbf3 100644 --- a/tests/concurrency/helpers.py +++ b/tests/concurrency/helpers.py @@ -108,6 +108,34 @@ def cmd_fetch_checkout(recipe: str, ref: str) -> None: mark(f"RESULT {head} {content}") +def cmd_deploy_count_run(domain: str, gate: str) -> None: + """Mirror the REAL run flow for the DG4.1 counter (CONC-A1 regression): countfile init + (main() preamble) → _record_deploy (deploy_app fires it BEFORE the app lock) → acquire + the app lock → wait for `gate` (file path; '' = no wait) → read + remove own countfile. + Two of these on the SAME domain must each see COUNT 1 and never lose their file.""" + import run_recipe_ci + + countfile = run_recipe_ci._run_state_path("deploys") + with open(countfile, "w") as f: + f.write("0") + os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile + lifecycle._record_deploy() # pre-lock, exactly like lifecycle.deploy_app() + mark("PRELOCK") + lifecycle.acquire_app_lock(domain) + mark("ACQUIRED") + if gate: + deadline = time.time() + 15 + while not os.path.exists(gate) and time.time() < deadline: + time.sleep(0.05) + try: + with open(countfile) as f: + n = int(f.read().strip() or "0") + os.remove(countfile) + mark(f"COUNT {n}") + except FileNotFoundError: + mark("COUNT_FILE_MISSING") + + if __name__ == "__main__": cmd, *args = sys.argv[1:] { @@ -117,4 +145,5 @@ if __name__ == "__main__": "wrapper": cmd_wrapper, "orphan-probe": cmd_orphan_probe, "fetch-checkout": cmd_fetch_checkout, + "deploy-count-run": cmd_deploy_count_run, }[cmd](*args) diff --git a/tests/concurrency/test_run_state.py b/tests/concurrency/test_run_state.py new file mode 100644 index 0000000..ca34b6b --- /dev/null +++ b/tests/concurrency/test_run_state.py @@ -0,0 +1,79 @@ +"""Run-scoped state files — M2(c) live-verify regression (not one of the 19 plan cases). + +The four CCCI state files (deploys countfile, opstate, deps, depskip) must be keyed by +run id + harness pid, NEVER by app domain: a second run of the SAME domain executes its +main() preamble (state-file init, deploy_app's _record_deploy) BEFORE it blocks at the +app lock, so domain-keyed files in the shared tempdir get reset/removed underneath the +live first run. Observed live (builds 279/281): false DG4.1 deploy-count=2 in run 1, +countfile FileNotFoundError crash in run 2. Children never re-derive these paths — they +receive them via the CCCI_*_FILE env vars, so per-process uniqueness is sufficient. +""" + +from __future__ import annotations + +import os +import sys +import tempfile + +sys.path.insert(0, os.path.dirname(__file__)) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner")) +import run_recipe_ci # noqa: E402 +from concutil import wait_marker # noqa: E402 + +DOMAIN = "fake-abc123.ci.commoninternet.net" + + +def test_20_state_paths_keyed_by_run_and_pid_never_by_domain(monkeypatch): + domain = "immi-ad3e33.ci.commoninternet.net" + monkeypatch.setenv("CCCI_APP_DOMAIN", domain) + + monkeypatch.setenv("DRONE_BUILD_NUMBER", "279") + p279 = run_recipe_ci._run_state_path("deploys") + monkeypatch.setenv("DRONE_BUILD_NUMBER", "281") + p281 = run_recipe_ci._run_state_path("deploys") + + # the double-!testme invariant: two runs (same domain) share NO state file + assert p279 != p281 + # keyed by run id + pid, under the tempdir + base = os.path.basename(p279) + assert base == f"ccci-deploys-279-{os.getpid()}" + assert os.path.dirname(p279) == tempfile.gettempdir() + # the app domain must not appear in the path at all + assert domain not in p279 and domain not in p281 + + +def test_20c_same_domain_runs_each_keep_their_own_count(tmp_path, lock_dir, pool): + """The live CONC-A1 interleaving, with REAL processes + the REAL lock and counter code: + run A holds the app lock; run B (same domain) fires its pre-lock _record_deploy and + blocks; A then reads its counter — must still be 1 (not polluted by B) — and removes + its own file; B acquires and must find ITS file intact (no FileNotFoundError).""" + gate = tmp_path / "gate" + env_a = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9001"} + env_b = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9002"} + + pa, out_a = pool.spawn("deploy-count-run", DOMAIN, str(gate), env_extra=env_a) + assert wait_marker(out_a, "ACQUIRED") + pb, out_b = pool.spawn("deploy-count-run", DOMAIN, "", env_extra=env_b) + # B's main()-preamble + pre-lock increment have fired; B is now blocked on the app lock + assert wait_marker(out_b, "PRELOCK") + assert wait_marker(out_b, "ACQUIRED", timeout=1.0) is None # still serialised behind A + + gate.touch() # let A read its counter only AFTER B's pre-lock work landed + line_a = wait_marker(out_a, "COUNT") + assert line_a is not None and line_a.strip() == "COUNT 1", line_a # not 2: B didn't pollute A + pa.wait(timeout=15) + + line_b = wait_marker(out_b, "COUNT") + assert ( + line_b is not None and line_b.strip() == "COUNT 1" + ), line_b # B's file survived A's remove + pb.wait(timeout=15) + + +def test_20b_manual_runs_distinct_via_pid(monkeypatch): + # no DRONE_BUILD_NUMBER and no domain/run-id env → run_id() falls back to "manual"; + # the pid suffix still separates two concurrent hand-runs of the same domain. + for var in ("DRONE_BUILD_NUMBER", "CCCI_APP_DOMAIN", "CCCI_RUN_ID"): + monkeypatch.delenv(var, raising=False) + p = run_recipe_ci._run_state_path("opstate") + assert os.path.basename(p) == f"ccci-opstate-manual-{os.getpid()}"