Merge branch 'restructure/concurrency': fix(harness) CONC-A1 run-keyed state files (M2(c) live-verify finding)
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
autonomic-bot
2026-06-10 08:16:18 +00:00
4 changed files with 135 additions and 8 deletions

View File

@ -70,6 +70,13 @@ Per-run (no conflict possible):
- **Recipe working trees** — `$ABRA_DIR/recipes/<recipe>`, per run (§4). NEW in the restructure.
- **Drone build workspace** (`/var/lib/drone-runner/drone-<id>/`) and **run artifacts**
(`/var/lib/cc-ci-runs/<run-id>/`).
- **Run-scoped state files** (`/tmp/ccci-{deploys,opstate,deps,depskip}-<run-id>-<pid>…`) —
keyed by run id + harness pid via `run_recipe_ci._run_state_path()`, NEVER by app domain.
A second run of the same domain executes its `main()` preamble before blocking at the app
lock (§5), so domain-keyed files would be reset/removed underneath the live first run
(live finding, M2(c) double-`!testme`: false DG4.1 deploy-count in run 1, countfile
`FileNotFoundError` in run 2). Tier/hook children get the exact paths via the
`CCCI_*_FILE` env vars; removed on normal run exit.
Shared (by design, conflict-free):
@ -205,7 +212,8 @@ Covers: kernel auto-release on SIGKILL; LOCK_NB probe semantics; PEP 446 fd non-
same-domain serialisation; orphan reap + unlink; live-run protection; reap-under-probe-lock
blocking; two-janitor arbitration; reboot-immediate reap; long-held flag; RUN_APP_RE allowlist;
degrade-on-garbage; PDEATHSIG; ppid start race; deadline + SIGTERM funnels; per-run ABRA_DIR
construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality.
construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality;
run-keyed (never domain-keyed) run-scoped state files (M2(c) regression, `test_run_state.py`).
## 11. File / symbol index

View File

@ -138,6 +138,17 @@ def _gitea_token() -> str | None:
return tok or None
def _run_state_path(name: str) -> str:
"""Run-scoped state file in the tempdir, keyed by run id + harness pid — NEVER by app domain.
A second run of the SAME domain overlaps this process (its main() preamble executes before it
blocks at the app lock inside deploy_app), so domain-keyed files get reset/removed under the
live run: M2(c) double-!testme produced a false DG4.1 deploy-count=2 in run 1 and a countfile
FileNotFoundError crash in run 2. Children never re-derive these paths — they receive them
via the CCCI_*_FILE env vars, so the key only has to be unique per harness process."""
rid = results_mod.run_id()
return os.path.join(tempfile.gettempdir(), f"ccci-{name}-{rid}-{os.getpid()}")
def setup_run_abra_dir() -> str:
"""P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees.
@ -619,15 +630,15 @@ def run_quick(
flush=True,
)
statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
statefile = _run_state_path("opstate") + ".json"
with open(statefile, "w") as f:
json.dump({}, f)
os.environ["CCCI_OP_STATE_FILE"] = statefile
depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
depsfile = _run_state_path("deps") + ".json"
with open(depsfile, "w") as f:
json.dump({}, f)
os.environ["CCCI_DEPS_FILE"] = depsfile
skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
skipfile = _run_state_path("depskip") + ".txt"
with contextlib.suppress(OSError):
os.remove(skipfile)
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
@ -927,7 +938,7 @@ def main() -> int:
hook = discovery.install_steps(recipe, repo_local)
# Deploy-count guard (DG4.1): exactly one deploy_app() per run.
countfile = os.path.join(tempfile.gettempdir(), f"ccci-deploys-{domain}")
countfile = _run_state_path("deploys")
with open(countfile, "w") as f:
f.write("0")
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
@ -943,7 +954,7 @@ def main() -> int:
# Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup
# snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state().
statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
statefile = _run_state_path("opstate") + ".json"
with open(statefile, "w") as f:
json.dump({}, f)
os.environ["CCCI_OP_STATE_FILE"] = statefile
@ -954,12 +965,12 @@ def main() -> int:
# cannot break the generic-tier signal. The `setup_custom_tests` step deploys each dep + runs
# `tests/<recipe>/setup_custom_tests.sh` to wire OIDC env via in-place redeploy.
# `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable).
depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
depsfile = _run_state_path("deps") + ".json"
with open(depsfile, "w") as f:
json.dump({}, f)
os.environ["CCCI_DEPS_FILE"] = depsfile
# F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here.
skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
skipfile = _run_state_path("depskip") + ".txt"
with contextlib.suppress(OSError):
os.remove(skipfile)
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile

View File

@ -108,6 +108,34 @@ def cmd_fetch_checkout(recipe: str, ref: str) -> None:
mark(f"RESULT {head} {content}")
def cmd_deploy_count_run(domain: str, gate: str) -> None:
"""Mirror the REAL run flow for the DG4.1 counter (CONC-A1 regression): countfile init
(main() preamble) → _record_deploy (deploy_app fires it BEFORE the app lock) → acquire
the app lock → wait for `gate` (file path; '' = no wait) → read + remove own countfile.
Two of these on the SAME domain must each see COUNT 1 and never lose their file."""
import run_recipe_ci
countfile = run_recipe_ci._run_state_path("deploys")
with open(countfile, "w") as f:
f.write("0")
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
lifecycle._record_deploy() # pre-lock, exactly like lifecycle.deploy_app()
mark("PRELOCK")
lifecycle.acquire_app_lock(domain)
mark("ACQUIRED")
if gate:
deadline = time.time() + 15
while not os.path.exists(gate) and time.time() < deadline:
time.sleep(0.05)
try:
with open(countfile) as f:
n = int(f.read().strip() or "0")
os.remove(countfile)
mark(f"COUNT {n}")
except FileNotFoundError:
mark("COUNT_FILE_MISSING")
if __name__ == "__main__":
cmd, *args = sys.argv[1:]
{
@ -117,4 +145,5 @@ if __name__ == "__main__":
"wrapper": cmd_wrapper,
"orphan-probe": cmd_orphan_probe,
"fetch-checkout": cmd_fetch_checkout,
"deploy-count-run": cmd_deploy_count_run,
}[cmd](*args)

View File

@ -0,0 +1,79 @@
"""Run-scoped state files — M2(c) live-verify regression (not one of the 19 plan cases).
The four CCCI state files (deploys countfile, opstate, deps, depskip) must be keyed by
run id + harness pid, NEVER by app domain: a second run of the SAME domain executes its
main() preamble (state-file init, deploy_app's _record_deploy) BEFORE it blocks at the
app lock, so domain-keyed files in the shared tempdir get reset/removed underneath the
live first run. Observed live (builds 279/281): false DG4.1 deploy-count=2 in run 1,
countfile FileNotFoundError crash in run 2. Children never re-derive these paths — they
receive them via the CCCI_*_FILE env vars, so per-process uniqueness is sufficient.
"""
from __future__ import annotations
import os
import sys
import tempfile
sys.path.insert(0, os.path.dirname(__file__))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
import run_recipe_ci # noqa: E402
from concutil import wait_marker # noqa: E402
DOMAIN = "fake-abc123.ci.commoninternet.net"
def test_20_state_paths_keyed_by_run_and_pid_never_by_domain(monkeypatch):
domain = "immi-ad3e33.ci.commoninternet.net"
monkeypatch.setenv("CCCI_APP_DOMAIN", domain)
monkeypatch.setenv("DRONE_BUILD_NUMBER", "279")
p279 = run_recipe_ci._run_state_path("deploys")
monkeypatch.setenv("DRONE_BUILD_NUMBER", "281")
p281 = run_recipe_ci._run_state_path("deploys")
# the double-!testme invariant: two runs (same domain) share NO state file
assert p279 != p281
# keyed by run id + pid, under the tempdir
base = os.path.basename(p279)
assert base == f"ccci-deploys-279-{os.getpid()}"
assert os.path.dirname(p279) == tempfile.gettempdir()
# the app domain must not appear in the path at all
assert domain not in p279 and domain not in p281
def test_20c_same_domain_runs_each_keep_their_own_count(tmp_path, lock_dir, pool):
"""The live CONC-A1 interleaving, with REAL processes + the REAL lock and counter code:
run A holds the app lock; run B (same domain) fires its pre-lock _record_deploy and
blocks; A then reads its counter — must still be 1 (not polluted by B) — and removes
its own file; B acquires and must find ITS file intact (no FileNotFoundError)."""
gate = tmp_path / "gate"
env_a = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9001"}
env_b = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9002"}
pa, out_a = pool.spawn("deploy-count-run", DOMAIN, str(gate), env_extra=env_a)
assert wait_marker(out_a, "ACQUIRED")
pb, out_b = pool.spawn("deploy-count-run", DOMAIN, "", env_extra=env_b)
# B's main()-preamble + pre-lock increment have fired; B is now blocked on the app lock
assert wait_marker(out_b, "PRELOCK")
assert wait_marker(out_b, "ACQUIRED", timeout=1.0) is None # still serialised behind A
gate.touch() # let A read its counter only AFTER B's pre-lock work landed
line_a = wait_marker(out_a, "COUNT")
assert line_a is not None and line_a.strip() == "COUNT 1", line_a # not 2: B didn't pollute A
pa.wait(timeout=15)
line_b = wait_marker(out_b, "COUNT")
assert (
line_b is not None and line_b.strip() == "COUNT 1"
), line_b # B's file survived A's remove
pb.wait(timeout=15)
def test_20b_manual_runs_distinct_via_pid(monkeypatch):
# no DRONE_BUILD_NUMBER and no domain/run-id env → run_id() falls back to "manual";
# the pid suffix still separates two concurrent hand-runs of the same domain.
for var in ("DRONE_BUILD_NUMBER", "CCCI_APP_DOMAIN", "CCCI_RUN_ID"):
monkeypatch.delenv(var, raising=False)
p = run_recipe_ci._run_state_path("opstate")
assert os.path.basename(p) == f"ccci-opstate-manual-{os.getpid()}"