fix(harness): run-keyed run-scoped state files — CONC-A1 (same-domain runs corrupted shared deploy-count)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
The four CCCI state files (deploys countfile, opstate, deps, depskip) were keyed by app domain in shared /tmp. A second run of the same domain executes its main() preamble + deploy_app's pre-lock _record_deploy BEFORE blocking at the app lock, so it reset/polluted the live first run's counter (false DG4.1 deploy-count=2, build 279) and the first run's end-of-run os.remove crashed the second (FileNotFoundError, build 281). Masked pre-restructure by the end-to-end recipe flock. Now keyed by run id + harness pid via _run_state_path(); children receive exact paths via the CCCI_*_FILE env vars, so domain keying was never load-bearing. tests/concurrency/test_run_state.py: path-invariant cases + a real-process regression (helpers.py deploy-count-run) reproducing the live interleaving — verified to FAIL under simulated shared keying. docs/concurrency.md §3 updated.
This commit is contained in:
@ -70,6 +70,13 @@ Per-run (no conflict possible):
|
|||||||
- **Recipe working trees** — `$ABRA_DIR/recipes/<recipe>`, per run (§4). NEW in the restructure.
|
- **Recipe working trees** — `$ABRA_DIR/recipes/<recipe>`, per run (§4). NEW in the restructure.
|
||||||
- **Drone build workspace** (`/var/lib/drone-runner/drone-<id>/`) and **run artifacts**
|
- **Drone build workspace** (`/var/lib/drone-runner/drone-<id>/`) and **run artifacts**
|
||||||
(`/var/lib/cc-ci-runs/<run-id>/`).
|
(`/var/lib/cc-ci-runs/<run-id>/`).
|
||||||
|
- **Run-scoped state files** (`/tmp/ccci-{deploys,opstate,deps,depskip}-<run-id>-<pid>…`) —
|
||||||
|
keyed by run id + harness pid via `run_recipe_ci._run_state_path()`, NEVER by app domain.
|
||||||
|
A second run of the same domain executes its `main()` preamble before blocking at the app
|
||||||
|
lock (§5), so domain-keyed files would be reset/removed underneath the live first run
|
||||||
|
(live finding, M2(c) double-`!testme`: false DG4.1 deploy-count in run 1, countfile
|
||||||
|
`FileNotFoundError` in run 2). Tier/hook children get the exact paths via the
|
||||||
|
`CCCI_*_FILE` env vars; removed on normal run exit.
|
||||||
|
|
||||||
Shared (by design, conflict-free):
|
Shared (by design, conflict-free):
|
||||||
|
|
||||||
@ -205,7 +212,8 @@ Covers: kernel auto-release on SIGKILL; LOCK_NB probe semantics; PEP 446 fd non-
|
|||||||
same-domain serialisation; orphan reap + unlink; live-run protection; reap-under-probe-lock
|
same-domain serialisation; orphan reap + unlink; live-run protection; reap-under-probe-lock
|
||||||
blocking; two-janitor arbitration; reboot-immediate reap; long-held flag; RUN_APP_RE allowlist;
|
blocking; two-janitor arbitration; reboot-immediate reap; long-held flag; RUN_APP_RE allowlist;
|
||||||
degrade-on-garbage; PDEATHSIG; ppid start race; deadline + SIGTERM funnels; per-run ABRA_DIR
|
degrade-on-garbage; PDEATHSIG; ppid start race; deadline + SIGTERM funnels; per-run ABRA_DIR
|
||||||
construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality.
|
construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality;
|
||||||
|
run-keyed (never domain-keyed) run-scoped state files (M2(c) regression, `test_run_state.py`).
|
||||||
|
|
||||||
## 11. File / symbol index
|
## 11. File / symbol index
|
||||||
|
|
||||||
|
|||||||
@ -138,6 +138,17 @@ def _gitea_token() -> str | None:
|
|||||||
return tok or None
|
return tok or None
|
||||||
|
|
||||||
|
|
||||||
|
def _run_state_path(name: str) -> str:
|
||||||
|
"""Run-scoped state file in the tempdir, keyed by run id + harness pid — NEVER by app domain.
|
||||||
|
A second run of the SAME domain overlaps this process (its main() preamble executes before it
|
||||||
|
blocks at the app lock inside deploy_app), so domain-keyed files get reset/removed under the
|
||||||
|
live run: M2(c) double-!testme produced a false DG4.1 deploy-count=2 in run 1 and a countfile
|
||||||
|
FileNotFoundError crash in run 2. Children never re-derive these paths — they receive them
|
||||||
|
via the CCCI_*_FILE env vars, so the key only has to be unique per harness process."""
|
||||||
|
rid = results_mod.run_id()
|
||||||
|
return os.path.join(tempfile.gettempdir(), f"ccci-{name}-{rid}-{os.getpid()}")
|
||||||
|
|
||||||
|
|
||||||
def setup_run_abra_dir() -> str:
|
def setup_run_abra_dir() -> str:
|
||||||
"""P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees.
|
"""P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees.
|
||||||
|
|
||||||
@ -619,15 +630,15 @@ def run_quick(
|
|||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
|
statefile = _run_state_path("opstate") + ".json"
|
||||||
with open(statefile, "w") as f:
|
with open(statefile, "w") as f:
|
||||||
json.dump({}, f)
|
json.dump({}, f)
|
||||||
os.environ["CCCI_OP_STATE_FILE"] = statefile
|
os.environ["CCCI_OP_STATE_FILE"] = statefile
|
||||||
depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
|
depsfile = _run_state_path("deps") + ".json"
|
||||||
with open(depsfile, "w") as f:
|
with open(depsfile, "w") as f:
|
||||||
json.dump({}, f)
|
json.dump({}, f)
|
||||||
os.environ["CCCI_DEPS_FILE"] = depsfile
|
os.environ["CCCI_DEPS_FILE"] = depsfile
|
||||||
skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
|
skipfile = _run_state_path("depskip") + ".txt"
|
||||||
with contextlib.suppress(OSError):
|
with contextlib.suppress(OSError):
|
||||||
os.remove(skipfile)
|
os.remove(skipfile)
|
||||||
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
|
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
|
||||||
@ -927,7 +938,7 @@ def main() -> int:
|
|||||||
hook = discovery.install_steps(recipe, repo_local)
|
hook = discovery.install_steps(recipe, repo_local)
|
||||||
|
|
||||||
# Deploy-count guard (DG4.1): exactly one deploy_app() per run.
|
# Deploy-count guard (DG4.1): exactly one deploy_app() per run.
|
||||||
countfile = os.path.join(tempfile.gettempdir(), f"ccci-deploys-{domain}")
|
countfile = _run_state_path("deploys")
|
||||||
with open(countfile, "w") as f:
|
with open(countfile, "w") as f:
|
||||||
f.write("0")
|
f.write("0")
|
||||||
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
|
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
|
||||||
@ -943,7 +954,7 @@ def main() -> int:
|
|||||||
|
|
||||||
# Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup
|
# Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup
|
||||||
# snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state().
|
# snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state().
|
||||||
statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
|
statefile = _run_state_path("opstate") + ".json"
|
||||||
with open(statefile, "w") as f:
|
with open(statefile, "w") as f:
|
||||||
json.dump({}, f)
|
json.dump({}, f)
|
||||||
os.environ["CCCI_OP_STATE_FILE"] = statefile
|
os.environ["CCCI_OP_STATE_FILE"] = statefile
|
||||||
@ -954,12 +965,12 @@ def main() -> int:
|
|||||||
# cannot break the generic-tier signal. The `setup_custom_tests` step deploys each dep + runs
|
# cannot break the generic-tier signal. The `setup_custom_tests` step deploys each dep + runs
|
||||||
# `tests/<recipe>/setup_custom_tests.sh` to wire OIDC env via in-place redeploy.
|
# `tests/<recipe>/setup_custom_tests.sh` to wire OIDC env via in-place redeploy.
|
||||||
# `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable).
|
# `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable).
|
||||||
depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
|
depsfile = _run_state_path("deps") + ".json"
|
||||||
with open(depsfile, "w") as f:
|
with open(depsfile, "w") as f:
|
||||||
json.dump({}, f)
|
json.dump({}, f)
|
||||||
os.environ["CCCI_DEPS_FILE"] = depsfile
|
os.environ["CCCI_DEPS_FILE"] = depsfile
|
||||||
# F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here.
|
# F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here.
|
||||||
skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
|
skipfile = _run_state_path("depskip") + ".txt"
|
||||||
with contextlib.suppress(OSError):
|
with contextlib.suppress(OSError):
|
||||||
os.remove(skipfile)
|
os.remove(skipfile)
|
||||||
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
|
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
|
||||||
|
|||||||
@ -108,6 +108,34 @@ def cmd_fetch_checkout(recipe: str, ref: str) -> None:
|
|||||||
mark(f"RESULT {head} {content}")
|
mark(f"RESULT {head} {content}")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_deploy_count_run(domain: str, gate: str) -> None:
|
||||||
|
"""Mirror the REAL run flow for the DG4.1 counter (CONC-A1 regression): countfile init
|
||||||
|
(main() preamble) → _record_deploy (deploy_app fires it BEFORE the app lock) → acquire
|
||||||
|
the app lock → wait for `gate` (file path; '' = no wait) → read + remove own countfile.
|
||||||
|
Two of these on the SAME domain must each see COUNT 1 and never lose their file."""
|
||||||
|
import run_recipe_ci
|
||||||
|
|
||||||
|
countfile = run_recipe_ci._run_state_path("deploys")
|
||||||
|
with open(countfile, "w") as f:
|
||||||
|
f.write("0")
|
||||||
|
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
|
||||||
|
lifecycle._record_deploy() # pre-lock, exactly like lifecycle.deploy_app()
|
||||||
|
mark("PRELOCK")
|
||||||
|
lifecycle.acquire_app_lock(domain)
|
||||||
|
mark("ACQUIRED")
|
||||||
|
if gate:
|
||||||
|
deadline = time.time() + 15
|
||||||
|
while not os.path.exists(gate) and time.time() < deadline:
|
||||||
|
time.sleep(0.05)
|
||||||
|
try:
|
||||||
|
with open(countfile) as f:
|
||||||
|
n = int(f.read().strip() or "0")
|
||||||
|
os.remove(countfile)
|
||||||
|
mark(f"COUNT {n}")
|
||||||
|
except FileNotFoundError:
|
||||||
|
mark("COUNT_FILE_MISSING")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cmd, *args = sys.argv[1:]
|
cmd, *args = sys.argv[1:]
|
||||||
{
|
{
|
||||||
@ -117,4 +145,5 @@ if __name__ == "__main__":
|
|||||||
"wrapper": cmd_wrapper,
|
"wrapper": cmd_wrapper,
|
||||||
"orphan-probe": cmd_orphan_probe,
|
"orphan-probe": cmd_orphan_probe,
|
||||||
"fetch-checkout": cmd_fetch_checkout,
|
"fetch-checkout": cmd_fetch_checkout,
|
||||||
|
"deploy-count-run": cmd_deploy_count_run,
|
||||||
}[cmd](*args)
|
}[cmd](*args)
|
||||||
|
|||||||
79
tests/concurrency/test_run_state.py
Normal file
79
tests/concurrency/test_run_state.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
"""Run-scoped state files — M2(c) live-verify regression (not one of the 19 plan cases).
|
||||||
|
|
||||||
|
The four CCCI state files (deploys countfile, opstate, deps, depskip) must be keyed by
|
||||||
|
run id + harness pid, NEVER by app domain: a second run of the SAME domain executes its
|
||||||
|
main() preamble (state-file init, deploy_app's _record_deploy) BEFORE it blocks at the
|
||||||
|
app lock, so domain-keyed files in the shared tempdir get reset/removed underneath the
|
||||||
|
live first run. Observed live (builds 279/281): false DG4.1 deploy-count=2 in run 1,
|
||||||
|
countfile FileNotFoundError crash in run 2. Children never re-derive these paths — they
|
||||||
|
receive them via the CCCI_*_FILE env vars, so per-process uniqueness is sufficient.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(__file__))
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||||
|
import run_recipe_ci # noqa: E402
|
||||||
|
from concutil import wait_marker # noqa: E402
|
||||||
|
|
||||||
|
DOMAIN = "fake-abc123.ci.commoninternet.net"
|
||||||
|
|
||||||
|
|
||||||
|
def test_20_state_paths_keyed_by_run_and_pid_never_by_domain(monkeypatch):
|
||||||
|
domain = "immi-ad3e33.ci.commoninternet.net"
|
||||||
|
monkeypatch.setenv("CCCI_APP_DOMAIN", domain)
|
||||||
|
|
||||||
|
monkeypatch.setenv("DRONE_BUILD_NUMBER", "279")
|
||||||
|
p279 = run_recipe_ci._run_state_path("deploys")
|
||||||
|
monkeypatch.setenv("DRONE_BUILD_NUMBER", "281")
|
||||||
|
p281 = run_recipe_ci._run_state_path("deploys")
|
||||||
|
|
||||||
|
# the double-!testme invariant: two runs (same domain) share NO state file
|
||||||
|
assert p279 != p281
|
||||||
|
# keyed by run id + pid, under the tempdir
|
||||||
|
base = os.path.basename(p279)
|
||||||
|
assert base == f"ccci-deploys-279-{os.getpid()}"
|
||||||
|
assert os.path.dirname(p279) == tempfile.gettempdir()
|
||||||
|
# the app domain must not appear in the path at all
|
||||||
|
assert domain not in p279 and domain not in p281
|
||||||
|
|
||||||
|
|
||||||
|
def test_20c_same_domain_runs_each_keep_their_own_count(tmp_path, lock_dir, pool):
|
||||||
|
"""The live CONC-A1 interleaving, with REAL processes + the REAL lock and counter code:
|
||||||
|
run A holds the app lock; run B (same domain) fires its pre-lock _record_deploy and
|
||||||
|
blocks; A then reads its counter — must still be 1 (not polluted by B) — and removes
|
||||||
|
its own file; B acquires and must find ITS file intact (no FileNotFoundError)."""
|
||||||
|
gate = tmp_path / "gate"
|
||||||
|
env_a = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9001"}
|
||||||
|
env_b = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9002"}
|
||||||
|
|
||||||
|
pa, out_a = pool.spawn("deploy-count-run", DOMAIN, str(gate), env_extra=env_a)
|
||||||
|
assert wait_marker(out_a, "ACQUIRED")
|
||||||
|
pb, out_b = pool.spawn("deploy-count-run", DOMAIN, "", env_extra=env_b)
|
||||||
|
# B's main()-preamble + pre-lock increment have fired; B is now blocked on the app lock
|
||||||
|
assert wait_marker(out_b, "PRELOCK")
|
||||||
|
assert wait_marker(out_b, "ACQUIRED", timeout=1.0) is None # still serialised behind A
|
||||||
|
|
||||||
|
gate.touch() # let A read its counter only AFTER B's pre-lock work landed
|
||||||
|
line_a = wait_marker(out_a, "COUNT")
|
||||||
|
assert line_a is not None and line_a.strip() == "COUNT 1", line_a # not 2: B didn't pollute A
|
||||||
|
pa.wait(timeout=15)
|
||||||
|
|
||||||
|
line_b = wait_marker(out_b, "COUNT")
|
||||||
|
assert (
|
||||||
|
line_b is not None and line_b.strip() == "COUNT 1"
|
||||||
|
), line_b # B's file survived A's remove
|
||||||
|
pb.wait(timeout=15)
|
||||||
|
|
||||||
|
|
||||||
|
def test_20b_manual_runs_distinct_via_pid(monkeypatch):
|
||||||
|
# no DRONE_BUILD_NUMBER and no domain/run-id env → run_id() falls back to "manual";
|
||||||
|
# the pid suffix still separates two concurrent hand-runs of the same domain.
|
||||||
|
for var in ("DRONE_BUILD_NUMBER", "CCCI_APP_DOMAIN", "CCCI_RUN_ID"):
|
||||||
|
monkeypatch.delenv(var, raising=False)
|
||||||
|
p = run_recipe_ci._run_state_path("opstate")
|
||||||
|
assert os.path.basename(p) == f"ccci-opstate-manual-{os.getpid()}"
|
||||||
Reference in New Issue
Block a user