fix(harness): run-keyed run-scoped state files — CONC-A1 (same-domain runs corrupted shared deploy-count)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
The four CCCI state files (deploys countfile, opstate, deps, depskip) were keyed by app domain in shared /tmp. A second run of the same domain executes its main() preamble + deploy_app's pre-lock _record_deploy BEFORE blocking at the app lock, so it reset/polluted the live first run's counter (false DG4.1 deploy-count=2, build 279) and the first run's end-of-run os.remove crashed the second (FileNotFoundError, build 281). Masked pre-restructure by the end-to-end recipe flock. Now keyed by run id + harness pid via _run_state_path(); children receive exact paths via the CCCI_*_FILE env vars, so domain keying was never load-bearing. tests/concurrency/test_run_state.py: path-invariant cases + a real-process regression (helpers.py deploy-count-run) reproducing the live interleaving — verified to FAIL under simulated shared keying. docs/concurrency.md §3 updated.
This commit is contained in:
@ -108,6 +108,34 @@ def cmd_fetch_checkout(recipe: str, ref: str) -> None:
|
||||
mark(f"RESULT {head} {content}")
|
||||
|
||||
|
||||
def cmd_deploy_count_run(domain: str, gate: str) -> None:
|
||||
"""Mirror the REAL run flow for the DG4.1 counter (CONC-A1 regression): countfile init
|
||||
(main() preamble) → _record_deploy (deploy_app fires it BEFORE the app lock) → acquire
|
||||
the app lock → wait for `gate` (file path; '' = no wait) → read + remove own countfile.
|
||||
Two of these on the SAME domain must each see COUNT 1 and never lose their file."""
|
||||
import run_recipe_ci
|
||||
|
||||
countfile = run_recipe_ci._run_state_path("deploys")
|
||||
with open(countfile, "w") as f:
|
||||
f.write("0")
|
||||
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
|
||||
lifecycle._record_deploy() # pre-lock, exactly like lifecycle.deploy_app()
|
||||
mark("PRELOCK")
|
||||
lifecycle.acquire_app_lock(domain)
|
||||
mark("ACQUIRED")
|
||||
if gate:
|
||||
deadline = time.time() + 15
|
||||
while not os.path.exists(gate) and time.time() < deadline:
|
||||
time.sleep(0.05)
|
||||
try:
|
||||
with open(countfile) as f:
|
||||
n = int(f.read().strip() or "0")
|
||||
os.remove(countfile)
|
||||
mark(f"COUNT {n}")
|
||||
except FileNotFoundError:
|
||||
mark("COUNT_FILE_MISSING")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cmd, *args = sys.argv[1:]
|
||||
{
|
||||
@ -117,4 +145,5 @@ if __name__ == "__main__":
|
||||
"wrapper": cmd_wrapper,
|
||||
"orphan-probe": cmd_orphan_probe,
|
||||
"fetch-checkout": cmd_fetch_checkout,
|
||||
"deploy-count-run": cmd_deploy_count_run,
|
||||
}[cmd](*args)
|
||||
|
||||
79
tests/concurrency/test_run_state.py
Normal file
79
tests/concurrency/test_run_state.py
Normal file
@ -0,0 +1,79 @@
|
||||
"""Run-scoped state files — M2(c) live-verify regression (not one of the 19 plan cases).
|
||||
|
||||
The four CCCI state files (deploys countfile, opstate, deps, depskip) must be keyed by
|
||||
run id + harness pid, NEVER by app domain: a second run of the SAME domain executes its
|
||||
main() preamble (state-file init, deploy_app's _record_deploy) BEFORE it blocks at the
|
||||
app lock, so domain-keyed files in the shared tempdir get reset/removed underneath the
|
||||
live first run. Observed live (builds 279/281): false DG4.1 deploy-count=2 in run 1,
|
||||
countfile FileNotFoundError crash in run 2. Children never re-derive these paths — they
|
||||
receive them via the CCCI_*_FILE env vars, so per-process uniqueness is sufficient.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
import run_recipe_ci # noqa: E402
|
||||
from concutil import wait_marker # noqa: E402
|
||||
|
||||
DOMAIN = "fake-abc123.ci.commoninternet.net"
|
||||
|
||||
|
||||
def test_20_state_paths_keyed_by_run_and_pid_never_by_domain(monkeypatch):
|
||||
domain = "immi-ad3e33.ci.commoninternet.net"
|
||||
monkeypatch.setenv("CCCI_APP_DOMAIN", domain)
|
||||
|
||||
monkeypatch.setenv("DRONE_BUILD_NUMBER", "279")
|
||||
p279 = run_recipe_ci._run_state_path("deploys")
|
||||
monkeypatch.setenv("DRONE_BUILD_NUMBER", "281")
|
||||
p281 = run_recipe_ci._run_state_path("deploys")
|
||||
|
||||
# the double-!testme invariant: two runs (same domain) share NO state file
|
||||
assert p279 != p281
|
||||
# keyed by run id + pid, under the tempdir
|
||||
base = os.path.basename(p279)
|
||||
assert base == f"ccci-deploys-279-{os.getpid()}"
|
||||
assert os.path.dirname(p279) == tempfile.gettempdir()
|
||||
# the app domain must not appear in the path at all
|
||||
assert domain not in p279 and domain not in p281
|
||||
|
||||
|
||||
def test_20c_same_domain_runs_each_keep_their_own_count(tmp_path, lock_dir, pool):
|
||||
"""The live CONC-A1 interleaving, with REAL processes + the REAL lock and counter code:
|
||||
run A holds the app lock; run B (same domain) fires its pre-lock _record_deploy and
|
||||
blocks; A then reads its counter — must still be 1 (not polluted by B) — and removes
|
||||
its own file; B acquires and must find ITS file intact (no FileNotFoundError)."""
|
||||
gate = tmp_path / "gate"
|
||||
env_a = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9001"}
|
||||
env_b = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9002"}
|
||||
|
||||
pa, out_a = pool.spawn("deploy-count-run", DOMAIN, str(gate), env_extra=env_a)
|
||||
assert wait_marker(out_a, "ACQUIRED")
|
||||
pb, out_b = pool.spawn("deploy-count-run", DOMAIN, "", env_extra=env_b)
|
||||
# B's main()-preamble + pre-lock increment have fired; B is now blocked on the app lock
|
||||
assert wait_marker(out_b, "PRELOCK")
|
||||
assert wait_marker(out_b, "ACQUIRED", timeout=1.0) is None # still serialised behind A
|
||||
|
||||
gate.touch() # let A read its counter only AFTER B's pre-lock work landed
|
||||
line_a = wait_marker(out_a, "COUNT")
|
||||
assert line_a is not None and line_a.strip() == "COUNT 1", line_a # not 2: B didn't pollute A
|
||||
pa.wait(timeout=15)
|
||||
|
||||
line_b = wait_marker(out_b, "COUNT")
|
||||
assert (
|
||||
line_b is not None and line_b.strip() == "COUNT 1"
|
||||
), line_b # B's file survived A's remove
|
||||
pb.wait(timeout=15)
|
||||
|
||||
|
||||
def test_20b_manual_runs_distinct_via_pid(monkeypatch):
|
||||
# no DRONE_BUILD_NUMBER and no domain/run-id env → run_id() falls back to "manual";
|
||||
# the pid suffix still separates two concurrent hand-runs of the same domain.
|
||||
for var in ("DRONE_BUILD_NUMBER", "CCCI_APP_DOMAIN", "CCCI_RUN_ID"):
|
||||
monkeypatch.delenv(var, raising=False)
|
||||
p = run_recipe_ci._run_state_path("opstate")
|
||||
assert os.path.basename(p) == f"ccci-opstate-manual-{os.getpid()}"
|
||||
Reference in New Issue
Block a user