From 655a9998be4ceeefed0205baf6069ece2160bdee Mon Sep 17 00:00:00 2001
From: autonomic-bot
Date: Wed, 17 Jun 2026 10:04:14 +0000
Subject: [PATCH] fix(canon): release cold-run app/dep locks before promote (cold-dep self-deadlock)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drone (DEPS=[gitea], a COLD dep) deadlocked in promote: the cold test holds the gitea dep's
app-lock for the whole process lifetime, and promote's _provision_deps re-acquires the same lock
in the same process → blocks forever.

By promote time the cold test + its deps are torn down (dep teardown runs in the run finally,
before promote), so the locks are stale. New lifecycle.release_app_locks() frees them at promote
start; the serial sweep guarantees no concurrent run relies on them.

lasuite-* (warm keycloak dep) were unaffected (no cold deploy).

Co-Authored-By: Claude Opus 4.8
---
 runner/harness/lifecycle.py | 17 +++++++++++++++++
 runner/run_recipe_ci.py     |  4 ++++
 2 files changed, 21 insertions(+)

diff --git a/runner/harness/lifecycle.py b/runner/harness/lifecycle.py
index 583e4bf..8c827ed 100644
--- a/runner/harness/lifecycle.py
+++ b/runner/harness/lifecycle.py
@@ -49,6 +49,23 @@ class TeardownError(RuntimeError):
 _held_app_locks: list = []
 
 
+def release_app_locks() -> None:
+    """Release ALL app-domain flocks this process holds (closing the fds frees the kernel locks).
+
+    Used by promote_canonical (phase canon): a cold run holds its app lock AND any COLD dep's lock
+    (e.g. drone→gitea) for the whole process lifetime, but by promote time those apps/deps are
+    already torn down (dep teardown runs in the run's finally, before promote). Re-provisioning a
+    cold dep in promote would otherwise `acquire_app_lock()` on a lock THIS process still
+    holds from the cold test → self-deadlock. Releasing the now-stale locks first lets promote
+    re-provision cleanly. Safe only because the sweep is SERIAL (no concurrent run could be relying
+    on these locks) and the apps they guarded are gone."""
+    global _held_app_locks
+    for f in _held_app_locks:
+        with contextlib.suppress(Exception):
+            f.close()  # best-effort: one handle failing to close must not keep the rest locked
+    _held_app_locks = []  # forget the closed handles so a later call starts from an empty list
+
+
 def _app_lock_dir() -> str:
     """The app-domain lockfile dir. /run/lock (tmpfs: a reboot clears locks AND lockfiles, so
     post-reboot apps probe as orphans and are reaped immediately). Env-overridable so the
diff --git a/runner/run_recipe_ci.py b/runner/run_recipe_ci.py
index 0599dbd..ee719f3 100644
--- a/runner/run_recipe_ci.py
+++ b/runner/run_recipe_ci.py
@@ -962,6 +962,10 @@ def promote_canonical(
         flush=True,
     )
     # Faithful install wiring: deps (OIDC) then install_steps (via deploy_app's hook), same as cold.
+    # Release the cold run's process-lifetime app/dep locks first: the cold test + its deps are torn
+    # down by now, but their locks are still held by THIS process, so re-provisioning a COLD dep
+    # (e.g. drone→gitea) would self-deadlock on acquire_app_lock. Serial sweep → safe to release.
+    lifecycle.release_app_locks()
     declared = list(meta.DEPS)
     if declared:
         try: