_held_app_locks: list = []


def release_app_locks() -> None:
    """Release ALL app-domain flocks this process holds.

    Every handle registered in ``_held_app_locks`` is closed (closing the fds
    frees the kernel locks) and the registry is emptied.

    Used by promote_canonical (phase canon): a cold run holds its app lock AND
    any COLD dep's lock (e.g. drone→gitea) for the whole process lifetime, but
    by promote time those apps/deps are already torn down (dep teardown runs in
    the run's finally, before promote). Re-provisioning a cold dep in promote
    would otherwise `acquire_app_lock()` on a lock THIS process still holds
    from the cold test → self-deadlock. Releasing the now-stale locks first
    lets promote re-provision cleanly.

    Safe only because the sweep is SERIAL (no concurrent run could be relying
    on these locks) and the apps they guarded are gone.

    The registry is emptied IN PLACE rather than rebound to a new list, so any
    other reference to the same list object observes the release as well. It
    is emptied BEFORE the handles are closed, so an interruption part-way
    through never leaves already-closed handles registered.
    """
    # Snapshot first: the close loop must not iterate a list being mutated.
    stale_handles = list(_held_app_locks)
    _held_app_locks.clear()
    for handle in stale_handles:
        # Deliberately best-effort: one handle failing to close must not stop
        # the remaining locks from being released.
        with contextlib.suppress(Exception):
            handle.close()
+ lifecycle.release_app_locks() declared = list(meta.DEPS) if declared: try: