def _app_container(
    domain: str, service: str = "app", timeout: float = 60, *, interval: float = 3
) -> str:
    """Return the running container id for ``<stack>_<service>``, polling until it (re)appears.

    A lifecycle op can briefly leave no running task — notably `abra app backup create`, where
    backup-bot-two stops/cycles the app container, so a mutate exec right after backup hit an
    empty `docker ps` and raised. Instead of failing on the first empty result, poll `docker ps`
    until a container shows up or ``timeout`` seconds have elapsed.

    Args:
        domain: App domain; passed to ``_stack_name`` to derive the stack prefix.
        service: Service suffix of the container name (``<stack>_<service>``).
        timeout: Upper bound, in seconds, on how long to keep polling. ``0`` means a single
            attempt with no waiting.
        interval: Seconds to wait between polls; the last wait is shortened so the call does
            not overrun ``timeout``.

    Returns:
        The first container id printed by ``docker ps`` for the name filter.

    Raises:
        RuntimeError: No container was listed before the deadline. If the last ``docker ps``
            wrote to stderr (e.g. the daemon is unreachable), that text is appended.
    """
    name = f"{_stack_name(domain)}_{service}"
    # Monotonic clock: a wall-clock step (NTP sync, manual change) must not stretch or cut the wait.
    deadline = time.monotonic() + timeout
    while True:
        proc = subprocess.run(
            ["docker", "ps", "--filter", f"name={name}", "--format", "{{.ID}}"],
            capture_output=True,
            text=True,
        )
        # Several containers may match the filter; keep the first id listed.
        cid = proc.stdout.strip().split("\n")[0]
        if cid:
            return cid
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            message = f"no running container for {name} after {timeout}s"
            # A failing `docker ps` also yields empty stdout; surface why instead of hiding it.
            detail = proc.stderr.strip()
            if detail:
                message = f"{message}: {detail}"
            raise RuntimeError(message)
        # Never sleep past the deadline, so the final poll happens right at the timeout.
        time.sleep(min(interval, remaining))