fix(1d): bounded retry in _app_container (backup briefly cycles the app container)

`abra app backup create` (backup-bot-two) stops/cycles the app container, so a mutating `exec_in_app`
call made right after a backup found an empty `docker ps` result and raised. `_app_container` now polls
(rather than relying on a single fixed sleep) for the container to reappear within a timeout.
Recipe-agnostic harness robustness.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-28 00:06:28 +01:00
parent 81e26a1bdc
commit feb6f80d50

View File

@ -295,18 +295,25 @@ def previous_version(recipe: str) -> str | None:
return vers[-2] if len(vers) >= 2 else None
def _app_container(domain: str, service: str = "app", timeout: int = 60) -> str:
    """Return the running container id for <stack>_<service>, polling until it (re)appears.

    A lifecycle op can briefly leave no running task — notably `abra app backup create`, where
    backup-bot-two stops/cycles the app container, so a mutate exec right after backup hit an
    empty `docker ps` and raised. Poll `docker ps` until the container is back or `timeout`
    seconds have elapsed.

    Args:
        domain: App domain; passed to `_stack_name` to derive the stack prefix.
        service: Service name appended to the stack name (default "app").
        timeout: Maximum number of seconds to keep polling before giving up.

    Returns:
        The first container id printed by `docker ps` for the name filter.

    Raises:
        RuntimeError: if no matching container is listed before the deadline. When the last
            `docker ps` call wrote to stderr, that text is appended to the message.
    """
    name = f"{_stack_name(domain)}_{service}"
    poll_interval = 3  # seconds between `docker ps` attempts
    # Monotonic clock: the deadline must not move if the wall clock is adjusted mid-wait.
    deadline = time.monotonic() + timeout
    while True:
        proc = subprocess.run(
            ["docker", "ps", "--filter", f"name={name}", "--format", "{{.ID}}"],
            capture_output=True,
            text=True,
        )
        # First output line = first matching container; empty string when nothing matched.
        cid = proc.stdout.strip().split("\n")[0]
        if cid:
            return cid
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            # Surface docker's own complaint (if any) so a failing `docker ps` is not
            # mistaken for a container that simply never came back.
            detail = proc.stderr.strip()
            suffix = f": {detail}" if detail else ""
            raise RuntimeError(f"no running container for {name} after {timeout}s{suffix}")
        # Never sleep past the deadline, so the total wait stays bounded by `timeout`.
        time.sleep(min(poll_interval, remaining))
def exec_in_app(domain: str, cmd: list[str], service: str = "app") -> str: