diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py index 738897f..d51a6d6 100644 --- a/runner/warm_reconcile.py +++ b/runner/warm_reconcile.py @@ -334,8 +334,17 @@ def reconcile(app: str) -> str: abra.undeploy(domain) warmsnap.snapshot(recipe, domain, version=last_good) # snapshot requires undeployed; now bring up latest. - deploy_version(recipe, domain, latest, dt) - if wait_healthy(spec): + # A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a + # crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a + # deploy exception the same as an unhealthy result. + upgrade_ok = False + try: + deploy_version(recipe, domain, latest, dt) + upgrade_ok = wait_healthy(spec) + except Exception as e: # noqa: BLE001 — a broken release must trigger rollback, not crash the unit + print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True) + upgrade_ok = False + if upgrade_ok: write_last_good(recipe, latest) print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True) return f"upgraded:{last_good}->{latest}"