From 07ea951f31d25040bd5ad47f3d935a263c97e54d Mon Sep 17 00:00:00 2001
From: autonomic-bot <maxf.account@proton.me>
Date: Fri, 29 May 2026 01:01:25 +0100
Subject: [PATCH] fix(2w): WC1.1 reconcile rolls back on deploy FAILURE too
 (not just unhealthy)

A broken 'latest' can make abra's converge fail outright (deploy_version raises)
instead of deploying and then turning out unhealthy. Wrap the upgrade deploy and
the wait_healthy check in one try block so that BOTH paths trigger the
snapshot-restore rollback instead of crashing the reconcile unit.
---
 runner/warm_reconcile.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py
index 738897f..d51a6d6 100644
--- a/runner/warm_reconcile.py
+++ b/runner/warm_reconcile.py
@@ -334,8 +334,17 @@ def reconcile(app: str) -> str:
         abra.undeploy(domain)
         warmsnap.snapshot(recipe, domain, version=last_good)
         # snapshot requires undeployed; now bring up latest.
-    deploy_version(recipe, domain, latest, dt)
-    if wait_healthy(spec):
+    # A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a
+    # crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a
+    # deploy exception the same as an unhealthy result.
+    upgrade_ok = False
+    try:
+        deploy_version(recipe, domain, latest, dt)
+        upgrade_ok = wait_healthy(spec)
+    except Exception as e:  # noqa: BLE001 — a broken release must trigger rollback, not crash the unit
+        print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True)
+        upgrade_ok = False
+    if upgrade_ok:
         write_last_good(recipe, latest)
         print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True)
         return f"upgraded:{last_good}->{latest}"