fix(2w): WC1.1 reconcile rolls back on deploy FAILURE too (not just unhealthy)

A broken 'latest' can make abra's converge fail outright (deploy_version raises) instead of deploying and then turning out unhealthy. Wrap the upgrade deploy so that BOTH failure paths trigger the snapshot-restore rollback rather than crashing the reconcile unit.
2026-05-29 01:01:25 +01:00
parent 0812132452
commit 07ea951f31
1 changed files with 11 additions and 2 deletions
--- a/runner/warm_reconcile.py
+++ b/runner/warm_reconcile.py
@@ -334,8 +334,17 @@ def reconcile(app: str) -> str:
        abra.undeploy(domain)
        warmsnap.snapshot(recipe, domain, version=last_good)
        # snapshot requires undeployed; now bring up latest.
-    deploy_version(recipe, domain, latest, dt)
-    if wait_healthy(spec):
+    # A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a
+    # crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a
+    # deploy exception the same as an unhealthy result.
+    upgrade_ok = False
+    try:
+        deploy_version(recipe, domain, latest, dt)
+        upgrade_ok = wait_healthy(spec)
+    except Exception as e:  # noqa: BLE001 — a broken release must trigger rollback, not crash the unit
+        print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True)
+        upgrade_ok = False
+    if upgrade_ok:
        write_last_good(recipe, latest)
        print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True)
        return f"upgraded:{last_good}->{latest}"