From 07ea951f31d25040bd5ad47f3d935a263c97e54d Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Fri, 29 May 2026 01:01:25 +0100 Subject: [PATCH] fix(2w): WC1.1 reconcile rolls back on deploy FAILURE too (not just unhealthy) A broken 'latest' can fail abra's converge (deploy_version raises) rather than deploy-then-be-unhealthy; wrap the upgrade deploy so BOTH paths trigger the snapshot-restore rollback instead of crashing the reconcile unit. --- runner/warm_reconcile.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/runner/warm_reconcile.py b/runner/warm_reconcile.py index 738897f..d51a6d6 100644 --- a/runner/warm_reconcile.py +++ b/runner/warm_reconcile.py @@ -334,8 +334,17 @@ def reconcile(app: str) -> str: abra.undeploy(domain) warmsnap.snapshot(recipe, domain, version=last_good) # snapshot requires undeployed; now bring up latest. - deploy_version(recipe, domain, latest, dt) - if wait_healthy(spec): + # A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a + # crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a + # deploy exception the same as an unhealthy result. + upgrade_ok = False + try: + deploy_version(recipe, domain, latest, dt) + upgrade_ok = wait_healthy(spec) + except Exception as e: # noqa: BLE001 — a broken release must trigger rollback, not crash the unit + print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True) + upgrade_ok = False + if upgrade_ok: write_last_good(recipe, latest) print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True) return f"upgraded:{last_good}->{latest}"