fix(2w): WC1.1 reconcile rolls back on deploy FAILURE too (not just unhealthy)

A broken 'latest' can fail during abra's converge (deploy_version raises) instead of
deploying and then turning out unhealthy; wrap the upgrade deploy in a try/except so
BOTH paths trigger the snapshot-restore rollback instead of crashing the reconcile unit.
This commit is contained in:
2026-05-29 01:01:25 +01:00
parent 0812132452
commit 07ea951f31

View File

@ -334,8 +334,17 @@ def reconcile(app: str) -> str:
abra.undeploy(domain)
warmsnap.snapshot(recipe, domain, version=last_good)
# snapshot requires undeployed; now bring up latest.
deploy_version(recipe, domain, latest, dt)
if wait_healthy(spec):
# A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a
# crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a
# deploy exception the same as an unhealthy result.
upgrade_ok = False
try:
deploy_version(recipe, domain, latest, dt)
upgrade_ok = wait_healthy(spec)
except Exception as e: # noqa: BLE001 — a broken release must trigger rollback, not crash the unit
print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True)
upgrade_ok = False
if upgrade_ok:
write_last_good(recipe, latest)
print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True)
return f"upgraded:{last_good}->{latest}"