fix(2w): WC1.1 reconcile rolls back on deploy FAILURE too (not just unhealthy)
A broken 'latest' can fail during abra's converge (deploy_version raises) instead of deploying and then reporting unhealthy; wrap the upgrade deploy so that BOTH paths trigger the snapshot-restore rollback instead of crashing the reconcile unit.
This commit is contained in:
@@ -334,8 +334,17 @@ def reconcile(app: str) -> str:
|
||||
abra.undeploy(domain)
|
||||
warmsnap.snapshot(recipe, domain, version=last_good)
|
||||
# snapshot requires undeployed; now bring up latest.
|
||||
deploy_version(recipe, domain, latest, dt)
|
||||
if wait_healthy(spec):
|
||||
# A broken "latest" can fail in two ways: deploy_version raises (abra converge times out on a
|
||||
# crash-looping task) OR it deploys but never becomes healthy. BOTH must roll back, so treat a
|
||||
# deploy exception the same as an unhealthy result.
|
||||
upgrade_ok = False
|
||||
try:
|
||||
deploy_version(recipe, domain, latest, dt)
|
||||
upgrade_ok = wait_healthy(spec)
|
||||
except Exception as e: # noqa: BLE001 — a broken release must trigger rollback, not crash the unit
|
||||
print(f"[{app}] deploy of latest {latest} failed: {e}", flush=True)
|
||||
upgrade_ok = False
|
||||
if upgrade_ok:
|
||||
write_last_good(recipe, latest)
|
||||
print(f"[{app}] upgrade healthy → committed last-good={latest}", flush=True)
|
||||
return f"upgraded:{last_good}->{latest}"
|
||||
|
||||
Reference in New Issue
Block a user