Files
cc-ci/runner/adv_traefik_rollback.py

82 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""ADVERSARY traefik WC1.1 destructive rollback cold proof (LOW TLS risk).
Stage a fake NEWER traefik tag whose compose fails abra LINT (a non-string mapping entry in the env list → "must be a
string"), so the broken deploy is REJECTED before the running proxy is touched. The reconciler then
exercises the STATELESS rollback path: deploy(latest=broken) fails → redeploy last_good 5.1.1+v3.6.15
(no snapshot — traefik is stateless) → healthy → rollback alert. Asserts traefik stays serving
(ci.commoninternet.net=200) + keycloak-through-traefik=200 throughout/after, last_good unchanged, a
*-rollback.json alert. DEFENSIVE: finally always restores traefik to 5.1.1+v3.6.15 healthy + cleans
the fake tag. Manual recovery if needed: abra app deploy traefik.ci.commoninternet.net 5.1.1+v3.6.15 -o -n -f"""
import os, subprocess, sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import warm_reconcile as wr
# Local checkout of the traefik recipe; every git() call below runs against it.
RDIR = os.path.expanduser("~/.abra/recipes/traefik")
# App domain handed to the warm_reconcile helpers (current_version / deploy_version).
DOMAIN = "traefik.ci.commoninternet.net"
# Version tag the proxy is expected to be on before and after the test.
GOOD = "5.1.1+v3.6.15"
# Temporary tag created by this script to point at the deliberately broken commit.
FAKE = "5.2.0+v3.6.15"
# Directory scanned for newly written *rollback* alert files.
ALERTS = os.path.join(wr.warmsnap.DEFAULT_WARM_ROOT, "alerts")
# Accumulated failure messages; an empty list at the end means PASS.
fails = []
def git(*a, check=True):
    """Run a git subcommand inside the recipe checkout at RDIR.

    The command is always issued with a fixed throw-away identity
    (user.email=adv@cc-ci, user.name=adv) so commits and annotated tags can
    be created without relying on global git configuration.

    Args:
        *a: git subcommand and its arguments, e.g. ``"rev-parse", "HEAD"``.
        check: forwarded to ``subprocess.run``; when true a non-zero exit
            raises ``subprocess.CalledProcessError``.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as text.
    """
    identity = ["-c", "user.email=adv@cc-ci", "-c", "user.name=adv"]
    command = ["git", "-C", RDIR, *identity, *a]
    return subprocess.run(command, capture_output=True, text=True, check=check)
def routed(host="ci.commoninternet.net", path="/"):
    """Probe ``https://{host}{path}`` via the local proxy and return the status code.

    curl is told to resolve ``host:443`` to 127.0.0.1, skip certificate
    verification (-k), discard the response body and print only the HTTP
    status code, with a 10 second overall limit.

    Args:
        host: host name to request (also used for the --resolve pin).
        path: URL path appended to the host.

    Returns:
        curl's stdout, stripped — the HTTP status code as a string.
    """
    url = f"https://{host}{path}"
    pin = f"{host}:443:127.0.0.1"
    curl_cmd = [
        "curl", "-sk",
        "--resolve", pin,
        "-o", "/dev/null",
        "-w", "%{http_code}",
        "--max-time", "10",
        url,
    ]
    probe = subprocess.run(curl_cmd, capture_output=True, text=True)
    return probe.stdout.strip()
def reconcile():
    """Run ``warm_reconcile.py traefik`` in a child process and report its outcome.

    The child inherits the current environment with ``CCCI_SKIP_FETCH=1``
    added and is given at most 1200 seconds. The last 2000 characters of its
    stdout and the last 500 characters of its stderr are echoed for the log.

    Returns:
        The text following ``RECONCILE RESULT:`` on the first stdout line that
        starts with that marker, or ``"<no result rc=N>"`` (N = child exit
        code) when no such line was printed.
    """
    script = os.path.join(os.path.dirname(__file__), "warm_reconcile.py")
    child_env = dict(os.environ)
    child_env["CCCI_SKIP_FETCH"] = "1"
    proc = subprocess.run(
        ["python3", script, "traefik"],
        capture_output=True,
        text=True,
        env=child_env,
        timeout=1200,
    )
    # Echo only the tail of each stream to keep the CI log readable.
    print(proc.stdout[-2000:])
    print(proc.stderr[-500:], file=sys.stderr)

    marker = "RECONCILE RESULT:"
    first_hit = next(
        (ln for ln in proc.stdout.splitlines() if ln.startswith(marker)),
        None,
    )
    if first_hit is None:
        return f"<no result rc={proc.returncode}>"
    return first_hit.split(":", 1)[1].strip()
# ---------------------------------------------------------------------------
# Scenario driver: stage a broken "newer" tag, reconcile, assert the rollback,
# then always clean up and make sure traefik is back on GOOD.
# ---------------------------------------------------------------------------

# Remember where the recipe checkout was so cleanup can put it back.
orig_head = git("rev-parse","HEAD").stdout.strip()
print(f"START traefik TYPE={wr.current_version(DOMAIN)} last_good={wr.read_last_good('traefik')} "
      f"ci={routed()} kc-through={routed('warm-keycloak.ci.commoninternet.net','/realms/master')} orig_head={orig_head[:8]}")
try:
    # Stage a fake NEWER tag whose compose has a lint-breaking env entry:
    # a YAML mapping is injected where the environment list expects strings.
    git("checkout","-fq",GOOD)
    import re  # not used by the visible code; kept so the module-level name stays bound
    cf = os.path.join(RDIR,"compose.yml")
    with open(cf, encoding="utf-8") as fh:
        txt = fh.read()
    # Insert the mapping entry at the top of the first matching 'environment:' block.
    # NOTE(review): the anchor must match compose.yml's whitespace exactly — confirm
    # against the recipe if the guard below ever fires.
    patched = txt.replace(" environment:\n - DASHBOARD_ENABLED",
                          " environment:\n - {advbad: brokenmapping}\n - DASHBOARD_ENABLED",1)
    if patched == txt:
        # Fail loudly here instead of letting the commit step fail for an unclear reason.
        raise RuntimeError(f"could not stage broken compose: environment anchor not found in {cf}")
    with open(cf,"w", encoding="utf-8") as fh:
        fh.write(patched)
    git("commit","-aqm","adv: lint-breaking env for traefik rollback proof")
    broken = git("rev-parse","HEAD").stdout.strip()
    git("tag","-a","-m","adv",FAKE,broken)
    git("checkout","-fq",orig_head)  # leave working tree on the good HEAD; tag keeps broken commit
    print(f"staged fake {FAKE}@{broken[:8]} (lint-breaking); reconcile (expect rollback->{GOOD})...")

    # Snapshot the alert directory so only alerts written by this run are inspected.
    a0 = set(os.listdir(ALERTS)) if os.path.isdir(ALERTS) else set()
    res = reconcile()
    new = sorted((set(os.listdir(ALERTS)) if os.path.isdir(ALERTS) else set())-a0)
    ci, kc = routed(), routed("warm-keycloak.ci.commoninternet.net","/realms/master")
    # Read the post-reconcile state once and reuse it for both the log line and the check.
    cur_version = wr.current_version(DOMAIN)
    last_good = wr.read_last_good("traefik")
    print(f"RESULT={res!r} TYPE={cur_version} last_good={last_good} ci={ci} kc-through={kc} new_alerts={new}")

    if not res.startswith("rolled-back:"):
        fails.append(f"not rolled-back: {res}")
    if last_good != GOOD:
        fails.append(f"last_good changed: {last_good}")
    if ci != "200":
        fails.append(f"traefik not serving after rollback: ci={ci}")
    if kc != "200":
        fails.append(f"keycloak-through-traefik not 200: {kc}")

    rb = [a for a in new if "rollback" in a]
    if not rb:
        fails.append("no rollback alert")
    else:
        import json
        # Only the first (sorted) new rollback alert is validated.
        with open(os.path.join(ALERTS,rb[0])) as fh:
            rec = json.load(fh)
        print(f"rollback alert: {rec}")
        if rec.get("attempted") != FAKE:
            fails.append(f"alert attempted={rec.get('attempted')}")
        if rec.get("last_good") != GOOD:
            fails.append(f"alert last_good={rec.get('last_good')}")
        if rec.get("recovered") is not True:
            fails.append(f"alert recovered={rec.get('recovered')}")
finally:
    # DEFENSIVE recovery: delete fake tag, restore recipe HEAD, ensure traefik on GOOD + healthy.
    # Both git steps are best-effort: a failing checkout must not raise here, otherwise the
    # traefik redeploy below would be skipped while the proxy may still be unhealthy.
    git("tag","-d",FAKE,check=False)
    restore = git("checkout","-fq",orig_head,check=False)
    if restore.returncode != 0:
        msg = f"could not restore recipe HEAD {orig_head[:8]}: {restore.stderr.strip()}"
        print(f"!! {msg}", flush=True)
        fails.append(msg)
    if wr.current_version(DOMAIN) != GOOD or routed() != "200":
        print("!! defensive recovery: redeploying traefik GOOD", flush=True)
        try:
            wr.deploy_version("traefik",DOMAIN,GOOD,600)
            wr.wait_healthy(wr.SPECS["traefik"])
        except Exception as e:
            # Best-effort: the final serving check below reports the outcome.
            print(f"!! recovery deploy error: {e}")

# Final state report and verdict.
fin_ci = routed()
fin_kc = routed("warm-keycloak.ci.commoninternet.net","/realms/master")
fake_left = [t for t in git("tag").stdout.split() if t == FAKE]
print(f"END TYPE={wr.current_version(DOMAIN)} last_good={wr.read_last_good('traefik')} ci={fin_ci} kc-through={fin_kc} fake_tag_left={fake_left}")
if fin_ci != "200":
    fails.append(f"FINAL traefik not serving: {fin_ci}")
if fake_left:
    fails.append("fake tag not cleaned")
print("\nRESULT:", "FAIL: "+"; ".join(fails) if fails else
      "PASS — traefik WC1.1 stateless rollback: broken-latest deploy rejected → rolled back to last_good 5.1.1+v3.6.15, traefik+routes healthy (no TLS outage), alert written, cert/config preserved")
sys.exit(1 if fails else 0)