diff --git a/machine-docs/BUILDER-INBOX.md b/machine-docs/BUILDER-INBOX.md new file mode 100644 index 0000000..5539884 --- /dev/null +++ b/machine-docs/BUILDER-INBOX.md @@ -0,0 +1,11 @@ + +## [adversary heads-up] @2026-05-29T22:35Z — leftover drone stack on cc-ci (housekeeping) +Cold re-orient after the spend-limit lift. `docker stack ls` on cc-ci shows a live +`drone_ci_commoninternet_net` stack (`drone/drone:2.26.0`, app 1/1, deployed ~2d ago; task history shows +Failed at 15h/32h/2d) + volume `drone_ci_commoninternet_net_data`, left from the drone+gitea smoke. +Not a gate-FAIL (drone isn't claimed DONE) — just flagging the node isn't fully clean. **Is this +intentional pre-staging** for the post-host-fix gitea+drone integration, or a forgotten smoke leftover? +If the latter, `docker stack rm drone_ci_commoninternet_net && docker volume rm drone_ci_commoninternet_net_data` +at your leisure. Also confirmed cold: host `/etc/timezone` is STILL absent → drone block (`3bde76f`) +genuinely awaits the operator nixos-rebuild; deferral stands as sound. No action needed from me until a +gate is claimed. diff --git a/machine-docs/REVIEW-2.md b/machine-docs/REVIEW-2.md index ed8aa64..c822c53 100644 --- a/machine-docs/REVIEW-2.md +++ b/machine-docs/REVIEW-2.md @@ -1461,3 +1461,27 @@ justified. No VETO. Advances P1 coverage (mailu enrolled). **Isolation note:** verdict formed from the plan + code (lifecycle/abra/run_recipe_ci + the mailu test files) + STATUS claim verification info + my own two cold re-runs + direct recipe/host inspection. JOURNAL-2 not consulted before this verdict. + +--- +## Resume checkpoint @2026-05-29T22:35Z (spend-limit lift; cold re-orient) +Pulled to `1857733`. **No gate is CLAIMED awaiting Adversary.** State of play: +- **Q4.2 mumble — PASS** (REVIEW-2 `1daa1ea`, ACK `e36656f`). DONE. +- **Q4.9 mailu — PASS** (REVIEW-2 `2958eb6`, ACK `25ae293`). DONE. 
+- **Q4.6 discourse — deferral VERIFIED SOUND** (`594f2d3`); upstream bitnami images gone (§8 env-blocker). +- **Q4.10 drone — BLOCKED, deferral genuine.** Re-entry trigger is `ssh cc-ci 'cat /etc/timezone' = UTC`. + Cold-checked the host: **`/etc/timezone` is still absent** (`ls: cannot access '/etc/timezone'`), so the + gitea SCM dep still can't boot and the block is real — operator host-deploy of `3bde76f` has NOT landed. + Integration is scoped (JOURNAL-2 `f86a58a`); I'll weigh the §4.3 build-creation §7.1 sign-off only once + the maximal subset is actually run green (not pre-clearing un-built content). +- **Q3.5 immich — P4 restore RED still OPEN** (BACKLOG-2 Q3.5): upstream recipe uses live-volume backup + (no pg_dump hook) → postgres `ci_marker` doesn't survive restore. Builder to choose recipe-PR vs §7.1 + sign-off on the maximal subset; I have NOT signed off — this is a real P4 gap on a claimed-enrolled recipe. +- **Q5.1 docs (`1857733`) landed** but is not claimed as a gate; P8 verification deferred until claimed. + +**Break-it probe — leftover stack on cc-ci (housekeeping, NOT a gate-FAIL).** `docker stack ls` shows a +`drone_ci_commoninternet_net` stack (app `drone/drone:2.26.0` 1/1, deployed ~2d ago, task failures at +15h/32h/2d) + volume `drone_ci_commoninternet_net_data`, left over from the drone+gitea smoke. drone is +not claimed DONE so this is not a teardown-gate failure, but the node is NOT "clean" — flagged to Builder +inbox (same housekeeping class as the prior `mumb-smoke`/`mail-smoke` leftovers; remove at leisure or +confirm it's intentional pre-staging for the post-host-fix integration). `warm-keycloak` (warm SSO dep), +`backups`, `ccci-bridge`, `ccci-dashboard`, `traefik` are expected infra. diff --git a/runner/adv_check4.py b/runner/adv_check4.py new file mode 100644 index 0000000..4e6808b --- /dev/null +++ b/runner/adv_check4.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +"""ADVERSARY check4 (WC1 concurrency + reaping, deploy-free). 
Cold-run from my own clone. +Asserts: realm_for distinct per run-hex; realms create on live warm kc + oidc_password_grant returns +a JWT each; reap_orphaned_realms keeps the live hex and deletes the orphans. Leaves kc clean.""" +import sys, os +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from harness import warm, sso + +D = warm.warm_domain("keycloak") +assert D == "warm-keycloak.ci.commoninternet.net", D +fails = [] + +# 1) realm_for distinct per run-hex (two concurrent same-recipe runs never collide) +r_a = warm.realm_for("lasuite-docs", "lasu-aaa111.ci.commoninternet.net") +r_b = warm.realm_for("lasuite-docs", "lasu-bbb222.ci.commoninternet.net") +print(f"realm_for aaa111={r_a!r} bbb222={r_b!r}") +if r_a != "lasuite-docs-aaa111": fails.append(f"realm_for aaa111 -> {r_a}") +if r_b != "lasuite-docs-bbb222": fails.append(f"realm_for bbb222 -> {r_b}") +if r_a == r_b: fails.append("realm_for collision") + +admin = sso.admin_password_inside(D) +before = sorted(sso.list_realms(D, admin)) +print(f"realms BEFORE: {before}") + +# 2) create three realms; each must yield a working password-grant JWT +hexes = ["aaa111", "bbb222", "ccc333"] +created = [] +for h in hexes: + realm = f"advchk-{h}" + creds = sso.setup_keycloak_realm(D, realm, f"client-{h}", redirect_uris=["*"], web_origins=["*"]) + created.append(realm) + tok = sso.oidc_password_grant(creds) + ok = isinstance(tok, str) and tok.count(".") == 2 and len(tok) > 40 + print(f" {realm}: JWT={'OK' if ok else 'BAD'} (len={len(tok)}, dots={tok.count('.')})") + if not ok: fails.append(f"{realm} no/!JWT") + # confirm discovery issuer too (independent re-check) + disc = sso.assert_discovery_endpoint(creds) + if disc.get("issuer") != f"https://{D}/realms/{realm}": fails.append(f"{realm} issuer") + +mid = sorted(sso.list_realms(D, admin)) +print(f"realms AFTER CREATE: {mid}") +for realm in created: + if realm not in mid: fails.append(f"{realm} not present after create") + +# 3) reap with live_hexes={aaa111}: 
must delete bbb222+ccc333, KEEP aaa111 +reaped = sorted(sso.reap_orphaned_realms(D, live_hexes={"aaa111"})) +print(f"REAPED (live=aaa111): {reaped}") +if reaped != ["advchk-bbb222", "advchk-ccc333"]: fails.append(f"reaped set wrong: {reaped}") +after = sorted(sso.list_realms(D, admin)) +print(f"realms AFTER REAP: {after}") +if "advchk-aaa111" not in after: fails.append("aaa111 wrongly reaped (live run would lose its realm)") +if "advchk-bbb222" in after or "advchk-ccc333" in after: fails.append("orphan not reaped") + +# cleanup: remove aaa111 too; leave kc with only master (+ any pre-existing non-advchk realms) +sso.delete_keycloak_realm(D, "advchk-aaa111", admin) +final = sorted(sso.list_realms(D, admin)) +print(f"realms FINAL (after cleanup): {final}") +leftover = [r for r in final if r.startswith("advchk-")] +if leftover: fails.append(f"leftover advchk realms: {leftover}") + +print("\nRESULT:", "FAIL " + "; ".join(fails) if fails else "PASS — all check4 assertions hold") +sys.exit(1 if fails else 0) diff --git a/runner/adv_check5.py b/runner/adv_check5.py new file mode 100644 index 0000000..045e02a --- /dev/null +++ b/runner/adv_check5.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +"""ADVERSARY check5 — WC1.1 MARQUEE health-gated rollback with data integrity (live, cold). +Run via cc-ci-run from /root/cc-ci-adv-verify on cc-ci (PATH has abra/docker/git). + +Independent reproduce (does NOT trust the Builder's run): + A. plant a MARKER realm on warm kc (the data whose survival proves integrity) + B. stage fake tag 10.7.9+26.6.2 at the good commit -> reconcile -> expect HEALTHY upgrade, + last_good advances to 10.7.9, marker preserved + C. stage broken commit (KC_HOSTNAME=:::bad-host:::) tagged 10.7.10+26.6.2 -> reconcile -> + expect ROLLBACK to 10.7.9, kc HEALTHY, marker INTACT, last_good NOT advanced, rollback alert + D. 
cleanup: delete fake tags + broken commit, reconcile back to canonical 10.7.1+26.6.2, delete marker +""" +import json, os, subprocess, sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from harness import sso +import warm_reconcile as wr + +RECIPE, APP = "keycloak", "keycloak" +D = "warm-keycloak.ci.commoninternet.net" +RDIR = os.path.expanduser("~/.abra/recipes/keycloak") +GOOD = "04400df" # HEAD = chore: upgrade to 10.7.1+26.6.2 +CANON = "10.7.1+26.6.2" +T_GOOD = "10.7.9+26.6.2" # fake, points at good commit +T_BAD = "10.7.10+26.6.2" # fake, points at broken-KC_HOSTNAME commit +MARKER = "advmarker-rollback" +ALERTS = os.path.join(wr.warmsnap.DEFAULT_WARM_ROOT, "alerts") +fails = [] + +def git(*a, check=True): + return subprocess.run(["git", "-C", RDIR, "-c", "user.email=adv@cc-ci", "-c", "user.name=adv", + *a], capture_output=True, text=True, check=check) + +def reconcile(): + """Run the reconciler exactly as the unit would, but CCCI_SKIP_FETCH so my staged tags stand.""" + env = {**os.environ, "CCCI_SKIP_FETCH": "1"} + r = subprocess.run(["python3", os.path.join(os.path.dirname(__file__), "warm_reconcile.py"), APP], + capture_output=True, text=True, env=env, timeout=1800) + print(r.stdout[-1500:]); print(r.stderr[-600:], file=sys.stderr) + for line in r.stdout.splitlines(): + if line.startswith("RECONCILE RESULT:"): + return line.split(":", 1)[1].strip() + return "no RECONCILE RESULT line in reconciler output" + +def health(): + return wr.health_code(wr.SPECS[APP]) + +def realms(): + return sorted(sso.list_realms(D)) + +def last_good(): + return wr.read_last_good(RECIPE) + +def type_env(): + return wr.current_version(D) + +print(f"=== START: TYPE={type_env()} last_good={last_good()} health={health()} realms={realms()}") + +# ---- A. plant marker realm (data) ---- +sso.setup_keycloak_realm(D, MARKER, "marker-client", redirect_uris=["*"], web_origins=["*"]) +assert MARKER in realms(), "marker realm not created" +print(f"[A] marker realm planted: {MARKER in realms()}") + +# ---- B.
healthy upgrade to fake 10.7.9 ---- +git("tag", "-a", "-m", "adv", T_GOOD, GOOD + "^{commit}", check=False) +wr.write_last_good(RECIPE, CANON) # baseline last_good = canonical +print(f"[B] staged {T_GOOD}@good; reconcile #1 (expect upgrade->{T_GOOD})...") +res1 = reconcile() +print(f"[B] result={res1!r} last_good={last_good()} health={health()} markerIntact={MARKER in realms()}") +if not res1.startswith("upgraded:"): fails.append(f"B not upgraded: {res1}") +if last_good() != T_GOOD: fails.append(f"B last_good={last_good()} != {T_GOOD}") +if health() != 200: fails.append(f"B health={health()}") +if MARKER not in realms(): fails.append("B marker lost on healthy upgrade") + +# ---- C. broken latest 10.7.10 -> rollback ---- +import shutil +compose = os.path.join(RDIR, "compose.yml") +bak = compose + ".advbak"; shutil.copy(compose, bak) +txt = open(compose).read().replace("KC_HOSTNAME=https://${DOMAIN}", "KC_HOSTNAME=:::bad-host:::") +open(compose, "w").write(txt) +git("commit", "-am", "adv broken KC_HOSTNAME") +broken_sha = git("rev-parse", "HEAD").stdout.strip() +git("tag", "-a", "-m", "adv", T_BAD, broken_sha) +git("reset", "--hard", GOOD) # branch back to good; tag keeps the broken commit alive +shutil.copy(bak, compose); os.remove(bak) +alerts_before = set(os.listdir(ALERTS)) if os.path.isdir(ALERTS) else set() +print(f"[C] staged broken {T_BAD}@{broken_sha[:7]}; reconcile #2 (expect rollback->{T_GOOD})... 
(broken deploy may take minutes)") +res2 = reconcile() +alerts_after = set(os.listdir(ALERTS)) if os.path.isdir(ALERTS) else set() +new_alerts = sorted(alerts_after - alerts_before) +print(f"[C] result={res2!r} last_good={last_good()} health={health()} markerIntact={MARKER in realms()} newAlerts={new_alerts}") +if not res2.startswith("rolled-back:"): fails.append(f"C not rolled-back: {res2}") +if health() != 200: fails.append(f"C kc unhealthy after rollback: {health()}") +if MARKER not in realms(): fails.append("C MARKER LOST — data integrity FAILED on rollback") +if last_good() != T_GOOD: fails.append(f"C last_good advanced to {last_good()} (should stay {T_GOOD})") +rb = [a for a in new_alerts if "rollback" in a] +if not rb: fails.append("C no rollback alert written") +else: + rec = json.load(open(os.path.join(ALERTS, rb[0]))) + print(f"[C] rollback alert: {rec}") + if rec.get("attempted") != T_BAD: fails.append(f"alert attempted={rec.get('attempted')}") + if rec.get("last_good") != T_GOOD: fails.append(f"alert last_good={rec.get('last_good')}") + if rec.get("recovered") is not True: fails.append(f"alert recovered={rec.get('recovered')}") + +# ---- D. 
cleanup + restore canonical ---- +print("[D] cleanup: delete fake tags, reconcile back to canonical, delete marker...") +git("tag", "-d", T_GOOD, check=False); git("tag", "-d", T_BAD, check=False) +git("reset", "--hard", GOOD) +res3 = reconcile() # intent: latest now = real CANON; current(env)=10.7.9 -> redeploys to CANON. NOTE(review): adv_recover.py reports this cleanup left kc undeployed on TYPE=10.7.9 because that tag was deleted just above - confirm ordering +print(f"[D] result={res3!r} TYPE={type_env()} last_good={last_good()} health={health()}") +sso.delete_keycloak_realm(D, MARKER) +if type_env() != CANON: fails.append(f"D not restored to canonical TYPE: {type_env()}") +if health() != 200: fails.append(f"D final health={health()}") +fin = realms() +if MARKER in fin: fails.append("D marker not cleaned") +print(f"=== END: TYPE={type_env()} last_good={last_good()} health={health()} realms={fin}") + +print("\nRESULT:", "FAIL: " + "; ".join(fails) if fails else "PASS — WC1.1 marquee: healthy upgrade commits, broken latest rolls back with marker realm (data) INTACT, last_good not advanced, alert correct, canonical restored") +sys.exit(1 if fails else 0) diff --git a/runner/adv_check6.py b/runner/adv_check6.py new file mode 100644 index 0000000..aca1bb8 --- /dev/null +++ b/runner/adv_check6.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +"""ADVERSARY check6 — WC1.2 pre-deploy SAFETY gate (live, cold). A hold must do NO deploy.
+ (a) MAJOR fake tag 11.0.0+27.0.0 -> held-major, alert, kc untouched (TYPE same, 200) + (b) minor tag 10.7.2+26.6.3 + releaseNotes flagging manual migration -> held-manual-migration, + alert CARRIES the notes, kc untouched +Leaves the recipe + kc exactly as found (canonical, no fake tags/notes).""" +import json, os, subprocess, sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +import warm_reconcile as wr + +D = "warm-keycloak.ci.commoninternet.net" +RDIR = os.path.expanduser("~/.abra/recipes/keycloak") +GOOD = "04400df"; CANON = "10.7.1+26.6.2" +T_MAJOR = "11.0.0+27.0.0"; T_MINOR = "10.7.2+26.6.3" +NOTES = os.path.join(RDIR, "releaseNotes", T_MINOR + ".md") +ALERTS = os.path.join(wr.warmsnap.DEFAULT_WARM_ROOT, "alerts") +fails = [] + +def git(*a, check=True): + return subprocess.run(["git", "-C", RDIR, "-c", "user.email=adv@cc-ci", "-c", "user.name=adv", *a], + capture_output=True, text=True, check=check) +def reconcile(): + env = {**os.environ, "CCCI_SKIP_FETCH": "1"} + r = subprocess.run(["python3", os.path.join(os.path.dirname(__file__), "warm_reconcile.py"), "keycloak"], + capture_output=True, text=True, env=env, timeout=300) + for line in r.stdout.splitlines(): + if line.startswith("RECONCILE RESULT:"): return line.split(":", 1)[1].strip() + return f"no RECONCILE RESULT line: {r.stdout[-300:]} {r.stderr[-300:]}" +def alerts_now(): return set(os.listdir(ALERTS)) if os.path.isdir(ALERTS) else set() + +type0, lg0 = wr.current_version(D), wr.read_last_good("keycloak") +print(f"START TYPE={type0} last_good={lg0} health={wr.health_code(wr.SPECS['keycloak'])}") +assert type0 == CANON, f"precond: kc not canonical ({type0})" + +# (a) MAJOR -> held-major, no deploy +git("reset", "--hard", GOOD) +git("tag", "-a", "-m", "adv", T_MAJOR, GOOD + "^{commit}", check=False) +a0 = alerts_now() +res_a = reconcile() +new_a = sorted(alerts_now() - a0) +print(f"(a) MAJOR {T_MAJOR}: result={res_a!r} TYPE={wr.current_version(D)} new_alerts={new_a}") +if not res_a.startswith("held-major"):
fails.append(f"(a) not held-major: {res_a}") +if wr.current_version(D) != CANON: fails.append(f"(a) kc TYPE changed to {wr.current_version(D)}") +hm = [x for x in new_a if "held-major" in x] +if not hm: fails.append("(a) no held-major alert") +else: + rec = json.load(open(os.path.join(ALERTS, hm[0]))) + if rec.get("latest") != T_MAJOR: fails.append(f"(a) alert latest={rec.get('latest')}") + if "release_notes" not in rec: fails.append("(a) alert missing release_notes field") +git("tag", "-d", T_MAJOR, check=False) + +# (b) minor + manual-migration notes -> held-manual-migration, no deploy, alert carries notes +git("tag", "-a", "-m", "adv", T_MINOR, GOOD + "^{commit}", check=False) +os.makedirs(os.path.dirname(NOTES), exist_ok=True) +open(NOTES, "w").write("# 10.7.2\n\nThis release requires a **manual migration**: run the DB upgrade by hand.\n") +b0 = alerts_now() +res_b = reconcile() +new_b = sorted(alerts_now() - b0) +print(f"(b) MINOR+migration {T_MINOR}: result={res_b!r} TYPE={wr.current_version(D)} new_alerts={new_b}") +if not res_b.startswith("held-manual-migration"): fails.append(f"(b) not held-manual-migration: {res_b}") +if wr.current_version(D) != CANON: fails.append(f"(b) kc TYPE changed to {wr.current_version(D)}") +hmm = [x for x in new_b if "manual-migration" in x] +if not hmm: fails.append("(b) no held-manual-migration alert") +else: + rec = json.load(open(os.path.join(ALERTS, hmm[0]))) + if "manual migration" not in (rec.get("release_notes") or "").lower(): + fails.append(f"(b) alert release_notes lacks the notes: {rec.get('release_notes')!r}") + +# cleanup +git("tag", "-d", T_MINOR, check=False) +if os.path.exists(NOTES): os.remove(NOTES) +git("reset", "--hard", GOOD) +faketags = [t for t in git("tag").stdout.split() if t in (T_MAJOR, T_MINOR)] +print(f"END TYPE={wr.current_version(D)} last_good={wr.read_last_good('keycloak')} " + f"health={wr.health_code(wr.SPECS['keycloak'])} faketags={faketags} notes_exists={os.path.exists(NOTES)}") +if 
wr.current_version(D) != CANON: fails.append(f"END not canonical: {wr.current_version(D)}") +if faketags: fails.append(f"leftover fake tags {faketags}") +if wr.read_last_good("keycloak") != CANON: fails.append(f"last_good moved to {wr.read_last_good('keycloak')}") + +print("\nRESULT:", "FAIL: " + "; ".join(fails) if fails else "PASS — WC1.2 holds major + manual-migration with notes-carrying alert; kc untouched (no deploy/last_good churn)") +sys.exit(1 if fails else 0) diff --git a/runner/adv_check_wc2.py b/runner/adv_check_wc2.py new file mode 100644 index 0000000..48aba1c --- /dev/null +++ b/runner/adv_check_wc2.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""ADVERSARY WC2+WC3 cold reproduce (data-warm canonical round-trip). Run via cc-ci-run from +/root/cc-ci-adv-verify. Drives the cycle MYSELF (does not trust the Builder's single run): + 1. deploy_canonical -> reattach the retained volume; confirm the Builder's known-good marker is served + 2. WC2: write MY OWN marker -> undeploy_keep_volume (assert app DOWN + volume RETAINED) -> + deploy_canonical -> MY marker SURVIVES (data-warm reattach) + 3. WC3: mutate (delete the known-good marker) -> undeploy -> warmsnap.restore -> deploy -> + known-good marker BACK and my marker GONE (restore round-trips the exact known-good) + 4. 
leave it idle (as found): undeploy_keep_volume; content == known-good +""" +import os, subprocess, sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from harness import canonical, lifecycle, warmsnap + +R = "custom-html" +D = "warm-custom-html.ci.commoninternet.net" +HTML = "/usr/share/nginx/html" +KG = "WC2-DATA-MARKER-7f3a9c" # the Builder's known-good marker string +A = "ADV-OWN-MARKER-a1b2c3" # my own marker +fails = [] + +def sh(*cmd): return lifecycle.exec_in_app(D, list(cmd), service="app") +def has_service(): + return bool(subprocess.run(["docker","service","ls","--format","{{.Name}}"], + capture_output=True,text=True).stdout and + any("custom-html" in n for n in subprocess.run(["docker","service","ls","--format","{{.Name}}"], + capture_output=True,text=True).stdout.split())) +def has_volume(): + out = subprocess.run(["docker","volume","ls","--format","{{.Name}}"],capture_output=True,text=True).stdout + return any("warm-custom-html" in n and n.endswith("_content") for n in out.split()) +def serving(): + r = subprocess.run(["curl","-sk","--resolve",f"{D}:443:127.0.0.1","-o","/dev/null", + "-w","%{http_code}","--max-time","10",f"https://{D}/"],capture_output=True,text=True) + return r.stdout.strip() + +print(f"START canonical={canonical.read_registry(R)} has_canonical={canonical.has_canonical(R)} " + f"has_snapshot={warmsnap.has_snapshot(R)} service={has_service()} volume={has_volume()}") +if not warmsnap.has_snapshot(R): fails.append("no snapshot present at start") + +# 1. reattach + confirm Builder's known-good marker survived their run +canonical.deploy_canonical(R); +listing = sh("ls", HTML) +kg_files = sh("grep","-rl",KG,HTML).split() +print(f"[1] deployed; serving={serving()} html={listing.split()} kg_marker_files={kg_files}") +if not kg_files: fails.append("Builder known-good marker not found after reattach") +kg_file = kg_files[0] if kg_files else None +if serving() != "200": fails.append(f"[1] not serving 200: {serving()}") + +# 2. 
WC2: my own marker through undeploy-keep-volume -> redeploy +sh("sh","-c",f"echo {A} > {HTML}/adv_own.txt") +got = sh("cat",f"{HTML}/adv_own.txt").strip() +print(f"[2] wrote my marker: {got!r}") +canonical.undeploy_keep_volume(R) +svc_down, vol_kept = not has_service(), has_volume() +print(f"[2] after undeploy_keep_volume: service_down={svc_down} volume_retained={vol_kept} " + f"registry_status={ (canonical.read_registry(R) or {}).get('status') }") +if not svc_down: fails.append("[2] app still has a service after undeploy_keep_volume") +if not vol_kept: fails.append("[2] content volume NOT retained (data-warm broken)") +canonical.deploy_canonical(R) +survived = sh("cat",f"{HTML}/adv_own.txt").strip() +print(f"[2] after redeploy: my marker={survived!r}") +if survived != A: fails.append(f"[2] my marker did NOT survive data-warm round-trip: {survived!r}") + +# 3. WC3: mutate (delete known-good marker) -> undeploy -> restore -> deploy -> known-good BACK +if kg_file: + sh("rm", "-f", kg_file) + mutated_gone = not sh("grep", "-rl", KG, HTML).split() + print(f"[3] mutated: deleted known-good marker file {kg_file}; gone_now={mutated_gone}") +canonical.undeploy_keep_volume(R) +warmsnap.restore(R, D) +canonical.deploy_canonical(R) +kg_back = bool(sh("grep", "-rl", KG, HTML).split()) +a_present = "adv_own.txt" in sh("ls", HTML).split() +print(f"[3] after restore+deploy: known_good_back={kg_back} my_marker_still_there={a_present}") +if not kg_back: fails.append("[3] known-good marker NOT restored (WC3 restore failed)") +if a_present: fails.append("[3] my marker still present after restore — restore not exact known-good") + +# 4. 
leave idle as found +canonical.undeploy_keep_volume(R) +print(f"END registry={canonical.read_registry(R)} service={has_service()} volume={has_volume()}") +if has_service(): fails.append("[4] left a running service (should be idle)") +if not has_volume(): fails.append("[4] volume not retained at end") + +print("\nRESULT:", "FAIL: "+"; ".join(fails) if fails else + "PASS — WC2 data-warm round-trip (my own marker survives undeploy-keep-volume+reattach) + WC3 " + "restore round-trips the exact known-good; left idle with volume retained") +sys.exit(1 if fails else 0) diff --git a/runner/adv_check_wc3.py b/runner/adv_check_wc3.py new file mode 100644 index 0000000..69acd6d --- /dev/null +++ b/runner/adv_check_wc3.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +"""ADVERSARY WC3 restore round-trip + recover custom-html canonical to known-good idle. +State on entry (left by adv_check_wc2 crash): app UP (warm), known-good marker file deleted, my +marker adv_own.txt present. This driver: restore the known-good snapshot -> known-good marker BACK, +my marker GONE (restore = exact known-good) -> leave idle. 
Also diagnoses HTTPS serving.""" +import os, subprocess, sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from harness import canonical, lifecycle, warmsnap + +R = "custom-html"; D = "warm-custom-html.ci.commoninternet.net"; HTML = "/usr/share/nginx/html" +KG_FILE = "wc2-marker.txt"; KG_STR = "WC2-DATA-MARKER-7f3a9c"; A_FILE = "adv_own.txt" +fails = [] +def sh(*c): return lifecycle.exec_in_app(D, list(c), service="app") +def ls(): return sh("ls", HTML).split() +def has_service(): + out = subprocess.run(["docker","service","ls","--format","{{.Name}}"],capture_output=True,text=True).stdout + return any("custom-html" in n for n in out.split()) +def has_volume(): + out = subprocess.run(["docker","volume","ls","--format","{{.Name}}"],capture_output=True,text=True).stdout + return any("warm-custom-html" in n and n.endswith("_content") for n in out.split()) +def code(path): + return subprocess.run(["curl","-sk","--resolve",f"{D}:443:127.0.0.1","-o","/dev/null","-w","%{http_code}", + "--max-time","10",f"https://{D}{path}"],capture_output=True,text=True).stdout.strip() + +# ensure app is up to inspect entry state (it should be, from the crash) +if not has_service(): + canonical.deploy_canonical(R) +entry = ls() +print(f"ENTRY html={entry} (expect adv_own.txt present, {KG_FILE} deleted)") +# serving diagnosis +print(f"SERVING: /={code('/')} /index.html={code('/index.html')} /{KG_FILE}={code('/'+KG_FILE)} /{A_FILE}={code('/'+A_FILE)}") + +# WC3 restore round-trip +canonical.undeploy_keep_volume(R) +warmsnap.restore(R, D) +canonical.deploy_canonical(R) +after = ls() +kg_back = KG_FILE in after +a_gone = A_FILE not in after +kg_content = sh("cat", f"{HTML}/{KG_FILE}").strip() if kg_back else "" +print(f"AFTER RESTORE html={after} kg_back={kg_back} kg_content={kg_content!r} my_marker_gone={a_gone}") +if not kg_back: fails.append("WC3: known-good marker NOT restored") +if KG_STR not in kg_content: fails.append(f"WC3: restored marker content wrong: 
{kg_content!r}") +if not a_gone: fails.append("WC3: my marker still present — restore not exact known-good") + +# leave idle as found +canonical.undeploy_keep_volume(R) +fin = canonical.read_registry(R) +print(f"END registry_status={fin.get('status')} version={fin.get('version')} service={has_service()} " + f"volume={has_volume()} snapshot={warmsnap.has_snapshot(R)}") +if has_service(): fails.append("END: service still running (should be idle)") +if not has_volume(): fails.append("END: volume not retained") +if fin.get("status") != "idle": fails.append(f"END: status={fin.get('status')} (want idle)") + +print("\nRESULT:", "FAIL: "+"; ".join(fails) if fails else + "PASS — WC3 restore round-trips the EXACT known-good (marker back, content correct, my mutation gone); canonical left idle+retained") +sys.exit(1 if fails else 0) diff --git a/runner/adv_quickfail_verify.py b/runner/adv_quickfail_verify.py new file mode 100644 index 0000000..0e7d584 --- /dev/null +++ b/runner/adv_quickfail_verify.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +"""ADVERSARY: after the --quick FAIL run, independently verify the rollback restored the EXACT +known-good (data + healthy app), the known-good was NOT promoted, then leave idle.""" +import os, subprocess, sys, hashlib +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from harness import canonical, lifecycle, warmsnap + +R="custom-html"; D="warm-custom-html.ci.commoninternet.net"; HTML="/usr/share/nginx/html" +KG_STR="WC2-DATA-MARKER-7f3a9c"; BASE_SNAP_SHA="9ef62bdf11c6060c" +fails=[] +def sh(*c): return lifecycle.exec_in_app(D, list(c), service="app") +def snap_sha(): + import glob + t=glob.glob(f"/var/lib/ci-warm/{R}/snapshot/volumes/*.tar") + return hashlib.sha256(open(t[0],"rb").read()).hexdigest()[:16] if t else "NONE" +def code(p): + return subprocess.run(["curl","-sk","--resolve",f"{D}:443:127.0.0.1","-o","/dev/null","-w","%{http_code}", + 
"--max-time","10",f"https://{D}{p}"],capture_output=True,text=True).stdout.strip() + +reg=canonical.read_registry(R) or {} +ssha=snap_sha() +print(f"registry version={reg.get('version')} status={reg.get('status')} snapshot_sha={ssha} (baseline {BASE_SNAP_SHA})") +if reg.get('version')!="1.11.0+1.29.0": fails.append(f"known-good promoted/changed: {reg.get('version')}") +if reg.get('status')!="idle": fails.append(f"not idle: {reg.get('status')}") +if ssha!=BASE_SNAP_SHA: fails.append(f"snapshot changed: {ssha} != {BASE_SNAP_SHA}") + +# bring canonical up, confirm restored data + healthy + non-broken image +canonical.deploy_canonical(R) +lifecycle.wait_healthy(D, ok_codes=(200,), path="/", deploy_timeout=300, http_timeout=20) +html=sh("ls",HTML).split() +kg_files=[f for f in html if f=="wc2-marker.txt"] +kg_content=sh("cat",f"{HTML}/wc2-marker.txt").strip() if kg_files else "" +img=subprocess.run(["docker","service","ls","--format","{{.Name}} {{.Image}}"],capture_output=True,text=True).stdout +serving=code("/") +print(f"AFTER deploy: html={html} kg_content={kg_content!r} serving/={serving}") +print(f"image: {[l for l in img.splitlines() if 'custom-html' in l]}") +if not kg_files: fails.append("rollback did NOT restore known-good marker file") +if KG_STR not in kg_content: fails.append(f"restored marker content wrong: {kg_content!r}") +if serving!="200": fails.append(f"rolled-back app not serving 200: {serving}") +if "99.99.99-doesnotexist" in img: fails.append("BROKEN image still deployed after rollback") +if "nginx:1.29.0" not in img: fails.append(f"canonical not on known-good image: {img!r}") + +# leave idle +canonical.undeploy_keep_volume(R) +print(f"END status={(canonical.read_registry(R) or {}).get('status')} " + f"service={'custom-html' in subprocess.run(['docker','service','ls','--format','{{.Name}}'],capture_output=True,text=True).stdout}") +print("\nRESULT:", "FAIL: "+"; ".join(fails) if fails else + "PASS — --quick FAIL rolled back to EXACT known-good 
(marker+content restored, app healthy on nginx:1.29.0, broken image gone), known-good UNCHANGED+snapshot byte-identical (never promoted); left idle")
+sys.exit(1 if fails else 0)
diff --git a/runner/adv_recover.py b/runner/adv_recover.py
new file mode 100644
index 0000000..ef4ffa4
--- /dev/null
+++ b/runner/adv_recover.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""Recover warm keycloak to canonical 10.7.1+26.6.2 healthy after adv_check5's cleanup-script bug
+left it undeployed on TYPE=10.7.9 (a tag I deleted). NOT a reconciler defect — my test's fault.
+Steps: recreate 10.7.9 tag (so abra can resolve the current from-version) -> deploy 10.7.9 (kc back
+up, marker present) -> delete marker realm -> deploy canonical 10.7.1 -> set last_good -> drop 10.7.9
+tag -> verify clean. Exits 0 when the final verification passes, 1 otherwise."""
+import os, subprocess, sys
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from harness import sso
+import warm_reconcile as wr
+
+D = "warm-keycloak.ci.commoninternet.net"
+RDIR = os.path.expanduser("~/.abra/recipes/keycloak")
+GOOD = "04400df"; CANON = "10.7.1+26.6.2"; T9 = "10.7.9+26.6.2"; MARKER = "advmarker-rollback"
+
+def git(*a, check=True):
+    """Run git in the keycloak recipe checkout (RDIR) with user.name/user.email set to adv.
+    Returns the CompletedProcess (output captured as text); check=True raises on a non-zero exit."""
+    return subprocess.run(["git", "-C", RDIR, "-c", "user.email=adv@cc-ci", "-c", "user.name=adv", *a],
+                          capture_output=True, text=True, check=check)
+
+def require_healthy(step):
+    """Exit with "kc not healthy on <step>" unless wr.wait_healthy reports keycloak healthy.
+    Not an assert: `python -O` strips asserts, which would skip the wait_healthy call as well."""
+    if not wr.wait_healthy(wr.SPECS["keycloak"]):
+        sys.exit(f"kc not healthy on {step}")
+
+print(f"START TYPE={wr.current_version(D)} health={wr.health_code(wr.SPECS['keycloak'])}")
+git("reset", "--hard", GOOD)
+# Best-effort (check=False): a failure to create the tag does not abort the recovery at this point.
+git("tag", "-a", "-m", "adv", T9, GOOD + "^{commit}", check=False)
+print("recreated 10.7.9 tag; deploying 10.7.9 to bring kc back...")
+wr.deploy_version("keycloak", D, T9, 900)
+require_healthy("10.7.9")
+realms = sorted(sso.list_realms(D))
+print(f"kc healthy on 10.7.9; realms={realms}")
+if MARKER in realms:
+    sso.delete_keycloak_realm(D, MARKER)
+    print("deleted marker realm")
+print("deploying canonical 10.7.1...")
+wr.deploy_version("keycloak", D, CANON, 900)
+require_healthy("canonical")
+wr.write_last_good("keycloak", CANON)
+git("tag", "-d", T9, check=False)
+git("reset", "--hard", GOOD)
+final_realms = sorted(sso.list_realms(D))
+faketags = [t for t in git("tag").stdout.split() if t in (T9, "10.7.10+26.6.2")]
+# Read each value once so the printed summary and the verdict judge the same observation.
+final_type = wr.current_version(D)
+final_last_good = wr.read_last_good("keycloak")
+final_health = wr.health_code(wr.SPECS["keycloak"])
+print(f"DONE TYPE={final_type} last_good={final_last_good} "
+      f"health={final_health} realms={final_realms} faketags={faketags}")
+ok = (final_type == CANON and final_last_good == CANON
+      and final_health == 200 and final_realms == ["master"] and not faketags)
+print("RECOVER:", "OK" if ok else "INCOMPLETE")
+sys.exit(0 if ok else 1)
diff --git a/runner/adv_traefik_rollback.py b/runner/adv_traefik_rollback.py
new file mode 100644
index 0000000..8d4aead
--- /dev/null
+++ b/runner/adv_traefik_rollback.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""ADVERSARY traefik WC1.1 destructive rollback cold proof (LOW TLS risk).
+Stage a fake NEWER traefik tag whose compose fails abra LINT (a mapping env entry → "must be a
+string"), so the broken deploy is REJECTED before the running proxy is touched. The reconciler then
+exercises the STATELESS rollback path: deploy(latest=broken) fails → redeploy last_good 5.1.1+v3.6.15
+(no snapshot — traefik is stateless) → healthy → rollback alert. Asserts traefik stays serving
+(ci.commoninternet.net=200) + keycloak-through-traefik=200 throughout/after, last_good unchanged, a
+*-rollback.json alert. DEFENSIVE: finally always restores traefik to 5.1.1+v3.6.15 healthy + cleans
+the fake tag. Manual recovery if needed: abra app deploy traefik.ci.commoninternet.net 5.1.1+v3.6.15 -o -n -f"""
+import json, os, re, subprocess, sys
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import warm_reconcile as wr
+
+RDIR = os.path.expanduser("~/.abra/recipes/traefik")
+DOMAIN = "traefik.ci.commoninternet.net"; GOOD = "5.1.1+v3.6.15"; FAKE = "5.2.0+v3.6.15"
+ALERTS = os.path.join(wr.warmsnap.DEFAULT_WARM_ROOT, "alerts")
+fails = []
+def git(*a, check=True):
+    """Run git in the traefik recipe checkout (RDIR); return the CompletedProcess (text captured)."""
+    return subprocess.run(["git","-C",RDIR,"-c","user.email=adv@cc-ci","-c","user.name=adv",*a],
+                          capture_output=True, text=True, check=check)
+def routed(host="ci.commoninternet.net", path="/"):
+    """Return curl's HTTP status code (a string) for https://host/path with host:443 pinned to 127.0.0.1."""
+    return subprocess.run(["curl","-sk","--resolve",f"{host}:443:127.0.0.1","-o","/dev/null",
+        "-w","%{http_code}","--max-time","10",f"https://{host}{path}"],capture_output=True,text=True).stdout.strip()
+def reconcile():
+    """Run warm_reconcile.py for traefik (CCCI_SKIP_FETCH=1); return the text after "RECONCILE RESULT:".
+    Echoes the tail of the child's output. With no result line, returns a diagnostic naming the exit code."""
+    env={**os.environ,"CCCI_SKIP_FETCH":"1"}
+    r=subprocess.run(["python3",os.path.join(os.path.dirname(__file__),"warm_reconcile.py"),"traefik"],
+        capture_output=True,text=True,env=env,timeout=1200)
+    print(r.stdout[-2000:]); print(r.stderr[-500:],file=sys.stderr)
+    for line in r.stdout.splitlines():
+        if line.startswith("RECONCILE RESULT:"): return line.split(":",1)[1].strip()
+    return f"no RECONCILE RESULT line (rc={r.returncode})"
+
+orig_head = git("rev-parse","HEAD").stdout.strip()
+print(f"START traefik TYPE={wr.current_version(DOMAIN)} last_good={wr.read_last_good('traefik')} "
+      f"ci={routed()} kc-through={routed('warm-keycloak.ci.commoninternet.net','/realms/master')} orig_head={orig_head[:8]}")
+try:
+    # stage fake NEWER tag with a lint-breaking env (mapping → not a string)
+    git("checkout","-fq",GOOD)
+    cf=os.path.join(RDIR,"compose.yml")
+    with open(cf) as f: txt=f.read()
+    # insert a mapping item ahead of DASHBOARD_ENABLED in the first matching 'environment:' list;
+    # the regex reuses whatever indentation the recipe has instead of hard-coding it
+    txt,hits=re.subn(r"(?m)^([ \t]*)environment:[ \t]*\n([ \t]*)- DASHBOARD_ENABLED",
+        r"\g<1>environment:\n\g<2>- {advbad: brokenmapping}\n\g<2>- DASHBOARD_ENABLED",txt,count=1)
+    if not hits: raise RuntimeError(f"{cf}: no 'environment:' list starting with DASHBOARD_ENABLED; nothing to break")
+    with open(cf,"w") as f: f.write(txt)
+    git("commit","-aqm","adv: lint-breaking env for traefik rollback proof")
+    broken=git("rev-parse","HEAD").stdout.strip()
+    git("tag","-a","-m","adv",FAKE,broken)
+    git("checkout","-fq",orig_head) # leave working tree on the good HEAD; tag keeps broken commit
+    print(f"staged fake {FAKE}@{broken[:8]} (lint-breaking); reconcile (expect rollback->{GOOD})...")
+    a0=set(os.listdir(ALERTS)) if os.path.isdir(ALERTS) else set()
+    res=reconcile()
+    new=sorted((set(os.listdir(ALERTS)) if os.path.isdir(ALERTS) else set())-a0)
+    ci, kc = routed(), routed("warm-keycloak.ci.commoninternet.net","/realms/master")
+    print(f"RESULT={res!r} TYPE={wr.current_version(DOMAIN)} last_good={wr.read_last_good('traefik')} ci={ci} kc-through={kc} new_alerts={new}")
+    if not res.startswith("rolled-back:"): fails.append(f"not rolled-back: {res}")
+    if wr.read_last_good("traefik")!=GOOD: fails.append(f"last_good changed: {wr.read_last_good('traefik')}")
+    if ci!="200": fails.append(f"traefik not serving after rollback: ci={ci}")
+    if kc!="200": fails.append(f"keycloak-through-traefik not 200: {kc}")
+    rb=[a for a in new if "rollback" in a]
+    if not rb: fails.append("no rollback alert")
+    else:
+        with open(os.path.join(ALERTS,rb[0])) as f: rec=json.load(f)
+        print(f"rollback alert: {rec}")
+        if rec.get("attempted")!=FAKE: fails.append(f"alert attempted={rec.get('attempted')}")
+        if rec.get("last_good")!=GOOD: fails.append(f"alert last_good={rec.get('last_good')}")
+        if rec.get("recovered") is not True: fails.append(f"alert recovered={rec.get('recovered')}")
+finally:
+    # DEFENSIVE recovery: delete fake tag, restore recipe HEAD, ensure traefik on GOOD + healthy.
+    # Both git calls use check=False so a git failure cannot raise past the redeploy below.
+    git("tag","-d",FAKE,check=False)
+    co=git("checkout","-fq",orig_head,check=False)
+    if co.returncode:
+        print(f"!! could not restore recipe HEAD {orig_head[:8]}: {co.stderr.strip()}", flush=True)
+        fails.append("recipe HEAD not restored")
+    if wr.current_version(DOMAIN)!=GOOD or routed()!="200":
+        print("!! defensive recovery: redeploying traefik GOOD", flush=True)
+        try: wr.deploy_version("traefik",DOMAIN,GOOD,600); wr.wait_healthy(wr.SPECS["traefik"])
+        except Exception as e: print(f"!! recovery deploy error: {e}")
+    fin_ci=routed(); fin_kc=routed("warm-keycloak.ci.commoninternet.net","/realms/master")
+    fake_left=[t for t in git("tag").stdout.split() if t==FAKE]
+    print(f"END TYPE={wr.current_version(DOMAIN)} last_good={wr.read_last_good('traefik')} ci={fin_ci} kc-through={fin_kc} fake_tag_left={fake_left}")
+    if fin_ci!="200": fails.append(f"FINAL traefik not serving: {fin_ci}")
+    if fake_left: fails.append("fake tag not cleaned")
+print("\nRESULT:", "FAIL: "+"; ".join(fails) if fails else
+    "PASS — traefik WC1.1 stateless rollback: broken-latest deploy rejected → rolled back to last_good 5.1.1+v3.6.15, traefik+routes healthy (no TLS outage), alert written, cert/config preserved")
+sys.exit(1 if fails else 0)