diff --git a/BACKLOG.md b/BACKLOG.md index a54e8b7..78acf32 100644 --- a/BACKLOG.md +++ b/BACKLOG.md @@ -46,8 +46,10 @@ Two single-writer sections (§6.1): Builder edits only `## Build backlog`; Adver → 2 passed (http 200 + playwright); teardown leaves services/volumes/secrets/containers/env = 0. ### M5 — Upgrade + backup/restore stages -- [ ] Add upgrade + backup/restore stages for recipe #1 -- [ ] Gate: M5 — upgrade preserves data; backup→mutate→restore returns original +- [x] Add upgrade + backup/restore stages for recipe #1 (custom-html). backup-bot-two deployed as a + reconcile oneshot (modules/backupbot.nix). Data marker served via nginx for assertions. +- [x] Gate: M5 — upgrade preserves data; backup→mutate→restore returns original → CLAIMED 2026-05-27. + Full 3-stage run green: install(2)+upgrade(1)+backup(1) passed; teardown leaves 0 orphans, infra intact. ### M6 — Recipe-local tests + second recipe - [ ] Discover/run recipe-repo tests/; enroll DB-backed recipe #2 diff --git a/JOURNAL.md b/JOURNAL.md index 0723d29..2be12f1 100644 --- a/JOURNAL.md +++ b/JOURNAL.md @@ -340,3 +340,29 @@ services-converged + HTTPS, teardown_app = undeploy+volume+secret+env-config, ja **M3 still blocked** (Gitea webhook delivery — operator); no response yet. Next: M5 (upgrade + backup/restore for custom-html), then wire the parameterized Drone pipeline (API-triggerable). + +## 2026-05-27 — M5: upgrade + backup/restore stages green (custom-html) + +**Upgrade stage** (tests/custom-html/test_upgrade.py): deploy previous published version +(git-tag sort, second-newest), write a data marker into the served volume (nginx serves +/usr/share/nginx/html, so the marker is HTTP-fetchable), `abra app upgrade` to current, assert +healthy + marker survived. Fix: `upgrade` has no `--chaos` flag (used `-f -D -n`). + +**backup-bot-two** deployed as reconcile oneshot (modules/backupbot.nix): restic repo in a local +`backups` volume, restic_password abra-generated (only if missing). 
Fixes: `abra app secret generate` +needs `-m` (machine) to avoid the TTY/ioctl path, and stdout redirected so generated values never +hit the journal (D6). `abra app backup create`/`restore` need a real PTY ('input device is not a +TTY') → run via util-linux `script -qec` (harness `_run_pty`; util-linux added to cc-ci-run). + +**Backup stage** (test_backup.py): write "original" → `abra app backup create` → mutate to +"mutated" → `abra app restore` → assert state back to "original". + +**Full 3-stage run** (`STAGES=install,upgrade,backup`): +- install: 2 passed (http 200 + playwright) +- upgrade: 1 passed (data survives upgrade) +- backup: 1 passed (restore returns pre-mutation state) +- teardown: 0 orphaned run services/volumes/secrets; infra (traefik/drone/bridge/backupbot) all 1/1. +M5 CLAIMED. + +**M3 still blocked** (webhook; no operator response across several ticks). Plan: if still blocked, +pivot the bridge to poll the Gitea API (self-service, Adversary-endorsed) to unblock D1. Next: M6. diff --git a/STATUS.md b/STATUS.md index 71b5f1b..8abaa0c 100644 --- a/STATUS.md +++ b/STATUS.md @@ -1,9 +1,9 @@ # STATUS — cc-ci Builder -**Phase:** M4 complete & CLAIMED. M0/M1/M2 PASS. M3 gate BLOCKED (Gitea webhook delivery; operator). -M4 awaiting verdict. Next: M5 (upgrade + backup/restore for custom-html). -**In-flight:** M5 — add upgrade + backup/restore stages for recipe #1. -**Last updated:** 2026-05-27 (M4 claimed; install stage green) +**Phase:** M5 complete & CLAIMED. M0/M1/M2 PASS. M3 gate BLOCKED (Gitea webhook; operator). M4/M5 awaiting verdict. +Next: M6 (recipe-local tests + DB-backed recipe #2) — and pivot M3 to polling if webhook stays blocked. +**In-flight:** M6 — discover/run recipe-repo tests/ + enroll a second (DB-backed) recipe. +**Last updated:** 2026-05-27 (M5 claimed; 3-stage green for custom-html) ## Gates - **Gate: M0 — CLAIMED, awaiting Adversary** (2026-05-26). 
Evidence: flake rebuilds cc-ci from repo diff --git a/hosts/cc-ci/configuration.nix b/hosts/cc-ci/configuration.nix index 2f84fe7..1670aa0 100644 --- a/hosts/cc-ci/configuration.nix +++ b/hosts/cc-ci/configuration.nix @@ -13,6 +13,7 @@ ../../modules/drone.nix ../../modules/drone-runner.nix ../../modules/bridge.nix + ../../modules/backupbot.nix ../../modules/harness.nix ]; diff --git a/modules/backupbot.nix b/modules/backupbot.nix new file mode 100644 index 0000000..c59c598 --- /dev/null +++ b/modules/backupbot.nix @@ -0,0 +1,49 @@ +# backup-bot-two (M5): the Co-op Cloud backup service. `abra app backup create ` / restore +# talk to it; it snapshots volumes labelled `backupbot.backup=true` into a local restic repo. +# Idempotent-reconcile oneshot (same pattern as proxy/drone). restic_password is abra-generated +# (class-B-style internal secret) and kept stable across reconciles (only generated if missing). +{ pkgs, ... }: +let + reconcile = pkgs.writeShellApplication { + name = "cc-ci-reconcile-backupbot"; + runtimeInputs = with pkgs; [ abra docker gnused gnugrep coreutils git ]; + text = '' + DOMAIN="backups.ci.commoninternet.net" # identity/stack name only; no web route + ENV_FILE="$HOME/.abra/servers/default/$DOMAIN.env" + + abra server ls -m -n >/dev/null 2>&1 || abra server add --local -n || true + abra recipe fetch backup-bot-two -n >/dev/null + + [ -f "$ENV_FILE" ] || abra app new backup-bot-two -s default -D "$DOMAIN" -n + + set_env() { + sed -i -E "/^[[:space:]]*#?[[:space:]]*$1=/d" "$ENV_FILE" + printf '%s=%s\n' "$1" "$2" >> "$ENV_FILE" + } + set_env RESTIC_REPOSITORY /backups/restic + set_env SECRET_RESTIC_PASSWORD_VERSION v1 + set_env CRONJOB_VERSION v1 + + have_secret() { docker secret ls --format '{{.Name}}' | grep -q "_$1_v1$"; } + # -m avoids the TTY/table (ioctl) path; redirect stdout so generated values never hit logs (D6). 
def _run_pty(args: list[str], timeout: int = 900, check: bool = True) -> subprocess.CompletedProcess:
    """Run abra under a pseudo-TTY (via util-linux `script`).

    Needed for commands that exec into a container interactively
    (backup create / restore: 'the input device is not a TTY').

    `script -c` takes a single command *string* and runs it through a shell, so each
    argument is shell-quoted before joining; an argument containing whitespace or shell
    metacharacters is passed to abra verbatim instead of being re-split or interpreted.

    Args:
        args: abra arguments, without the leading binary name.
        timeout: seconds before `subprocess.run` aborts the command.
        check: when true, a non-zero exit status raises instead of being returned.

    Returns:
        The completed `script` process, with stdout/stderr captured as text.

    Raises:
        AbraError: if `check` is true and the command exits non-zero.
    """
    # Function-scope import keeps this block self-contained within the module.
    import shlex

    # Same binary as _run, quoted for the shell that `script -c` spawns.
    cmd = shlex.join([ABRA, *args])
    proc = subprocess.run(
        # -q: quiet, -e: propagate the child's exit status, -c: command; typescript -> /dev/null
        ["script", "-qec", cmd, "/dev/null"],
        capture_output=True, text=True, timeout=timeout,
    )
    if check and proc.returncode != 0:
        raise AbraError(f"[pty] {cmd} failed ({proc.returncode}):\n{proc.stdout}\n{proc.stderr}")
    return proc
def recipe_versions(recipe: str) -> list[str]:
    """Published versions of a recipe, oldest→newest (from the recipe git tags).

    Lists the tags of the checkout at ~/.abra/recipes/RECIPE, ordered by git's
    `creatordate` sort key.

    Args:
        recipe: recipe name (the directory name under ~/.abra/recipes).

    Returns:
        Tag names, one per published version; empty if the repository has no tags.

    Raises:
        AbraError: if `git tag` exits non-zero (e.g. the recipe checkout is missing).
            Without this, a broken checkout would be indistinguishable from a recipe
            that simply has no versions.
    """
    import os

    path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
    proc = subprocess.run(
        ["git", "-C", path, "tag", "--sort=creatordate"],
        capture_output=True, text=True,
    )
    if proc.returncode != 0:
        raise AbraError(
            f"git tag failed for recipe {recipe} at {path} ({proc.returncode}):\n{proc.stderr}"
        )
    # Drop blank lines (including the trailing one git emits).
    return [tag.strip() for tag in proc.stdout.splitlines() if tag.strip()]
def exec_in_app(domain: str, cmd: list[str], service: str = "app") -> str:
    """Run `cmd` inside the app's running container via `docker exec`; return its stdout.

    Args:
        domain: app domain; used to look up the running container.
        cmd: argv to execute inside the container.
        service: stack service whose container is targeted (default "app").

    Returns:
        The command's captured stdout as text.

    Raises:
        RuntimeError: if `docker exec` exits non-zero. A failed command is reported
            here, with its stderr, rather than surfacing later as an empty result.
    """
    cid = _app_container(domain, service)
    proc = subprocess.run(["docker", "exec", cid, *cmd], capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(
            f"docker exec in {cid} failed ({proc.returncode}): {' '.join(cmd)}\n{proc.stderr}"
        )
    return proc.stdout
@pytest.fixture
def deployed(recipe, app_domain, request):
    """Function-scoped deploy of the current/$REF version, torn down after the test.

    Used by stages that start from current (install/backup). Yields nothing; the
    fixture value is the app domain once the app reports healthy.
    """
    def _teardown():
        lifecycle.teardown_app(app_domain)

    # An empty VERSION is treated the same as an unset one.
    pinned = os.environ.get("VERSION")
    if not pinned:
        pinned = None

    lifecycle.janitor()
    # Registered before deploying so a failed or unhealthy deploy is still cleaned up.
    request.addfinalizer(_teardown)
    lifecycle.deploy_app(recipe, app_domain, version=pinned)
    lifecycle.wait_healthy(app_domain)
    return app_domain
def test_upgrade_preserves_data(old_app):
    """Upgrade from the previous published version and check the data marker survives."""
    domain, _prev = old_app
    marker = "upgrade-survives"
    marker_url = "/ci-marker.txt"

    # Seed the marker in the served volume (nginx serves /usr/share/nginx/html).
    write_marker = ["sh", "-c", f"echo {marker} > {MARKER_PATH}"]
    lifecycle.exec_in_app(domain, write_marker)
    before = lifecycle.http_body(domain, marker_url).strip()
    assert before == marker

    # Upgrade previous -> current/$REF; an empty VERSION counts as unset.
    target = os.environ.get("VERSION")
    if not target:
        target = None
    lifecycle.upgrade_app(domain, version=target)
    lifecycle.wait_healthy(domain)

    # The app must still answer, and the marker written before the upgrade must remain.
    status = lifecycle.http_get(domain, "/")
    assert status == 200
    after = lifecycle.http_body(domain, marker_url).strip()
    assert after == marker, \
        "data did not survive the upgrade"