Patch summary (commentary only; it sits before the first "diff --git" header, which patch tools
treat as leading text rather than as part of a hunk):
  * tests/discourse/functional/_discourse.py, mint_admin: the Rails bootstrap command is now run
    through a non-login shell ("bash -c" instead of "bash -lc") and ruby is invoked explicitly,
    located with `command -v ruby` and falling back to /opt/bitnami/ruby/bin/ruby.
  * tests/discourse/recipe_meta.py: adds BACKUP_VERIFY(domain). It runs `gzip -t` followed by
    `wc -c` on /var/lib/postgresql/data/backup.sql in the "db" service and returns True only when
    the captured output is a positive integer; any exception raised by the exec returns False.
NOTE(review): the line layout and indentation below were restored from a whitespace-collapsed copy
of this patch; confirm context-line whitespace against the target files before applying.

diff --git a/tests/discourse/functional/_discourse.py b/tests/discourse/functional/_discourse.py
index 0e50c25..7e0ca2a 100644
--- a/tests/discourse/functional/_discourse.py
+++ b/tests/discourse/functional/_discourse.py
@@ -40,11 +40,16 @@ _BOOTSTRAP_RB = (
 
 def mint_admin(domain: str) -> tuple[str, str]:
     """Bootstrap an admin + fresh API key via Rails in the app container. Returns (api_key, username)."""
+    # `bin/rails` is `#!/usr/bin/env ruby`; the bitnami discourse image keeps ruby at
+    # /opt/bitnami/ruby/bin, which is NOT on a login shell's PATH (`bash -lc` resets PATH from
+    # /etc/profile → `env: 'ruby': No such file or directory`, rc=127). Use a non-login shell, discover
+    # ruby (image-ENV PATH first, bitnami fallback), and invoke it explicitly so the shebang is moot.
     cmd = (
         "cd /opt/bitnami/discourse && "
-        f"RAILS_ENV=production bin/rails runner \"{_BOOTSTRAP_RB}\""
+        "RUBY=$(command -v ruby || echo /opt/bitnami/ruby/bin/ruby) && "
+        f"RAILS_ENV=production \"$RUBY\" bin/rails runner \"{_BOOTSTRAP_RB}\""
     )
-    out = lifecycle.exec_in_app(domain, ["bash", "-lc", cmd], service="app", timeout=240)
+    out = lifecycle.exec_in_app(domain, ["bash", "-c", cmd], service="app", timeout=240)
     key = user = None
     for line in out.splitlines():
         line = line.strip()
diff --git a/tests/discourse/recipe_meta.py b/tests/discourse/recipe_meta.py
index bfe2280..f316cd8 100644
--- a/tests/discourse/recipe_meta.py
+++ b/tests/discourse/recipe_meta.py
@@ -36,3 +36,32 @@ EXTRA_ENV = {
     "TIMEOUT": "2400",
     "COMPOSE_FILE": "compose.yml:compose.ccci.yml",
 }
+
+
+def BACKUP_VERIFY(domain):
+    """Post-backup integrity check (Q4.6, same race ghost F2-14b hit). The recipe's backupbot db
+    pre-hook (`/pg_backup.sh backup`) dumps the discourse postgres DB to `/var/lib/postgresql/data/
+    backup.sql` (gzip), then restic captures that path. On the loaded single CI node the db container
+    is cycled by the immediately-preceding UPGRADE tier (chaos redeploy), and at backup time the
+    pre-hook's pg_dump can race that cycle — the dump is truncated/never written, restic snapshots an
+    empty/absent path, and a later restore reimports nothing → the seeded ci_marker is lost (P4 RED;
+    observed full1/full2 WITH upgrade, vs full3 WITHOUT upgrade green). Proven first-hand: the pre-hook
+    itself succeeds on a stable db (manual exec → valid 922KB dump), so the failure is the cycle race,
+    not the script. This probe proves the dump completed: backup.sql exists, is a VALID gzip, non-empty.
+    False → the harness re-runs the WHOLE backup with a re-stabilised db (run_recipe_ci _perform_op,
+    caps at 3 then proceeds — a persistent failure still surfaces RED at restore, so it weakens no
+    assertion; it only retries a flaky CAPTURE). READ-ONLY."""
+    # recipe_meta.py is exec()'d into a bare namespace (no __file__); runner/ is already on sys.path
+    # and `harness` importable — import directly (ghost F2-14b shipped broken by computing a path here).
+    from harness import lifecycle
+
+    try:
+        out = lifecycle.exec_in_app(
+            domain,
+            ["sh", "-c", "gzip -t /var/lib/postgresql/data/backup.sql && wc -c < /var/lib/postgresql/data/backup.sql"],
+            service="db",
+            timeout=60,
+        ).strip()
+    except Exception:  # noqa: BLE001 — exec fails if the db is mid-cycle: treat as not-yet-captured
+        return False
+    return out.isdigit() and int(out) > 0