fix(2): discourse — mint_admin ruby PATH (bash -c + discover) + BACKUP_VERIFY for post-upgrade backup race
This commit is contained in:
@@ -36,3 +36,32 @@ EXTRA_ENV = {
|
||||
"TIMEOUT": "2400",
|
||||
"COMPOSE_FILE": "compose.yml:compose.ccci.yml",
|
||||
}
|
||||
|
||||
|
||||
def BACKUP_VERIFY(domain):
    """Return True when the db pre-hook's dump was captured intact (READ-ONLY probe).

    Post-backup integrity check (Q4.6; the same race ghost F2-14b hit).

    Background: the recipe's backupbot db pre-hook (`/pg_backup.sh backup`) dumps the
    discourse postgres DB to `/var/lib/postgresql/data/backup.sql` (gzip), and restic then
    captures that path. On the loaded single CI node the db container is cycled by the
    immediately-preceding UPGRADE tier (chaos redeploy); at backup time the pre-hook's
    pg_dump can race that cycle. The dump is then truncated or never written, restic
    snapshots an empty/absent path, and a later restore reimports nothing, so the seeded
    ci_marker is lost (P4 RED; observed on full1/full2 WITH upgrade, while full3 WITHOUT
    upgrade was green). Proven first-hand: the pre-hook itself succeeds on a stable db
    (manual exec gave a valid 922KB dump), so the failure is the cycle race, not the script.

    What is checked: backup.sql exists, is a VALID gzip, and is non-empty.

    A False result makes the harness re-run the WHOLE backup with a re-stabilised db
    (run_recipe_ci _perform_op, capped at 3 attempts, after which it proceeds). A persistent
    failure still surfaces RED at restore, so this weakens no assertion; it only retries a
    flaky CAPTURE.
    """
    # recipe_meta.py is exec()'d into a bare namespace (no __file__); runner/ is already on
    # sys.path and `harness` is importable, so import it directly (ghost F2-14b shipped
    # broken by computing a path here).
    from harness import lifecycle

    # `gzip -t` validates the archive; `wc -c` only runs (and prints the byte count) if it passed.
    probe_cmd = [
        "sh",
        "-c",
        "gzip -t /var/lib/postgresql/data/backup.sql && wc -c < /var/lib/postgresql/data/backup.sql",
    ]

    try:
        reported = lifecycle.exec_in_app(domain, probe_cmd, service="db", timeout=60)
        # Kept inside the try on purpose: an unexpected (non-string) result also counts as a miss.
        size_text = reported.strip()
    except Exception:  # noqa: BLE001 — exec fails if the db is mid-cycle: treat as not-yet-captured
        return False

    # Anything other than a bare byte count means the probe did not produce a usable answer.
    if not size_text.isdigit():
        return False
    return int(size_text) > 0
|
||||
|
||||
Reference in New Issue
Block a user