diff --git a/abra.sh b/abra.sh
index 1c66304..def0c38 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1 +1,2 @@
 export DB_ENTRYPOINT_VERSION=v1
+export PG_BACKUP_VERSION=v1
diff --git a/compose.yml b/compose.yml
index 142fd71..b2e7473 100644
--- a/compose.yml
+++ b/compose.yml
@@ -49,7 +49,7 @@ services:
       interval: 30s
       timeout: 10s
       retries: 6
-      start_period: 5m
+      start_period: 20m
 
   db:
     image: postgres:16
@@ -63,6 +63,9 @@ services:
       - source: db_entrypoint
         target: /docker-entrypoint.sh
         mode: 0555
+      - source: pg_backup
+        target: /pg_backup.sh
+        mode: 0555
     entrypoint: /docker-entrypoint.sh
     environment:
       - POSTGRES_HOST_AUTH_METHOD=trust
@@ -72,9 +75,9 @@ services:
     deploy:
       labels:
         backupbot.backup: "true"
-        backupbot.backup.pre-hook: "bash -c 'PGPASSWORD=$$(cat $${POSTGRES_PASSWORD_FILE}) pg_dump -U $${POSTGRES_USER} $${POSTGRES_DB} > /tmp/backup.sql'"
-        backupbot.backup.post-hook: "rm -rf /tmp/backup.sql"
-        backupbot.backup.path: "/tmp/backup.sql"
+        backupbot.backup.pre-hook: "/pg_backup.sh backup"
+        backupbot.backup.volumes.postgresql_data.path: "backup.sql"
+        backupbot.restore.post-hook: "/pg_backup.sh restore"
 
   redis:
     image: redis:7.4-alpine
@@ -132,3 +135,6 @@ configs:
     name: ${STACK_NAME}_db_entrypoint_${DB_ENTRYPOINT_VERSION}
     file: entrypoint.postgres.sh.tmpl
     template_driver: golang
+  pg_backup:
+    name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION}
+    file: pg_backup.sh
diff --git a/pg_backup.sh b/pg_backup.sh
new file mode 100755
index 0000000..5a2e6d8
--- /dev/null
+++ b/pg_backup.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Postgres backup/restore hook for the discourse `db` service. Invoked by backupbot-two via:
+#   backupbot.backup.pre-hook = "/pg_backup.sh backup"
+#   backupbot.backup.volumes.postgresql_data.path = "backup.sql"
+#   backupbot.restore.post-hook = "/pg_backup.sh restore"
+# Backup dumps the DB to backup.sql (gzip) inside the postgresql_data volume; backupbot archives it.
+# Restore reimports it.
+#
+# Discourse (the rails app + sidekiq) keeps many TCP connections open to the DB and reconnects
+# within milliseconds, so a one-shot pg_terminate_backend is NOT enough: restore must first block
+# all non-local connections at the pg_hba level (so the app cannot reconnect and interfere
+# mid-reimport), then FORCE-drop, recreate, and deterministically reimport the dump, then restore
+# pg_hba. (Mirrors the proven matrix-synapse restore hook.)
+#
+# The previous recipe shipped a pg_dump backup but NO restore hook — a file-level restore did not
+# reload into the running postgres, so a restored backup silently kept the live (un-restored)
+# state. cc-ci caught this: a seeded ci_marker row was gone after restore. Same pattern as the
+# immich / mattermost-lts / ghost recipe-PRs.
+
+# -e aborts on the first failing command; pipefail makes a failing pg_dump/gunzip fail its whole
+# pipeline instead of being masked by the exit status of gzip/psql.
+set -eo pipefail
+
+BACKUP_FILE='/var/lib/postgresql/data/backup.sql'
+# NOTE: `export VAR=$(cmd)` reports export's status, so an unreadable secret file leaves
+# PGPASSWORD empty instead of aborting the script.
+export PGPASSWORD=$(cat "${POSTGRES_PASSWORD_FILE:-/run/secrets/db_password}")
+DB_USER="${POSTGRES_USER:-discourse}"
+DB_NAME="${POSTGRES_DB:-discourse}"
+
+# Dump $DB_NAME as gzip-compressed SQL into $BACKUP_FILE.
+function backup {
+  # Write to a temp file and rename it into place, so a failed dump can never replace an
+  # existing backup.sql with a truncated archive.
+  if ! pg_dump -U "$DB_USER" "$DB_NAME" | gzip > "${BACKUP_FILE}.tmp"; then
+    rm -f "${BACKUP_FILE}.tmp"
+    echo "pg_backup: pg_dump of ${DB_NAME} failed" >&2
+    return 1
+  fi
+  mv -f "${BACKUP_FILE}.tmp" "$BACKUP_FILE"
+}
+
+# Drop $DB_NAME and re-create it from $BACKUP_FILE, with remote clients locked out meanwhile.
+function restore {
+  cd /var/lib/postgresql/data/
+
+  # Validate the archive BEFORE anything destructive: a missing or corrupt dump must not cost
+  # us the live database.
+  if ! gunzip -t "$BACKUP_FILE"; then
+    echo "pg_backup: ${BACKUP_FILE} is missing or not a valid gzip archive" >&2
+    return 1
+  fi
+
+  # Block all non-local connections so the running discourse app + sidekiq cannot reconnect and
+  # interfere with the drop/recreate/reimport (a one-shot pg_terminate_backend is not enough — the
+  # app reconnects within ms over TCP). Restored on exit.
+  restore_hba() {
+    # Idempotent: nothing to do once the saved copy has been put back.
+    [[ -f pg_hba.conf.bak ]] || return 0
+    # cat (rather than mv) keeps the owner and mode of the existing pg_hba.conf.
+    cat pg_hba.conf.bak > pg_hba.conf
+    rm -f pg_hba.conf.bak
+    su postgres -c 'pg_ctl reload'
+  }
+  # A leftover .bak means an earlier restore was killed before cleanup; it holds the real
+  # config, so never overwrite it with the lock-down file.
+  [[ -f pg_hba.conf.bak ]] || cp pg_hba.conf pg_hba.conf.bak
+  # Arm the cleanup before pg_hba.conf is touched so a failed reload cannot leave clients locked
+  # out. INT/TERM exit explicitly: a bare handler would resume the restore after cleanup.
+  trap restore_hba EXIT
+  trap 'exit 130' INT
+  trap 'exit 143' TERM
+  echo 'local all all trust' > pg_hba.conf
+  su postgres -c 'pg_ctl reload'
+
+  # Terminate lingering local sessions, then FORCE-drop + recreate + deterministic reimport.
+  psql -U "$DB_USER" -d postgres -c \
+    "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='${DB_NAME}' AND pid<>pg_backend_pid();"
+  # dropdb quotes the name for us; --if-exists keeps restore re-runnable after an aborted attempt.
+  dropdb --force --if-exists -U "$DB_USER" "$DB_NAME"
+  createdb -U "$DB_USER" "$DB_NAME"
+  gunzip -c "$BACKUP_FILE" | psql -U "$DB_USER" -d "$DB_NAME" -1 -v ON_ERROR_STOP=1 -f -
+}
+
+# Run the requested sub-command (backup | restore); quoted so arguments are passed through intact.
+"$@"