diff --git a/pg_backup.sh b/pg_backup.sh
index 0e0ad26..71a3221 100755
--- a/pg_backup.sh
+++ b/pg_backup.sh
@@ -6,21 +6,32 @@
 # backupbot.backup.volumes.postgres.path = "backup.sql"
 # backupbot.restore.post-hook = "/pg_backup.sh restore"
 # Backup dumps the immich DB to backup.sql (gzip) inside the postgres volume; backupbot archives that
-# file. Restore reads it back and reimports.
+# file. Restore reads it back and reimports it.
 #
-# The immich DB runs the VectorChord/pgvecto.rs extensions. A plain pg_dump emits
-# SELECT pg_catalog.set_config('search_path', '', false);
-# near the top; on reimport that empty search_path leaves the vector/vchord types + operator classes
-# unresolvable, so the first statement that references them errors. immich's official restore
-# (docs.immich.app/administration/backup-and-restore) rewrites that line to put `public, pg_catalog`
-# back on the search_path BEFORE the import. Without that rewrite a single-transaction import aborts
-# wholesale and nothing is restored (the bug in the original ci/pg-backup attempt). We keep
-# --single-transaction --set ON_ERROR_STOP=on (immich upstream) so a genuine failure is loud rather
-# than silently leaving a half-restored DB.
+# Two things make the immich DB tricky to restore, and BOTH are handled below:
+#
+# 1. VectorChord/pgvecto.rs search_path. A plain pg_dump emits
+#      SELECT pg_catalog.set_config('search_path', '', false);
+#    near the top; on reimport that empty search_path leaves the vector/vchord types + operator
+#    classes unresolvable, so the first statement that references them errors. immich's official
+#    restore (docs.immich.app/administration/backup-and-restore) rewrites that line to put
+#    `public, pg_catalog` back on the search_path BEFORE the import. We do the same.
+#
+# 2. The app races the restore. immich-server keeps a TCP connection pool to the DB and reconnects
+#    within milliseconds of being dropped, re-running its OWN startup migrations. If it does that
+#    while our single-transaction import is running, the two conflict (duplicate CREATE/relation
+#    errors) and ON_ERROR_STOP aborts our transaction — the import rolls back and nothing is
+#    restored, while the app finishes its migration and looks "healthy" on an empty DB. A one-shot
+#    pg_terminate_backend does NOT prevent the reconnect. So, like the (green) matrix-synapse hook,
+#    we replace pg_hba.conf with a LOCAL-trust-only policy and reload: postgres then REJECTS every
+#    TCP connection, locking the app out for the duration of the restore. We restore pg_hba (via an
+#    EXIT trap, so it always runs) once the import is done, and the app reconnects to the fully
+#    restored DB. Our own psql/createdb use the local socket (trust), so they are unaffected.
 
 set -e
 
 BACKUP_FILE='/var/lib/postgresql/data/backup.sql'
+HBA='/var/lib/postgresql/data/pg_hba.conf'
 export PGPASSWORD=$(cat "${POSTGRES_PASSWORD_FILE:-/run/secrets/db_password}")
 DB_USER="${POSTGRES_USER:-postgres}"
 DB_NAME="${POSTGRES_DB:-immich}"
@@ -30,17 +41,52 @@ function backup {
 }
 
 function restore {
-  # immich-server holds TCP connections to the DB; terminate them so DROP DATABASE can proceed
-  # (the matrix-synapse pg_hba "local trust" trick does not cover networked connections).
+  # Put the saved pg_hba.conf back and reload postgres. Does nothing when no saved copy exists,
+  # so calling it twice (explicitly below and again from the EXIT trap) is harmless.
+  restore_hba() {
+    if [ -f "${HBA}.ccci.bak" ]; then
+      cat "${HBA}.ccci.bak" > "$HBA"
+      rm -f "${HBA}.ccci.bak"
+      # Best effort: a failed reload must not mask the exit status of the restore itself.
+      psql -U "$DB_USER" -d postgres -c "SELECT pg_reload_conf();" >/dev/null 2>&1 || true
+    fi
+  }
+  # EXIT covers every exit path, including set -e aborts. INT/TERM become a real exit, so the
+  # script stops (and the EXIT trap cleans up) instead of carrying on with the lock released.
+  trap restore_hba EXIT
+  trap 'exit 130' INT
+  trap 'exit 143' TERM
+
+  # Fail the import when ANY stage of the gunzip | sed | psql pipeline fails, not just psql.
+  set -o pipefail
+
+  # Pre-flight: verify the archive BEFORE anything destructive, so a missing or corrupt backup
+  # aborts here (set -e) while the live DB is still intact.
+  gunzip -t "$BACKUP_FILE"
+
+  # Lock the networked immich-server OUT for the duration of the restore (see header note 2):
+  # local-trust-only pg_hba + reload makes postgres reject all TCP connections, then terminate the
+  # connections the app already holds so DROP DATABASE can proceed.
+  # Keep an existing .ccci.bak: it is the real pg_hba.conf left by a run that was killed before
+  # cleanup (while $HBA still holds the lock-down policy), so overwriting it would lose it.
+  if [ ! -f "${HBA}.ccci.bak" ]; then
+    cp "$HBA" "${HBA}.ccci.bak"
+  fi
+  printf 'local all all trust\n' > "$HBA"
+  psql -U "$DB_USER" -d postgres -c "SELECT pg_reload_conf();"
   psql -U "$DB_USER" -d postgres -c \
     "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='${DB_NAME}' AND pid<>pg_backend_pid();"
-  psql -U "$DB_USER" -d postgres -c "DROP DATABASE ${DB_NAME} WITH (FORCE);"
+
+  # IF EXISTS keeps the hook re-runnable after a run that died between DROP and createdb.
+  psql -U "$DB_USER" -d postgres -c "DROP DATABASE IF EXISTS ${DB_NAME} WITH (FORCE);"
   createdb -U "$DB_USER" "$DB_NAME"
-  # Rewrite the empty search_path the VectorChord dump sets so vector/vchord type + operator
-  # references resolve during the import (immich upstream restore procedure), then import.
+  # Rewrite the empty search_path the VectorChord dump sets (header note 1), then import alone.
   gunzip -c "$BACKUP_FILE" \
     | sed "s/SELECT pg_catalog.set_config('search_path', '', false);/SELECT pg_catalog.set_config('search_path', 'public, pg_catalog', true);/g" \
     | psql -U "$DB_USER" -d "$DB_NAME" --single-transaction --set ON_ERROR_STOP=on
+
+  restore_hba
+  trap - EXIT INT TERM
 }
 
-$@
+"$@"