fix(restore): lock the app out via pg_hba during restore so it cannot race the import
Some checks failed
cc-ci/testme cc-ci: failure
Some checks failed
cc-ci/testme cc-ci: failure
This commit is contained in:
55
pg_backup.sh
55
pg_backup.sh
@ -6,21 +6,32 @@
|
||||
# backupbot.backup.volumes.postgres.path = "backup.sql"
|
||||
# backupbot.restore.post-hook = "/pg_backup.sh restore"
|
||||
# Backup dumps the immich DB to backup.sql (gzip) inside the postgres volume; backupbot archives that
|
||||
# file. Restore reads it back and reimports.
|
||||
# file. Restore reads it back and reimports it.
|
||||
#
|
||||
# The immich DB runs the VectorChord/pgvecto.rs extensions. A plain pg_dump emits
|
||||
# SELECT pg_catalog.set_config('search_path', '', false);
|
||||
# near the top; on reimport that empty search_path leaves the vector/vchord types + operator classes
|
||||
# unresolvable, so the first statement that references them errors. immich's official restore
|
||||
# (docs.immich.app/administration/backup-and-restore) rewrites that line to put `public, pg_catalog`
|
||||
# back on the search_path BEFORE the import. Without that rewrite a single-transaction import aborts
|
||||
# wholesale and nothing is restored (the bug in the original ci/pg-backup attempt). We keep
|
||||
# --single-transaction --set ON_ERROR_STOP=on (immich upstream) so a genuine failure is loud rather
|
||||
# than silently leaving a half-restored DB.
|
||||
# Two things make the immich DB tricky to restore, and BOTH are handled below:
|
||||
#
|
||||
# 1. VectorChord/pgvecto.rs search_path. A plain pg_dump emits
|
||||
# SELECT pg_catalog.set_config('search_path', '', false);
|
||||
# near the top; on reimport that empty search_path leaves the vector/vchord types + operator
|
||||
# classes unresolvable, so the first statement that references them errors. immich's official
|
||||
# restore (docs.immich.app/administration/backup-and-restore) rewrites that line to put
|
||||
# `public, pg_catalog` back on the search_path BEFORE the import. We do the same.
|
||||
#
|
||||
# 2. The app races the restore. immich-server keeps a TCP connection pool to the DB and reconnects
|
||||
# within milliseconds of being dropped, re-running its OWN startup migrations. If it does that
|
||||
# while our single-transaction import is running, the two conflict (duplicate CREATE/relation
|
||||
# errors) and ON_ERROR_STOP aborts our transaction — the import rolls back and nothing is
|
||||
# restored, while the app finishes its migration and looks "healthy" on an empty DB. A one-shot
|
||||
# pg_terminate_backend does NOT prevent the reconnect. So, like the (green) matrix-synapse hook,
|
||||
# we replace pg_hba.conf with a LOCAL-trust-only policy and reload: postgres then REJECTS every
|
||||
# TCP connection, locking the app out for the duration of the restore. We restore pg_hba (via an
|
||||
# EXIT/INT/TERM trap, so it runs on every exit path short of an untrappable SIGKILL) once the
# import is done, and the app reconnects to the fully
|
||||
# restored DB. Our own psql/createdb use the local socket (trust), so they are unaffected.
|
||||
|
||||
# Abort the script on the first unhandled command failure.
set -e

# Dump file and host-based-auth file, both addressed inside the postgres data directory.
BACKUP_FILE='/var/lib/postgresql/data/backup.sql'
HBA='/var/lib/postgresql/data/pg_hba.conf'

# Read the DB password from the secret file and export it for the postgres client tools.
# Assignment and export are separate statements because `export VAR=$(cmd)` reports the status
# of `export`, hiding a failed read. The read stays non-fatal, exactly as before.
# NOTE(review): confirm whether an unreadable secret should abort the script instead.
PGPASSWORD=$(cat "${POSTGRES_PASSWORD_FILE:-/run/secrets/db_password}") || true
export PGPASSWORD

# Role and database to operate on, falling back to `postgres` / `immich` when unset.
DB_USER="${POSTGRES_USER:-postgres}"
DB_NAME="${POSTGRES_DB:-immich}"
|
||||
@ -30,17 +41,33 @@ function backup {
|
||||
}
|
||||
|
||||
# Replace the $DB_NAME database with the contents of the gzip dump at $BACKUP_FILE.
#
# Globals:  BACKUP_FILE, HBA, DB_USER, DB_NAME (all read); the file at $HBA is rewritten
#           temporarily and "${HBA}.ccci.bak" holds the saved copy while it is.
# Returns:  non-zero (without touching pg_hba.conf or the database) when the dump is missing
#           or not a valid gzip archive; any later failing step fails the function as well.
function restore {
  # Put the saved pg_hba.conf back and ask postgres to reload it. Does nothing when no saved
  # copy exists, so it is safe to call both directly and from the EXIT trap.
  restore_hba() {
    if [ -f "${HBA}.ccci.bak" ]; then
      # Write through the existing file, and only discard the saved copy once that write
      # succeeded, so a failed write never loses the real policy.
      if cat "${HBA}.ccci.bak" > "$HBA"; then
        rm -f "${HBA}.ccci.bak"
      fi
      # Best effort: the reload must not mask the exit status of whatever triggered cleanup.
      psql -U "$DB_USER" -d postgres -c "SELECT pg_reload_conf();" >/dev/null 2>&1 || true
    fi
  }

  # Validate the dump BEFORE anything destructive: without this a missing or corrupt archive
  # would still drop the database and then "successfully" import nothing.
  if ! gunzip -t "$BACKUP_FILE"; then
    printf 'restore: %s is missing or not a valid gzip archive; database left untouched\n' \
      "$BACKUP_FILE" >&2
    return 1
  fi

  # Make a failure in any stage of the import pipeline (not just the final psql) count.
  set -o pipefail

  # Cleanup runs from the EXIT trap. INT/TERM are turned into an exit so the EXIT trap fires
  # and the function does not simply resume after the signal handler returns.
  trap restore_hba EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  # Lock the networked immich-server OUT for the duration of the restore: a local-trust-only
  # pg_hba plus a reload makes postgres reject TCP connections. If a saved copy already exists
  # (an earlier run was killed mid-restore) keep it: $HBA then holds the lock-down policy, and
  # copying it over the saved original would lose the real policy for good.
  if [ ! -f "${HBA}.ccci.bak" ]; then
    cp "$HBA" "${HBA}.ccci.bak"
  fi
  printf 'local all all trust\n' > "$HBA"
  psql -U "$DB_USER" -d postgres -c "SELECT pg_reload_conf();"

  # Drop the connections the app already holds so the database can be dropped.
  psql -U "$DB_USER" -d postgres -c \
    "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='${DB_NAME}' AND pid<>pg_backend_pid();"

  # IF EXISTS lets a restore into a fresh cluster proceed; the identifier is quoted so the name
  # is taken literally, the same way createdb treats it.
  psql -U "$DB_USER" -d postgres -c "DROP DATABASE IF EXISTS \"${DB_NAME}\" WITH (FORCE);"
  createdb -U "$DB_USER" "$DB_NAME"

  # Rewrite the empty search_path the dump sets so vector/vchord type and operator references
  # resolve during the import, then import in one transaction that stops on the first error.
  gunzip -c "$BACKUP_FILE" \
    | sed "s/SELECT pg_catalog.set_config('search_path', '', false);/SELECT pg_catalog.set_config('search_path', 'public, pg_catalog', true);/g" \
    | psql -U "$DB_USER" -d "$DB_NAME" --single-transaction --set ON_ERROR_STOP=on

  # Success: unlock now and disarm the traps so cleanup does not run a second time.
  restore_hba
  trap - EXIT INT TERM
}
|
||||
|
||||
# Dispatch: run the command named by the script's arguments (e.g. `restore`), passing each
# argument through as its own word instead of re-splitting and globbing it.
"$@"
|
||||
|
||||
Reference in New Issue
Block a user