fix(restore): lock the app out via pg_hba during restore so it cannot race the import
Some checks failed
cc-ci/testme cc-ci: failure

This commit is contained in:
autonomic-bot
2026-06-09 16:44:46 +00:00
parent f89f82be4e
commit 57944be67a

View File

@ -6,21 +6,32 @@
# backupbot.backup.volumes.postgres.path = "backup.sql"
# backupbot.restore.post-hook = "/pg_backup.sh restore"
# Backup dumps the immich DB to backup.sql (gzip) inside the postgres volume; backupbot archives that
# file. Restore reads it back and reimports.
# file. Restore reads it back and reimports it.
#
# The immich DB runs the VectorChord/pgvecto.rs extensions. A plain pg_dump emits
# SELECT pg_catalog.set_config('search_path', '', false);
# near the top; on reimport that empty search_path leaves the vector/vchord types + operator classes
# unresolvable, so the first statement that references them errors. immich's official restore
# (docs.immich.app/administration/backup-and-restore) rewrites that line to put `public, pg_catalog`
# back on the search_path BEFORE the import. Without that rewrite a single-transaction import aborts
# wholesale and nothing is restored (the bug in the original ci/pg-backup attempt). We keep
# --single-transaction --set ON_ERROR_STOP=on (immich upstream) so a genuine failure is loud rather
# than silently leaving a half-restored DB.
# Two things make the immich DB tricky to restore, and BOTH are handled below:
#
# 1. VectorChord/pgvecto.rs search_path. A plain pg_dump emits
# SELECT pg_catalog.set_config('search_path', '', false);
# near the top; on reimport that empty search_path leaves the vector/vchord types + operator
# classes unresolvable, so the first statement that references them errors. immich's official
# restore (docs.immich.app/administration/backup-and-restore) rewrites that line to put
# `public, pg_catalog` back on the search_path BEFORE the import. We do the same.
#
# 2. The app races the restore. immich-server keeps a TCP connection pool to the DB and reconnects
# within milliseconds of being dropped, re-running its OWN startup migrations. If it does that
# while our single-transaction import is running, the two conflict (duplicate CREATE/relation
# errors) and ON_ERROR_STOP aborts our transaction — the import rolls back and nothing is
# restored, while the app finishes its migration and looks "healthy" on an empty DB. A one-shot
# pg_terminate_backend does NOT prevent the reconnect. So, like the (green) matrix-synapse hook,
# we replace pg_hba.conf with a LOCAL-trust-only policy and reload: postgres then REJECTS every
# TCP connection, locking the app out for the duration of the restore. We restore pg_hba (via an
# EXIT trap, so it always runs) once the import is done, and the app reconnects to the fully
# restored DB. Our own psql/createdb use the local socket (trust), so they are unaffected.
# Abort on the first unhandled command failure.
set -e
# Gzip-compressed SQL dump inside the postgres data volume (written by backup, read by restore).
BACKUP_FILE='/var/lib/postgresql/data/backup.sql'
# Host-based-auth file that restore temporarily replaces to lock networked clients out.
HBA='/var/lib/postgresql/data/pg_hba.conf'
# Assign first, export second: `export VAR=$(cmd)` reports the exit status of `export`, not of
# `cmd`, so an unreadable secret file would go unnoticed and leave an empty password behind.
# As a plain assignment, a failing `cat` aborts the script under `set -e`.
PGPASSWORD=$(cat "${POSTGRES_PASSWORD_FILE:-/run/secrets/db_password}")
export PGPASSWORD
DB_USER="${POSTGRES_USER:-postgres}"
DB_NAME="${POSTGRES_DB:-immich}"
@ -30,17 +41,33 @@ function backup {
}
# Re-import the gzip-compressed dump at $BACKUP_FILE into a freshly created $DB_NAME.
#
# Globals:  BACKUP_FILE, HBA, DB_USER, DB_NAME (all read); $HBA is rewritten temporarily.
# Returns:  0 on success; non-zero if the archive is unusable or any step fails. The original
#           pg_hba.conf is put back on every exit path that the shell can intercept.
#
# While the import runs, pg_hba.conf is replaced by a local-socket-trust-only policy so that
# networked clients cannot reconnect and run their own migrations concurrently with the import.
function restore {
  # Put the saved pg_hba.conf back and reload the server config. A no-op when no saved copy
  # exists, so it is safe to call both explicitly and from the EXIT trap.
  restore_hba() {
    if [ -f "${HBA}.ccci.bak" ]; then
      # Only discard the saved copy once it has really been written back.
      cat "${HBA}.ccci.bak" > "$HBA" && rm -f "${HBA}.ccci.bak"
      # Best effort: the reload must never mask the real exit status of the restore.
      psql -U "$DB_USER" -d postgres -c "SELECT pg_reload_conf();" >/dev/null 2>&1 || true
    fi
  }

  # Everything that has to happen while networked clients are locked out. Each step is checked
  # explicitly because `set -e` is suspended inside a function called on the left of `||`.
  _restore_locked_import() {
    # Local-trust-only policy + reload: postgres now rejects every TCP connection.
    printf 'local all all trust\n' > "$HBA" || return 1
    psql -U "$DB_USER" -d postgres -c "SELECT pg_reload_conf();" || return 1
    # Drop the connections that were already open so DROP DATABASE can proceed.
    psql -U "$DB_USER" -d postgres -c \
      "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='${DB_NAME}' AND pid<>pg_backend_pid();" \
      || return 1
    # IF EXISTS: a restore onto a server that has no such database yet must not abort.
    # The identifier is double-quoted so its case handling matches createdb and the
    # case-exact datname comparison above.
    psql -U "$DB_USER" -d postgres -c "DROP DATABASE IF EXISTS \"${DB_NAME}\" WITH (FORCE);" || return 1
    createdb -U "$DB_USER" "$DB_NAME" || return 1
    # The dump sets an empty search_path; rewrite it so extension types and operator classes
    # resolve during the import. pipefail (scoped to this subshell) makes a gunzip or sed
    # failure fail the import instead of letting psql succeed on truncated input.
    (
      set -o pipefail
      gunzip -c "$BACKUP_FILE" \
        | sed "s/SELECT pg_catalog.set_config('search_path', '', false);/SELECT pg_catalog.set_config('search_path', 'public, pg_catalog', true);/g" \
        | psql -U "$DB_USER" -d "$DB_NAME" --single-transaction --set ON_ERROR_STOP=on
    ) || return 1
  }

  # Validate the archive BEFORE anything destructive: once the database has been dropped, an
  # unreadable or corrupt dump would leave nothing to fall back on.
  if [ ! -r "$BACKUP_FILE" ] || ! gunzip -t < "$BACKUP_FILE" 2>/dev/null; then
    printf 'restore: %s is missing, unreadable or not valid gzip; database left untouched\n' \
      "$BACKUP_FILE" >&2
    return 1
  fi

  # EXIT puts pg_hba.conf back on any exit path. INT/TERM must actually exit (which fires the
  # EXIT trap); a handler that merely returns would reopen TCP access and then carry on
  # importing with the lock-out gone.
  trap restore_hba EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  # A leftover saved copy means an earlier run died while the lock-out policy was installed;
  # that copy is the real original, so never overwrite it with the lock-out file.
  if [ ! -f "${HBA}.ccci.bak" ]; then
    cp "$HBA" "${HBA}.ccci.bak" || return 1
  fi

  local rc=0
  _restore_locked_import || rc=$?

  restore_hba
  trap - EXIT INT TERM
  return "$rc"
}
# Dispatch: run the function named by the first argument (e.g. `restore`), forwarding the
# remaining arguments. Quoted so arguments are passed through without re-splitting or globbing.
"$@"