diff --git a/README.md b/README.md index 14a478f..36defb8 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,17 @@ override) so it works behind the reverse proxy. abra app run YOURAPPDOMAIN app discourse admin create ``` +## Postgres major version upgrades + +Handled automatically by the [`discourse/postgres`] image (pgvector + an +auto-upgrade layer). On deploy it finds an older cluster, installs the old +binaries and runs `pg_upgrade` into the new versioned data directory. The recipe +adds a small entrypoint wrapper that injects the password secret and detects the +old cluster's real install superuser (oid 10), so the upgrade works whether that +user is `postgres` or `discourse`. No manual dump/restore needed. + +[`discourse/postgres`]: https://github.com/discourse/discourse-postgres + ## Migrating from the previous (bitnami) recipe The official image stores uploads under `/shared` rather than bitnami's diff --git a/abra.sh b/abra.sh index 2fae15c..346fbd5 100644 --- a/abra.sh +++ b/abra.sh @@ -1,5 +1,5 @@ -export DB_ENTRYPOINT_VERSION=v3 -export PG_BACKUP_VERSION=v2 +export DB_ENTRYPOINT_VERSION=v6 +export PG_BACKUP_VERSION=v3 export APP_ENTRYPOINT_VERSION=v2 export APP_INSTALL_SSL_VERSION=v1 export APP_MIGRATE_UPLOADS_VERSION=v1 diff --git a/cc-db-entrypoint.sh b/cc-db-entrypoint.sh new file mode 100644 index 0000000..4d2807f --- /dev/null +++ b/cc-db-entrypoint.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# Co-op Cloud entrypoint wrapper for the discourse/postgres image. +# +# discourse/postgres (https://github.com/discourse/discourse-postgres) is pgvector +# plus a management layer that auto-upgrades an older cluster on boot. It does the +# heavy lifting (apt-installs the old binaries, runs pg_upgrade, writes the new +# cluster to the versioned PGDATA). This wrapper only fills the two gaps it leaves: +# +# 1. Secrets: the image reads DB_PASSWORD / POSTGRES_PASSWORD from the process +# env (no *_FILE support), so inject them from the docker secret. +# 2. Install user: the image runs `pg_upgrade --username="$POSTGRES_USER"` and +# initdb's the new cluster with $POSTGRES_USER, but never detects the OLD +# cluster's real bootstrap superuser (the install user, oid 10). pg_upgrade +# aborts unless the new cluster's install-user name matches the old one. Real +# deployments differ: some were bootstrapped with `postgres` as the install +# user (+ a separate `discourse` app role), others with `discourse` itself. +# So detect oid 10 from the old cluster and export POSTGRES_USER to match +# before handing off to the image's run-postgres.sh. +set -e + +# --- 1. secret injection ----------------------------------------------------- +if [ -f /run/secrets/db_password ]; then + pw="$(cat /run/secrets/db_password)" + export DB_PASSWORD="$pw" + export POSTGRES_PASSWORD="$pw" +fi + +# --- 2. install-user detection (only matters on the upgrade path) ------------ +NEW_MAJOR="$(postgres --version | sed -rn 's/^[^0-9]*([0-9]+).*/\1/p')" + +# Newest existing cluster under the data mount (same search the image uses). +PGVER_FILE="$(find /var/lib/postgresql -maxdepth 3 -type f -name PG_VERSION 2>/dev/null \ + | xargs -I{} sh -c 'printf "%s " "{}"; cat "{}"' \ + | sort -nk2,2 | tail -n1 | awk '{print $1}')" + +if [ -n "$PGVER_FILE" ]; then + OLD_DIR="$(dirname "$PGVER_FILE")" + OLD_MAJOR="$(cat "$PGVER_FILE")" + if [ "$OLD_MAJOR" != "$NEW_MAJOR" ]; then + echo "cc-db-entrypoint: existing pg${OLD_MAJOR} cluster at ${OLD_DIR}, image is pg${NEW_MAJOR} -> detecting install user" + OLD_BIN="/usr/lib/postgresql/${OLD_MAJOR}/bin" + if [ ! -x "$OLD_BIN/pg_ctl" ]; then + echo "cc-db-entrypoint: installing postgresql-${OLD_MAJOR} to read the old cluster" + apt-get update + apt-get install -y --no-install-recommends "postgresql-${OLD_MAJOR}" >/dev/null + fi + chown -R postgres "$OLD_DIR" 2>/dev/null || true + + # Briefly start the old cluster on a local socket only, ask it for oid 10. + gosu postgres "$OLD_BIN/pg_ctl" -D "$OLD_DIR" -w \ + -o "-c listen_addresses= -c unix_socket_directories=/tmp" start >/dev/null 2>&1 || true + detected="" + for login_role in discourse postgres; do + detected="$(gosu postgres psql -h /tmp -U "$login_role" -d postgres -tAc \ + 'select rolname from pg_roles where oid = 10' 2>/dev/null | tr -d '[:space:]')" + [ -n "$detected" ] && break + done + gosu postgres "$OLD_BIN/pg_ctl" -D "$OLD_DIR" -w stop >/dev/null 2>&1 || true + + if [ -n "$detected" ]; then + echo "cc-db-entrypoint: old cluster install user is '$detected' -> POSTGRES_USER=$detected" + export POSTGRES_USER="$detected" + else + echo "cc-db-entrypoint: WARNING could not detect old install user; leaving POSTGRES_USER=${POSTGRES_USER:-}" + fi + + # pg_upgrade refuses to run if the old and new clusters disagree on data + # checksums. PostgreSQL 18's initdb enables checksums by default, but the + # older clusters in this recipe's lineage (pg13-17) were created without + # them, so initdb the new cluster to match. Default to OFF (the lineage + # reality) and only enable when the old cluster positively reports them on. + csum="" + if [ -x "$OLD_BIN/pg_controldata" ]; then + csum="$("$OLD_BIN/pg_controldata" "$OLD_DIR" 2>/dev/null \ + | awk -F: '/checksum version/{gsub(/[^0-9]/,"",$2); print $2}')" + fi + if [ "$csum" = "1" ]; then + echo "cc-db-entrypoint: old cluster data checksums ON -> initdb new cluster --data-checksums" + export POSTGRES_INITDB_ARGS="${POSTGRES_INITDB_ARGS:+$POSTGRES_INITDB_ARGS }--data-checksums" + else + echo "cc-db-entrypoint: old cluster data checksums OFF (version='${csum:-unknown}') -> initdb new cluster --no-data-checksums" + export POSTGRES_INITDB_ARGS="${POSTGRES_INITDB_ARGS:+$POSTGRES_INITDB_ARGS }--no-data-checksums" + fi + fi +fi + +exec run-postgres.sh postgres diff --git a/compose.yml b/compose.yml index 5a3e717..8935685 100644 --- a/compose.yml +++ b/compose.yml @@ -63,35 +63,42 @@ services: start_period: 25m db: - image: pgvector/pgvector:pg17 + # discourse/postgres = pgvector + discourse's postgres management layer, which + # auto-upgrades an older cluster in place on boot (pg_upgrade into the versioned + # PGDATA /var/lib/postgresql/${MAJOR}/docker). The cc-db-entrypoint wrapper + # injects the password secret and detects the old cluster's install user. + image: discourse/postgres:pg18 networks: - internal secrets: - db_password volumes: - - 'postgresql_data:/var/lib/postgresql/data' + # the image expects the whole cluster tree mounted here (not the data subdir); + # an existing pg17 cluster at the volume root is found and upgraded into /18/docker + - 'postgresql_data:/var/lib/postgresql' configs: - source: db_entrypoint - target: /docker-entrypoint.sh + target: /usr/local/bin/cc-db-entrypoint.sh mode: 0555 - source: pg_backup target: /pg_backup.sh mode: 0555 - entrypoint: /docker-entrypoint.sh + entrypoint: /usr/local/bin/cc-db-entrypoint.sh environment: + # internal-only overlay network; keep all-trust so the app and the + # backup/restore hooks connect without juggling the superuser password - POSTGRES_HOST_AUTH_METHOD=trust - - POSTGRES_USER=discourse - POSTGRES_DB=discourse - - POSTGRES_PASSWORD_FILE=/run/secrets/db_password + - DB_USER=discourse healthcheck: test: "pg_isready -U discourse -d discourse" interval: 30s timeout: 10s retries: 5 - # generous: a postgres major-version upgrade (apt install + pg_upgrade) runs - # in the entrypoint before the server accepts connections — don't let the - # healthcheck kill an in-progress migration - start_period: 10m + # generous: a postgres major-version upgrade (apt install old binaries + + # pg_upgrade) runs in the entrypoint before the server accepts connections — + # don't let the healthcheck kill an in-progress migration + start_period: 15m deploy: labels: backupbot.backup: "true" @@ -140,8 +147,7 @@ configs: file: migrate-uploads.sh db_entrypoint: name: ${STACK_NAME}_db_entrypoint_${DB_ENTRYPOINT_VERSION} - file: entrypoint.postgres.sh.tmpl - template_driver: golang + file: cc-db-entrypoint.sh pg_backup: name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION} file: pg_backup.sh diff --git a/entrypoint.postgres.sh.tmpl b/entrypoint.postgres.sh.tmpl deleted file mode 100644 index cbd032b..0000000 --- a/entrypoint.postgres.sh.tmpl +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -set -e - -OLDDATA=$PGDATA/old_data -NEWDATA=$PGDATA/new_data - -echo "Running as $(id)" - -# The migration uses $OLDDATA/$NEWDATA as scratch and removes them when it -# finishes; a leftover *empty* one means a run was interrupted before any data -# moved (data still intact at $PGDATA) so we clear it and retry, while a -# *non-empty* one means data may live only there, so we stop for manual recovery. -for scratch in $OLDDATA $NEWDATA; do - if [ -d "$scratch" ] && [ -n "$(ls -A "$scratch")" ]; then - echo "FATAL: $scratch exists and is not empty - a previous migration did not" - echo "complete and the data may only exist there. manual recovery necessary." - exit 1 - fi -done -rm -rf $OLDDATA $NEWDATA - -if [ -f $PGDATA/PG_VERSION ]; then - DATA_VERSION=$(cat $PGDATA/PG_VERSION) - - if [ -n "$DATA_VERSION" -a "$PG_MAJOR" != "$DATA_VERSION" ]; then - echo "postgres data version $DATA_VERSION found, but need $PG_MAJOR. Starting migration" - echo "Installing postgres $DATA_VERSION" - sed -i "s/$/ $DATA_VERSION/" /etc/apt/sources.list.d/pgdg.list - apt-get update && apt-get install -y --no-install-recommends \ - postgresql-$DATA_VERSION \ - && rm -rf /var/lib/apt/lists/* - # pg_upgrade must run as the old cluster's bootstrap superuser (the "install - # user", oid 10), and the new cluster must be initialised with that same - # user. It is not necessarily $POSTGRES_USER (e.g. clusters created with the - # default "postgres" superuser and a separate app role), so read it from the - # old cluster: briefly start it and ask, connecting as the app role we know. - PGBIN=/usr/lib/postgresql/$DATA_VERSION/bin - gosu postgres $PGBIN/pg_ctl -D $PGDATA -w \ - -o "-c listen_addresses= -c unix_socket_directories=/tmp" start - INSTALL_USER=$(gosu postgres psql -h /tmp -U "$POSTGRES_USER" -d postgres -tAc \ - "select rolname from pg_roles where oid = 10") - gosu postgres $PGBIN/pg_ctl -D $PGDATA -w stop - echo "old cluster install user: $INSTALL_USER" - echo "shuffling around" - gosu postgres mkdir $OLDDATA $NEWDATA - chmod 700 $OLDDATA $NEWDATA - mv $PGDATA/* $OLDDATA/ || true - echo "running initdb" - # abuse entrypoint script for initdb by making server error out; initialise - # the new cluster with the same superuser as the old one so pg_upgrade matches - gosu postgres bash -c "export PGDATA=$NEWDATA POSTGRES_USER=$INSTALL_USER ; /usr/local/bin/docker-entrypoint.sh --invalid-arg || true" - echo "running pg_upgrade" - cd /tmp - gosu postgres pg_upgrade --link -b /usr/lib/postgresql/$DATA_VERSION/bin -d $OLDDATA -D $NEWDATA -U $INSTALL_USER - cp $OLDDATA/pg_hba.conf $NEWDATA/ - mv $NEWDATA/* $PGDATA - rm -rf $OLDDATA - rmdir $NEWDATA - echo "migration complete" - fi -fi - -/usr/local/bin/docker-entrypoint.sh postgres diff --git a/pg_backup.sh b/pg_backup.sh index 382a1d2..8044e43 100755 --- a/pg_backup.sh +++ b/pg_backup.sh @@ -1,44 +1,59 @@ #!/bin/bash -# Postgres backup/restore hook for the discourse `db` service. +# Postgres backup/restore hook for the discourse `db` service (discourse/postgres image). set -e -BACKUP_FILE='/var/lib/postgresql/data/backup.sql' -export PGPASSWORD=$(cat "${POSTGRES_PASSWORD_FILE:-/run/secrets/db_password}") -DB_USER="${POSTGRES_USER:-discourse}" +# discourse/postgres keeps the live cluster at a versioned PGDATA under the +# /var/lib/postgresql mount. Write the dump at the volume root so backupbot's +# `postgresql_data.path: backup.sql` label captures it. +BACKUP_FILE='/var/lib/postgresql/backup.sql' +DATADIR="${PGDATA:-/var/lib/postgresql/18/docker}" DB_NAME="${POSTGRES_DB:-discourse}" +# The bootstrap superuser (install user, oid 10) differs between deployments +# (`postgres` on bitnami-origin clusters, `discourse` on others). Detect it at +# runtime over the local trust socket rather than hard-coding a name. +detect_superuser() { + local u name + for u in discourse postgres; do + name="$(psql -U "$u" -d "$DB_NAME" -tAc 'select rolname from pg_roles where oid = 10' 2>/dev/null | tr -d '[:space:]')" + if [ -n "$name" ]; then echo "$name"; return 0; fi + done + echo postgres +} +SU="$(detect_superuser)" + function backup { - pg_dump -U "$DB_USER" "$DB_NAME" | gzip > "$BACKUP_FILE" + pg_dump -U "$SU" "$DB_NAME" | gzip > "$BACKUP_FILE" } function restore { - cd /var/lib/postgresql/data/ + cd "$DATADIR" # Block all non-local connections so the running discourse app + sidekiq cannot reconnect and # interfere with the drop/recreate/reimport. Restored on exit. restore_hba() { cat pg_hba.conf.bak > pg_hba.conf rm -f pg_hba.conf.bak - su postgres -c 'pg_ctl reload' + su postgres -c "pg_ctl -D '$DATADIR' reload" } cp pg_hba.conf pg_hba.conf.bak echo 'local all all trust' > pg_hba.conf - su postgres -c 'pg_ctl reload' + su postgres -c "pg_ctl -D '$DATADIR' reload" trap restore_hba EXIT INT TERM # terminate any lingering local sessions before recreate # see https://stackoverflow.com/questions/5108876/kill-a-postgresql-session-connection - psql -U "$DB_USER" -d postgres -c \ + psql -U "$SU" -d postgres -c \ "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='${DB_NAME}' AND pid<>pg_backend_pid();" # drop database and then recreate it - psql -U "$DB_USER" -d postgres -c "DROP DATABASE ${DB_NAME} WITH (FORCE);" - createdb -U "$DB_USER" "$DB_NAME" + psql -U "$SU" -d postgres -c "DROP DATABASE ${DB_NAME} WITH (FORCE);" + createdb -U "$SU" "$DB_NAME" - # reimport data - gunzip -c "$BACKUP_FILE" | psql -U "$DB_USER" -d "$DB_NAME" -1 -v ON_ERROR_STOP=1 -f - + # reimport data + gunzip -c "$BACKUP_FILE" | psql -U "$SU" -d "$DB_NAME" -1 -v ON_ERROR_STOP=1 -f - } $@