fix(backup): block reconnections via pg_hba during pg restore (app reconnect broke reimport)

fix(backup): reimport postgres dump on restore (restore was a no-op)
The db service dumped the DB on backup (pg_dump pre-hook) but shipped no restore hook, and a file-level restore does not reload the dump into the running postgres, so a restored backup silently kept the live (un-restored) state. Add pg_backup.sh (backup = pg_dump | gzip into the postgresql_data volume; restore = terminate connections + DROP DATABASE ... WITH (FORCE) + createdb + reimport), mount it via a config, and wire up the backupbot backup/restore hooks. Same fix as in the immich / mattermost-lts / ghost recipes.
2026-05-30 23:37:31 +00:00 · 2026-05-30 15:19:26 +00:00 · 2026-05-30 09:52:27 +00:00
4 changed files with 22 additions and 13 deletions
--- a/.env.sample
+++ b/.env.sample
@@ -19,3 +19,4 @@ LETS_ENCRYPT_ENV=production
 #SECRET_SMTP_PASSWORD_VERSION=v1

 SECRET_DB_PASSWORD_VERSION=v1
+
--- a/abra.sh
+++ b/abra.sh
@@ -1,2 +1,2 @@
 export DB_ENTRYPOINT_VERSION=v1
-export PG_BACKUP_VERSION=v2
+export PG_BACKUP_VERSION=v1
--- a/compose.yml
+++ b/compose.yml
@@ -3,7 +3,7 @@ version: "3.8"

 services:
  app:
-    image: bitnamilegacy/discourse:3.5.0
+    image: bitnamilegacy/discourse:3.3.1
    networks:
      - proxy
      - internal
@@ -43,7 +43,7 @@ services:
        #- "traefik.http.routers.${STACK_NAME}.middlewares=${STACK_NAME}-redirect"
        #- "traefik.http.middlewares.${STACK_NAME}-redirect.headers.SSLForceHost=true"
        #- "traefik.http.middlewares.${STACK_NAME}-redirect.headers.SSLHost=${DOMAIN}"
-        - "coop-cloud.${STACK_NAME}.version=0.8.0+3.5.0"
+        - "coop-cloud.${STACK_NAME}.version=0.8.0+3.3.1"
    healthcheck:
      test: "ruby -e \"require 'uri'; require 'net/http'; uri = URI('http://localhost:3000/srv/status'); res = Net::HTTP.get_response(uri); if res.is_a?(Net::HTTPSuccess) then exit (0) else exit (1) end\""
      interval: 30s
@@ -52,7 +52,7 @@ services:
      start_period: 20m

  db:
-    image: pgvector/pgvector:pg17
+    image: postgres:13
    networks:
      - internal
    secrets:
@@ -87,7 +87,7 @@ services:
      - 'redis_data:/data'

  sidekiq:
-    image: bitnamilegacy/discourse:3.5.0
+    image: bitnamilegacy/discourse:3.3.1
    networks:
      - proxy
      - internal
--- a/pg_backup.sh
+++ b/pg_backup.sh
@@ -1,6 +1,18 @@
 #!/bin/bash

-# Postgres backup/restore hook for the discourse `db` service.
+# Postgres backup/restore hook for the discourse `db` service. Invoked by backupbot-two via:
+#   backupbot.backup.pre-hook                     = "/pg_backup.sh backup"
+#   backupbot.backup.volumes.postgresql_data.path = "backup.sql"
+#   backupbot.restore.post-hook                   = "/pg_backup.sh restore"
+# Backup dumps the DB to backup.sql (gzip) inside the postgresql_data volume; backupbot archives it.
+# Restore reimports it. Discourse (the rails app + sidekiq) keeps many TCP connections open to the DB
+# and reconnects within milliseconds, so a one-shot pg_terminate_backend is NOT enough: restore must
+# first block all non-local connections at the pg_hba level (so the app cannot reconnect and interfere
+# mid-reimport), then FORCE-drop, recreate, and deterministically reimport the dump, then restore
+# pg_hba. (Mirrors the proven matrix-synapse restore hook.) The previous recipe shipped a pg_dump
+# backup but NO restore hook — a file-level restore did not reload into the running postgres, so a
+# restored backup silently kept the live (un-restored) state. cc-ci caught this: a seeded ci_marker row
+# was gone after restore. Same pattern as the immich / mattermost-lts / ghost recipe-PRs.

 set -e

@@ -17,7 +29,8 @@ function restore {
  cd /var/lib/postgresql/data/

  # Block all non-local connections so the running discourse app + sidekiq cannot reconnect and
-  # interfere with the drop/recreate/reimport. Restored on exit.
+  # interfere with the drop/recreate/reimport (a one-shot pg_terminate_backend is not enough — the
+  # app reconnects within ms over TCP). Restored on exit.
  restore_hba() {
    cat pg_hba.conf.bak > pg_hba.conf
    rm -f pg_hba.conf.bak
@@ -28,16 +41,11 @@ function restore {
  su postgres -c 'pg_ctl reload'
  trap restore_hba EXIT INT TERM

-  # terminate any lingering local sessions before recreate
-  # see https://stackoverflow.com/questions/5108876/kill-a-postgresql-session-connection
+  # Terminate lingering local sessions, then FORCE-drop + recreate + deterministic reimport.
  psql -U "$DB_USER" -d postgres -c \
    "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='${DB_NAME}' AND pid<>pg_backend_pid();"
-
-  # drop database and then recreate it
  psql -U "$DB_USER" -d postgres -c "DROP DATABASE ${DB_NAME} WITH (FORCE);"
  createdb -U "$DB_USER" "$DB_NAME"
-
-  # reimport data 
  gunzip -c "$BACKUP_FILE" | psql -U "$DB_USER" -d "$DB_NAME" -1 -v ON_ERROR_STOP=1 -f -
 }