refactor: extract backup/restore into config scripts, trim comments
All checks were successful
cc-ci/testme cc-ci: success

Move the postgres and clickhouse backup/restore hook logic out of inline
compose labels into dedicated config scripts, pg_backup.sh and
clickhouse_backup.sh (the pattern other recipes use), and trim the verbose
explanatory comments down to the essential rationale, which now lives in the scripts.
This commit is contained in:
notplants
2026-06-10 16:55:20 +00:00
committed by notplants
parent 270c8404ce
commit 13458fac56
5 changed files with 88 additions and 45 deletions

View File

@ -26,9 +26,8 @@ services:
- internal
deploy:
restart_policy:
# `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore),
# the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with
# on-failure swarm marks the task Complete and never restarts it, leaving the app down.
# `any`, not `on-failure`: a restore disrupts postgres under the app, and Erlang then
# shuts down gracefully (exit 0); with `on-failure`, swarm marks the task complete and never restarts it.
condition: any
labels:
- "traefik.enable=true"
@ -54,23 +53,18 @@ services:
interval: 5s
timeout: 5s
retries: 60
configs:
- source: pg_backup
target: /pg_backup.sh
mode: 0555
deploy:
labels:
backupbot.backup: "true"
# backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes.<vol>.path,
# relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump
# written to the container root fs never reaches the snapshot, so restore finds nothing.
# The dump therefore lives (transiently — the hooks remove it) at the db-data volume root.
backupbot.backup.volumes.db-data.path: "postgres.dump.gz"
backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz'
backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz"
backupbot.backup.pre-hook: "/pg_backup.sh backup"
backupbot.backup.post-hook: "/pg_backup.sh backup_cleanup"
backupbot.restore: "true"
# --if-exists: without it the DROPs error on objects absent from the live db and
# pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind).
# pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live
# app, so its pooled connections keep stale type-OID caches ('cache lookup failed for
# type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh.
backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"'
backupbot.restore.post-hook: "/pg_backup.sh restore"
plausible_events_db:
image: clickhouse/clickhouse-server:23.4.2.11-alpine
@ -85,24 +79,19 @@ services:
- source: clickhouse_entrypoint
target: /custom-entrypoint.sh
mode: 0555
- source: clickhouse_backup
target: /clickhouse_backup.sh
mode: 0555
networks:
- internal
deploy:
labels:
backupbot.backup: "true"
# v2 volumes-include syntax (see db service): snapshot only the clickhouse-backup output
# inside the event-data volume — not the live raw data files (restoring those under a
# running server is unsafe; the restore post-hook performs the logical restore instead).
backupbot.backup.volumes.event-data.path: "backup/events"
# schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree
# data, so it backs up that table's SCHEMA but not its rows, and a restore would leave
# the migration ledger empty: the next app boot then re-runs every ClickHouse migration
# against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows
# into the backup dir alongside the clickhouse-backup output, and reload them on restore.
backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv'
backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
backupbot.backup.pre-hook: "/clickhouse_backup.sh backup"
backupbot.backup.post-hook: "/clickhouse_backup.sh backup_cleanup"
backupbot.restore: "true"
backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv && rm -rf /var/lib/clickhouse/backup/events'
backupbot.restore.post-hook: "/clickhouse_backup.sh restore"
volumes:
db-data:
@ -123,3 +112,9 @@ configs:
clickhouse_entrypoint:
name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION}
file: entrypoint.clickhouse.sh
pg_backup:
name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION}
file: pg_backup.sh
clickhouse_backup:
name: ${STACK_NAME}_clickhouse_backup_${CLICKHOUSE_BACKUP_SCRIPT_VERSION}
file: clickhouse_backup.sh