refactor: extract backup/restore into config scripts, trim comments
All checks were successful
cc-ci/testme cc-ci: success
All checks were successful
cc-ci/testme cc-ci: success
Move the postgres and clickhouse backup/restore hook logic out of inline compose labels into dedicated pg_backup.sh / clickhouse_backup.sh config scripts (the pattern other recipes use), and trim the verbose explanatory comments down to the essential rationale, now living in the scripts.
This commit is contained in:
4
abra.sh
4
abra.sh
@ -1,3 +1,5 @@
|
||||
export CLICKHOUSE_CONF_VERSION=v2
|
||||
export CLICKHOUSE_USER_CONF_VERSION=v2
|
||||
export CLICKHOUSE_ENTRYPOINT_VERSION=v5
|
||||
export CLICKHOUSE_ENTRYPOINT_VERSION=v6
|
||||
export PG_BACKUP_VERSION=v1
|
||||
export CLICKHOUSE_BACKUP_SCRIPT_VERSION=v1
|
||||
|
||||
30
clickhouse_backup.sh
Normal file
30
clickhouse_backup.sh
Normal file
@ -0,0 +1,30 @@
|
||||
#!/bin/bash

set -e

# Backups are produced by clickhouse-backup, whose output directory sits inside the
# event-data volume and is what gets snapshotted
# (backupbot.backup.volumes.event-data.path). Putting raw data files back under a
# running server is unsafe, which is why restore() performs a logical restore.
BACKUP_DIR='/var/lib/clickhouse/backup/events'
MIGRATIONS_TSV="${BACKUP_DIR}/schema_migrations.tsv"
|
||||
|
||||
# Creates the `events` clickhouse-backup and exports the schema_migrations rows
# into the same directory. Reads BACKUP_DIR and MIGRATIONS_TSV.
backup() {
  # A backup directory left over from an interrupted run (e.g. the cleanup hook never
  # ran) makes `clickhouse-backup create` fail because the name is already taken, so
  # start from a clean slate. `:?` aborts instead of running rm on an empty path.
  rm -rf "${BACKUP_DIR:?}"
  clickhouse-backup create events
  # schema_migrations is a TinyLog table — clickhouse-backup only FREEZEs MergeTree
  # data, so its rows aren't captured. Export them alongside the backup, else a restore
  # leaves the ledger empty and the next boot re-runs every migration (DUPLICATE_COLUMN).
  clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > "$MIGRATIONS_TSV"
}
|
||||
|
||||
# Removes the clickhouse-backup output directory, including the exported
# schema_migrations TSV. Wired up as the backupbot backup post-hook.
backup_cleanup() {
  rm -rf "${BACKUP_DIR}"
}
|
||||
|
||||
# Logical restore of the `events` backup, followed by a reload of the
# schema_migrations rows exported by backup(). Removes BACKUP_DIR on success.
# Returns non-zero without touching the database when the exported rows are missing.
restore() {
  # Validate before anything destructive: `restore --rm` drops the existing schema
  # objects, and without the exported rows the migration ledger would be left empty.
  if [ ! -f "$MIGRATIONS_TSV" ]; then
    echo "restore: $MIGRATIONS_TSV not found; refusing to restore" >&2
    return 1
  fi
  clickhouse-backup restore --rm events
  # Replace whatever rows the table holds with the exported ledger.
  clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations"
  clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < "$MIGRATIONS_TSV"
  rm -rf "$BACKUP_DIR"
}
|
||||
|
||||
"$@"
|
||||
47
compose.yml
47
compose.yml
@ -26,9 +26,8 @@ services:
|
||||
- internal
|
||||
deploy:
|
||||
restart_policy:
|
||||
# `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore),
|
||||
# the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with
|
||||
# on-failure swarm marks the task Complete and never restarts it, leaving the app down.
|
||||
# `any`, not `on-failure`: a restore disrupts postgres under the app and Erlang then
|
||||
# shuts down gracefully (exit 0), which on-failure treats as done and never restarts.
|
||||
condition: any
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
@ -54,23 +53,18 @@ services:
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 60
|
||||
configs:
|
||||
- source: pg_backup
|
||||
target: /pg_backup.sh
|
||||
mode: 0555
|
||||
deploy:
|
||||
labels:
|
||||
backupbot.backup: "true"
|
||||
# backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes.<vol>.path,
|
||||
# relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump
|
||||
# written to the container root fs never reaches the snapshot, so restore finds nothing.
|
||||
# The dump therefore lives (transiently — the hooks remove it) at the db-data volume root.
|
||||
backupbot.backup.volumes.db-data.path: "postgres.dump.gz"
|
||||
backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz'
|
||||
backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz"
|
||||
backupbot.backup.pre-hook: "/pg_backup.sh backup"
|
||||
backupbot.backup.post-hook: "/pg_backup.sh backup_cleanup"
|
||||
backupbot.restore: "true"
|
||||
# --if-exists: without it the DROPs error on objects absent from the live db and
|
||||
# pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind).
|
||||
# pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live
|
||||
# app, so its pooled connections keep stale type-OID caches ('cache lookup failed for
|
||||
# type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh.
|
||||
backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"'
|
||||
backupbot.restore.post-hook: "/pg_backup.sh restore"
|
||||
|
||||
plausible_events_db:
|
||||
image: clickhouse/clickhouse-server:23.4.2.11-alpine
|
||||
@ -85,24 +79,19 @@ services:
|
||||
- source: clickhouse_entrypoint
|
||||
target: /custom-entrypoint.sh
|
||||
mode: 0555
|
||||
- source: clickhouse_backup
|
||||
target: /clickhouse_backup.sh
|
||||
mode: 0555
|
||||
networks:
|
||||
- internal
|
||||
deploy:
|
||||
labels:
|
||||
backupbot.backup: "true"
|
||||
# v2 volumes-include syntax (see db service): snapshot only the clickhouse-backup output
|
||||
# inside the event-data volume — not the live raw data files (restoring those under a
|
||||
# running server is unsafe; the restore post-hook performs the logical restore instead).
|
||||
backupbot.backup.volumes.event-data.path: "backup/events"
|
||||
# schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree
|
||||
# data, so it backs up that table's SCHEMA but not its rows, and a restore would leave
|
||||
# the migration ledger empty: the next app boot then re-runs every ClickHouse migration
|
||||
# against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows
|
||||
# into the backup dir alongside the clickhouse-backup output, and reload them on restore.
|
||||
backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv'
|
||||
backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
|
||||
backupbot.backup.pre-hook: "/clickhouse_backup.sh backup"
|
||||
backupbot.backup.post-hook: "/clickhouse_backup.sh backup_cleanup"
|
||||
backupbot.restore: "true"
|
||||
backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv && rm -rf /var/lib/clickhouse/backup/events'
|
||||
backupbot.restore.post-hook: "/clickhouse_backup.sh restore"
|
||||
|
||||
volumes:
|
||||
db-data:
|
||||
@ -123,3 +112,9 @@ configs:
|
||||
clickhouse_entrypoint:
|
||||
name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION}
|
||||
file: entrypoint.clickhouse.sh
|
||||
pg_backup:
|
||||
name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION}
|
||||
file: pg_backup.sh
|
||||
clickhouse_backup:
|
||||
name: ${STACK_NAME}_clickhouse_backup_${CLICKHOUSE_BACKUP_SCRIPT_VERSION}
|
||||
file: clickhouse_backup.sh
|
||||
|
||||
@ -1,22 +1,9 @@
|
||||
#!/bin/bash
|
||||
# clickhouse-backup powers this recipe's backup/restore (the backupbot pre/post-hooks run
|
||||
# `clickhouse-backup create/restore`). A deploy without it would have silently broken backups,
|
||||
# so if it truly cannot be installed the deploy must FAIL LOUDLY rather than degrade.
|
||||
#
|
||||
# The published recipe fetched it with a single silenced no-retry wget at every container start:
|
||||
# any transient GitHub failure exited the container (set -e) before clickhouse-server started,
|
||||
# and the swarm restart loop re-downloaded the 22 MB asset on every restart, amplifying a
|
||||
# throttle into a permanent crash-loop and a deploy timeout (cc-ci Q4.7).
|
||||
#
|
||||
# Hardening (no behaviour change when the fetch succeeds first try):
|
||||
# - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version, so it is
|
||||
# downloaded at most once per app and container restarts never re-fetch;
|
||||
# - canonical Altinity URL (the project moved; the old AlexAkulov path is just a redirect);
|
||||
# - bounded retries with backoff + a read timeout, so a stalled connection cannot hang the
|
||||
# deploy and a transient failure cannot kill it;
|
||||
# - verify the binary actually executes before trusting or caching it (catches truncated
|
||||
# downloads and a corrupt cache);
|
||||
# - un-silenced: every attempt and the final verdict are visible in `docker service logs`.
|
||||
# Install clickhouse-backup (powers this recipe's backup/restore hooks) before starting the
|
||||
# server. The binary is cached on the persistent volume keyed by version (downloaded at most
|
||||
# once per app) and fetched with bounded retries + a read timeout; the binary is verified before
|
||||
# being trusted or cached. If it truly cannot be installed the deploy fails loudly rather than
|
||||
# silently shipping broken backups.
|
||||
|
||||
set -e
|
||||
|
||||
|
||||
29
pg_backup.sh
Normal file
29
pg_backup.sh
Normal file
@ -0,0 +1,29 @@
|
||||
#!/bin/sh

set -e

# backup-bot-two v2 snapshots paths inside named volumes
# (backupbot.backup.volumes.db-data.path) rather than the container root fs, so the
# dump is kept at the root of the db-data volume.
DUMP='/var/lib/postgresql/data/postgres.dump'
|
||||
|
||||
# Writes a gzip-compressed custom-format dump of $POSTGRES_DB to "$DUMP.gz".
# Reads DUMP, POSTGRES_USER and POSTGRES_DB. Returns non-zero, leaving no archive
# behind, when pg_dump fails.
backup() {
  # Drop leftovers so a failed run can never expose an older archive as the new one.
  rm -f "$DUMP" "$DUMP.gz"
  # Dump to a file instead of piping into gzip: /bin/sh has no portable pipefail, so
  # in `pg_dump | gzip` a failing pg_dump is hidden behind gzip's exit 0 and an empty
  # or truncated archive would pass for a good backup.
  if ! pg_dump -U "$POSTGRES_USER" -Fc -f "$DUMP" "$POSTGRES_DB"; then
    rm -f "$DUMP"
    return 1
  fi
  # Produces "$DUMP.gz" and removes the uncompressed file.
  gzip -f "$DUMP"
}
|
||||
|
||||
# Removes the compressed dump written by backup(). Wired up as the backupbot
# backup post-hook; succeeds even when the file is already gone.
backup_cleanup() {
  rm -f "${DUMP}.gz"
}
|
||||
|
||||
# Restores $POSTGRES_DB from "$DUMP.gz" and then drops the other connections to it.
# Reads DUMP, POSTGRES_USER and POSTGRES_DB. The uncompressed dump is removed whether
# or not pg_restore succeeds; on failure pg_restore's exit status is returned.
restore() {
  # -f: a plain dump left behind by an earlier failed restore would otherwise make
  # gzip refuse to overwrite it and abort every subsequent restore.
  gzip -df "$DUMP.gz" || return 1
  # --if-exists: otherwise DROPs of objects absent from the live db error out and
  # pg_restore exits 1, failing the restore.
  restore_status=0
  pg_restore --clean --if-exists -U "$POSTGRES_USER" --dbname="$POSTGRES_DB" < "$DUMP" || restore_status=$?
  # Never leave the uncompressed dump in the data volume, even on failure.
  rm -f "$DUMP"
  [ "$restore_status" -eq 0 ] || return "$restore_status"
  # pg_restore --clean recreates objects under the live app, so its pooled connections
  # keep stale type-OID caches ('cache lookup failed for type ...' crash loops, e.g.
  # Oban). Terminate them so Ecto reconnects fresh.
  psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"
}
|
||||
|
||||
"$@"
|
||||
Reference in New Issue
Block a user