From 13458fac56a19601267192acc769f3ba8fc55036 Mon Sep 17 00:00:00 2001
From: notplants <@notplants>
Date: Wed, 10 Jun 2026 16:55:20 +0000
Subject: [PATCH] refactor: extract backup/restore into config scripts, trim comments

Move the postgres and clickhouse backup/restore hook logic out of inline
compose labels into dedicated pg_backup.sh / clickhouse_backup.sh config
scripts (the pattern other recipes use), and trim the verbose explanatory
comments down to the essential rationale, now living in the scripts.

While moving the logic, harden the scripts: pg_dump writes to a file before
gzip so a failed dump is no longer masked by the pipe, files left behind by
an interrupted run are overwritten or cleared, the clickhouse restore checks
that the migration ledger export exists before it drops anything, and both
scripts answer a missing or unknown subcommand with a usage error instead of
running "$@" blindly.
---
Review notes (text between "---" and the diffstat is dropped by git am):
- Line structure and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; compose.yml and entrypoint.clickhouse.sh context/removed
  lines may not match the tree byte for byte. Regenerate with
  `git format-patch` after amending the commit rather than applying this text.
- The blob ids on the `index` lines of the two new scripts predate the script
  changes made in this revision and are stale.

 abra.sh                  |  4 +++-
 clickhouse_backup.sh     | 51 ++++++++++++++++++++++++++++++++++++++++
 compose.yml              | 47 +++++++++++++++++-------------------
 entrypoint.clickhouse.sh | 23 ++++--------------
 pg_backup.sh             | 46 ++++++++++++++++++++++++++++++++++++
 5 files changed, 126 insertions(+), 45 deletions(-)
 create mode 100644 clickhouse_backup.sh
 create mode 100644 pg_backup.sh

diff --git a/abra.sh b/abra.sh
index 9e41df0..60974a7 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1,3 +1,5 @@
 export CLICKHOUSE_CONF_VERSION=v2
 export CLICKHOUSE_USER_CONF_VERSION=v2
-export CLICKHOUSE_ENTRYPOINT_VERSION=v5
+export CLICKHOUSE_ENTRYPOINT_VERSION=v6
+export PG_BACKUP_VERSION=v1
+export CLICKHOUSE_BACKUP_SCRIPT_VERSION=v1
diff --git a/clickhouse_backup.sh b/clickhouse_backup.sh
new file mode 100644
index 0000000..ac537b1
--- /dev/null
+++ b/clickhouse_backup.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+set -e
+
+# Backup/restore hooks for the plausible_events_db (ClickHouse) service, run by
+# backup-bot-two as `/clickhouse_backup.sh <backup|backup_cleanup|restore>`.
+#
+# clickhouse-backup output lives inside the event-data volume (snapshotted via
+# backupbot.backup.volumes.event-data.path). Restoring the raw data files under a
+# running server is unsafe, so restore performs a logical restore instead.
+BACKUP_DIR=/var/lib/clickhouse/backup/events
+MIGRATIONS_TSV="$BACKUP_DIR/schema_migrations.tsv"
+
+# Pre-hook: create the `events` backup and export the migration ledger next to it.
+backup() {
+  # Start clean: clear any output left behind by an interrupted run before creating
+  # the backup (an existing `events` backup presumably makes `create` fail - confirm).
+  rm -rf "${BACKUP_DIR:?}"
+  clickhouse-backup create events
+  # schema_migrations is a TinyLog table — clickhouse-backup only FREEZEs MergeTree
+  # data, so its rows aren't captured. Export them alongside the backup, else a restore
+  # leaves the ledger empty and the next boot re-runs every migration (DUPLICATE_COLUMN).
+  clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > "$MIGRATIONS_TSV"
+}
+
+# Post-hook: remove the transient backup output once the snapshot has been taken.
+backup_cleanup() {
+  rm -rf "${BACKUP_DIR:?}"
+}
+
+# Restore post-hook: logical restore of the snapshotted backup, then reload the ledger.
+restore() {
+  # Check for the ledger export BEFORE the destructive steps: without it the chain
+  # would get through `restore --rm` and the TRUNCATE, then die on the INSERT and
+  # leave the ledger empty (the DUPLICATE_COLUMN crash loop described in backup).
+  if [ ! -f "$MIGRATIONS_TSV" ]; then
+    echo "clickhouse_backup.sh: $MIGRATIONS_TSV not found, refusing to restore" >&2
+    exit 1
+  fi
+  clickhouse-backup restore --rm events
+  clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations"
+  clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < "$MIGRATIONS_TSV"
+  rm -rf "${BACKUP_DIR:?}"
+}
+
+# Dispatch explicitly: a bare "$@" runs nothing and exits 0 when the hook passes no
+# argument, which would make a broken hook label look like a successful run.
+case "${1:-}" in
+  backup | backup_cleanup | restore) "$1" ;;
+  *) echo "usage: $0 {backup|backup_cleanup|restore}" >&2; exit 2 ;;
+esac
diff --git a/compose.yml b/compose.yml
index 3c1a68a..ca1e152 100644
--- a/compose.yml
+++ b/compose.yml
@@ -26,9 +26,8 @@ services:
       - internal
     deploy:
       restart_policy:
-        # `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore),
-        # the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with
-        # on-failure swarm marks the task Complete and never restarts it, leaving the app down.
+        # `any`, not `on-failure`: a restore disrupts postgres under the app and Erlang then
+        # shuts down gracefully (exit 0), which on-failure treats as done and never restarts.
         condition: any
       labels:
         - "traefik.enable=true"
@@ -54,23 +53,18 @@ services:
       interval: 5s
       timeout: 5s
       retries: 60
+    configs:
+      - source: pg_backup
+        target: /pg_backup.sh
+        mode: 0555
     deploy:
       labels:
         backupbot.backup: "true"
-        # backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes..path,
-        # relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump
-        # written to the container root fs never reaches the snapshot, so restore finds nothing.
-        # The dump therefore lives (transiently — the hooks remove it) at the db-data volume root.
         backupbot.backup.volumes.db-data.path: "postgres.dump.gz"
-        backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz'
-        backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz"
+        backupbot.backup.pre-hook: "/pg_backup.sh backup"
+        backupbot.backup.post-hook: "/pg_backup.sh backup_cleanup"
         backupbot.restore: "true"
-        # --if-exists: without it the DROPs error on objects absent from the live db and
-        # pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind).
-        # pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live
-        # app, so its pooled connections keep stale type-OID caches ('cache lookup failed for
-        # type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh.
-        backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"'
+        backupbot.restore.post-hook: "/pg_backup.sh restore"
 
   plausible_events_db:
     image: clickhouse/clickhouse-server:23.4.2.11-alpine
@@ -85,24 +79,19 @@ services:
       - source: clickhouse_entrypoint
         target: /custom-entrypoint.sh
         mode: 0555
+      - source: clickhouse_backup
+        target: /clickhouse_backup.sh
+        mode: 0555
     networks:
       - internal
     deploy:
       labels:
         backupbot.backup: "true"
-        # v2 volumes-include syntax (see db service): snapshot only the clickhouse-backup output
-        # inside the event-data volume — not the live raw data files (restoring those under a
-        # running server is unsafe; the restore post-hook performs the logical restore instead).
         backupbot.backup.volumes.event-data.path: "backup/events"
-        # schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree
-        # data, so it backs up that table's SCHEMA but not its rows, and a restore would leave
-        # the migration ledger empty: the next app boot then re-runs every ClickHouse migration
-        # against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows
-        # into the backup dir alongside the clickhouse-backup output, and reload them on restore.
-        backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv'
-        backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
+        backupbot.backup.pre-hook: "/clickhouse_backup.sh backup"
+        backupbot.backup.post-hook: "/clickhouse_backup.sh backup_cleanup"
         backupbot.restore: "true"
-        backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv && rm -rf /var/lib/clickhouse/backup/events'
+        backupbot.restore.post-hook: "/clickhouse_backup.sh restore"
 
 volumes:
   db-data:
@@ -123,3 +112,9 @@ configs:
   clickhouse_entrypoint:
     name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION}
     file: entrypoint.clickhouse.sh
+  pg_backup:
+    name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION}
+    file: pg_backup.sh
+  clickhouse_backup:
+    name: ${STACK_NAME}_clickhouse_backup_${CLICKHOUSE_BACKUP_SCRIPT_VERSION}
+    file: clickhouse_backup.sh
diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh
index c60f48d..5af4d4f 100644
--- a/entrypoint.clickhouse.sh
+++ b/entrypoint.clickhouse.sh
@@ -1,22 +1,9 @@
 #!/bin/bash
-# clickhouse-backup powers this recipe's backup/restore (the backupbot pre/post-hooks run
-# `clickhouse-backup create/restore`). A deploy without it would have silently broken backups,
-# so if it truly cannot be installed the deploy must FAIL LOUDLY rather than degrade.
-#
-# The published recipe fetched it with a single silenced no-retry wget at every container start:
-# any transient GitHub failure exited the container (set -e) before clickhouse-server started,
-# and the swarm restart loop re-downloaded the 22 MB asset on every restart, amplifying a
-# throttle into a permanent crash-loop and a deploy timeout (cc-ci Q4.7).
-#
-# Hardening (no behaviour change when the fetch succeeds first try):
-# - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version, so it is
-#   downloaded at most once per app and container restarts never re-fetch;
-# - canonical Altinity URL (the project moved; the old AlexAkulov path is just a redirect);
-# - bounded retries with backoff + a read timeout, so a stalled connection cannot hang the
-#   deploy and a transient failure cannot kill it;
-# - verify the binary actually executes before trusting or caching it (catches truncated
-#   downloads and a corrupt cache);
-# - un-silenced: every attempt and the final verdict are visible in `docker service logs`.
+# Install clickhouse-backup (powers this recipe's backup/restore hooks) before starting the
+# server. The binary is cached on the persistent volume keyed by version (downloaded at most
+# once per app) and fetched with bounded retries + a read timeout; the binary is verified before
+# being trusted or cached. If it truly cannot be installed the deploy fails loudly rather than
+# silently shipping broken backups.
 
 set -e
 
diff --git a/pg_backup.sh b/pg_backup.sh
new file mode 100644
index 0000000..84f5451
--- /dev/null
+++ b/pg_backup.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+set -e
+
+# Backup/restore hooks for the db (postgres) service, run by backup-bot-two as
+# `/pg_backup.sh <backup|backup_cleanup|restore>`. Uses POSTGRES_USER and POSTGRES_DB
+# from the container environment.
+#
+# The dump lives at the db-data volume root: backup-bot-two v2 snapshots paths inside
+# named volumes (backupbot.backup.volumes.db-data.path), not the container root fs.
+DUMP=/var/lib/postgresql/data/postgres.dump
+
+# Pre-hook: write a gzipped custom-format dump to $DUMP.gz.
+backup() {
+  # Dump to a file, then compress. With `pg_dump | gzip` and no pipefail (not portable
+  # in /bin/sh) a failed pg_dump is masked by gzip's exit status, and a truncated dump
+  # would be snapshotted as if it were a good backup.
+  pg_dump -U "$POSTGRES_USER" -Fc -f "$DUMP" "$POSTGRES_DB"
+  gzip -f "$DUMP"
+}
+
+# Post-hook: remove the transient dump, including a partial one from a failed backup.
+backup_cleanup() {
+  rm -f "$DUMP" "$DUMP.gz"
+}
+
+# Restore post-hook: load the snapshotted dump into the live database.
+restore() {
+  # -f: overwrite a stale $DUMP left by an earlier failed restore rather than refusing.
+  gzip -df "$DUMP.gz"
+  # --if-exists: otherwise DROPs on objects absent from the live db error out and
+  # pg_restore exits 1, killing the chain and leaving the dump behind.
+  pg_restore --clean --if-exists -U "$POSTGRES_USER" --dbname="$POSTGRES_DB" < "$DUMP"
+  rm -f "$DUMP"
+  # pg_restore --clean recreates objects under the live app, so its pooled connections
+  # keep stale type-OID caches ('cache lookup failed for type ...' crash loops, e.g.
+  # Oban). Terminate them so Ecto reconnects fresh.
+  psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"
+}
+
+# Dispatch explicitly: a bare "$@" runs nothing and exits 0 when the hook passes no
+# argument, which would make a broken hook label look like a successful run.
+case "${1:-}" in
+  backup | backup_cleanup | restore) "$1" ;;
+  *) echo "usage: $0 {backup|backup_cleanup|restore}" >&2; exit 2 ;;
+esac