diff --git a/compose.yml b/compose.yml index d8cb158..3c1a68a 100644 --- a/compose.yml +++ b/compose.yml @@ -26,7 +26,10 @@ services: - internal deploy: restart_policy: - condition: on-failure + # `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore), + # the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with + # `on-failure`, swarm marks the task Complete and never restarts it, leaving the app down. + condition: any labels: - "traefik.enable=true" - "traefik.http.services.${STACK_NAME}.loadbalancer.server.port=8000" @@ -62,7 +65,12 @@ services: backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz' backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz" backupbot.restore: "true" - backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump' + # --if-exists: without it the DROP statements fail on objects absent from the live db and + # pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind). + # pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live + # app, so its pooled connections keep stale type-OID caches ('cache lookup failed for + # type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh. 
+ backupbot.restore.post-hook: sh -c 'gzip -df /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"'  # gzip -f: overwrite a postgres.dump left behind by an earlier failed restore (plain -d refuses to overwrite and would abort the chain at its first step)
 plausible_events_db: image: clickhouse/clickhouse-server:23.4.2.11-alpine @@ -86,10 +94,15 @@ services: # inside the event-data volume — not the live raw data files (restoring those under a # running server is unsafe; the restore post-hook performs the logical restore instead). backupbot.backup.volumes.event-data.path: "backup/events" - backupbot.backup.pre-hook: clickhouse-backup create events + # schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree + # data, so it backs up that table's SCHEMA but not its rows, and a restore would leave + # the migration ledger empty: the next app boot then re-runs every ClickHouse migration + # against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows + # into the backup dir alongside the clickhouse-backup output, and reload them on restore. 
+ backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv' backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events" backupbot.restore: "true" - backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events + backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && if [ -f /var/lib/clickhouse/backup/events/schema_migrations.tsv ]; then clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv; else echo "schema_migrations.tsv not found in backup - migration ledger not reloaded" >&2; fi && rm -rf /var/lib/clickhouse/backup/events'  # [ -f ] guard: a backup taken before the TSV export existed has no schema_migrations.tsv; warn and still clean up backup/events instead of aborting after the TRUNCATE
 volumes: db-data: