All checks were successful
cc-ci/testme cc-ci: success
Three independent bugs made `abra app restore` leave the stack broken:
1. ClickHouse: schema_migrations is a TinyLog table and clickhouse-backup
can only FREEZE MergeTree data - it backed up the table schema but
not its rows, so a restore emptied the migration ledger. The next app
boot re-ran every IngestRepo migration against the fully-built tables
and crash-looped (DUPLICATE_COLUMN: utm_medium) - the post-restore 502
in CI build 237. Fix: export the ledger to TSV into the backup dir
(rides in the snapshotted backup/events path) in the backup pre-hook,
reload it in the restore post-hook.
2. App restart policy: condition was on-failure, but when postgres is
disrupted under the app the BEAM supervision tree escalates and Erlang
exits GRACEFULLY (status 0) - swarm marks the task Complete and never
restarts it (reproduced: app stranded at 0/1). Fix: condition any.
3. pg_restore: --clean without --if-exists exits 1 when a dropped object
is absent ("errors ignored"), killing the && chain and leaving the
dump behind. Fix: --if-exists, plus pg_terminate_backend afterwards so
the app's pooled connections reconnect against the recreated objects.
Validated on a dev deploy: marker + truncated ClickHouse events both
return on restore, migration ledger intact (17 rows), post-restore event
ingestion for a new site works, and an app reboot after restore migrates
cleanly. Known cosmetic caveat: until the app is restarted, its Postgrex
type cache holds stale OIDs and background Oban jobs log "cache lookup
failed for type" - ingestion and serving are unaffected; an operator
restart after a restore clears it.
126 lines
6.2 KiB
YAML
126 lines
6.2 KiB
YAML
---
# Compose file format version; quoted so it stays a string rather than the float 3.8.
version: "3.8"

services:
  # Plausible Analytics web application.
  app:
    image: plausible/analytics:v2.0.0
    # Creates the database if needed and applies migrations before starting the server.
    command: sh -c "sleep 10 && /entrypoint.sh db createdb && /entrypoint.sh db migrate && /entrypoint.sh run"
    # NOTE(review): `docker stack deploy` is documented to ignore depends_on; the `sleep 10`
    # above presumably covers startup ordering instead — confirm.
    depends_on:
      - db
      - plausible_events_db
    environment:
      - BASE_URL=https://$DOMAIN
      - SECRET_KEY_BASE
      - DATABASE_URL=postgres://plausible:plausible@${STACK_NAME}_db:5432/plausible
      - CLICKHOUSE_DATABASE_URL=http://${STACK_NAME}_plausible_events_db:8123/plausible_events_db
      - SMTP_HOST_ADDR
      - MAILER_EMAIL
      - SMTP_HOST_PORT
      - SMTP_USER_NAME
      - SMTP_USER_PWD
      - SMTP_HOST_SSL_ENABLED
      - DISABLE_REGISTRATION
      - DISABLE_AUTH
    networks:
      - proxy
      - internal
    deploy:
      restart_policy:
        # `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore),
        # the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with
        # on-failure swarm marks the task Complete and never restarts it, leaving the app down.
        condition: any
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}.loadbalancer.server.port=8000"
        - "traefik.http.routers.${STACK_NAME}.rule=Host(`${DOMAIN}`${EXTRA_DOMAINS})"
        - "traefik.http.routers.${STACK_NAME}.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "coop-cloud.${STACK_NAME}.version=3.1.0+v2.0.0"

  # PostgreSQL database holding the application's relational data.
  db:
    image: pgautoupgrade/pgautoupgrade:18-alpine
    volumes:
      - db-data:/var/lib/postgresql/data
    environment:
      # pin legacy PGDATA so the existing cluster on the volume is upgraded in place, not re-init'd
      - PGDATA=/var/lib/postgresql/data
      - POSTGRES_USER=plausible
      - POSTGRES_PASSWORD=plausible
      - POSTGRES_DB=plausible
    networks:
      - internal
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U plausible -d plausible"]
      interval: 5s
      timeout: 5s
      retries: 60
    deploy:
      labels:
        backupbot.backup: "true"
        # backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes.<vol>.path,
        # relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump
        # written to the container root fs never reaches the snapshot, so restore finds nothing.
        # The dump therefore lives (transiently — the hooks remove it) at the db-data volume root.
        backupbot.backup.volumes.db-data.path: "postgres.dump.gz"
        # Dump to a file first, then gzip it, chained with `&&`: in a `pg_dump | gzip` pipeline
        # the exit status is gzip's, so a failed pg_dump would still "succeed" and an empty or
        # truncated archive would be snapshotted as a good backup. `gzip -f` overwrites a stale
        # postgres.dump.gz left by an earlier interrupted run.
        backupbot.backup.pre-hook: >-
          sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc -f /var/lib/postgresql/data/postgres.dump "$$POSTGRES_DB"
          && gzip -f /var/lib/postgresql/data/postgres.dump'
        # Also remove the uncompressed file, which is only present if the pre-hook failed midway.
        backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump /var/lib/postgresql/data/postgres.dump.gz"
        backupbot.restore: "true"
        # gzip -f: without it gzip refuses to overwrite a postgres.dump left behind by an earlier
        # failed restore, which would block every subsequent restore.
        # --if-exists: without it the DROPs error on objects absent from the live db and
        # pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind).
        # pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live
        # app, so its pooled connections must be dropped to reconnect against the recreated
        # objects. Known caveat: until the app itself is restarted, background Oban jobs may
        # still log 'cache lookup failed for type ...' from a stale Postgrex type cache;
        # an operator restart of the app after a restore clears it.
        backupbot.restore.post-hook: >-
          sh -c 'gzip -df /var/lib/postgresql/data/postgres.dump.gz
          && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump
          && rm -f /var/lib/postgresql/data/postgres.dump
          && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"'

  # ClickHouse server storing the analytics events.
  plausible_events_db:
    image: clickhouse/clickhouse-server:23.4.2.11-alpine
    volumes:
      - event-data:/var/lib/clickhouse
    entrypoint: /custom-entrypoint.sh
    configs:
      - source: clickhouse-config
        target: /etc/clickhouse-server/config.d/logging.xml
      - source: clickhouse-user-config
        target: /etc/clickhouse-server/users.d/clickhouse-user-config.xml
      - source: clickhouse_entrypoint
        target: /custom-entrypoint.sh
        # Unquoted on purpose: a numeric file mode written in octal (r-xr-xr-x) so the
        # entrypoint script is executable.
        mode: 0555
    networks:
      - internal
    deploy:
      labels:
        backupbot.backup: "true"
        # v2 volumes-include syntax (same as the db service's db-data label): snapshot only the
        # clickhouse-backup output inside the event-data volume — not the live raw data files
        # (restoring those under a running server is unsafe; the restore post-hook performs the
        # logical restore instead).
        backupbot.backup.volumes.event-data.path: "backup/events"
        # schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree
        # data, so it backs up that table's SCHEMA but not its rows, and a restore would leave
        # the migration ledger empty: the next app boot then re-runs every ClickHouse migration
        # against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows
        # into the backup dir alongside the clickhouse-backup output, and reload them on restore.
        backupbot.backup.pre-hook: >-
          sh -c 'clickhouse-backup create events
          && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv'
        backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
        backupbot.restore: "true"
        # Logical restore, then replace the (empty) migration ledger with the exported rows,
        # then drop the transient backup directory.
        backupbot.restore.post-hook: >-
          sh -c 'clickhouse-backup restore --rm events
          && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations"
          && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv
          && rm -rf /var/lib/clickhouse/backup/events'

# Named volumes with default settings.
volumes:
  # PostgreSQL data directory (mounted at /var/lib/postgresql/data in the db service).
  db-data:
  # ClickHouse data directory (mounted at /var/lib/clickhouse in plausible_events_db).
  event-data:

networks:
  # Pre-existing network created outside this stack; only the app service joins it.
  proxy:
    external: true
  # Stack-private network shared by the app and both databases.
  internal:

# Config objects mounted into the plausible_events_db service. Each name embeds a
# *_VERSION variable — presumably so a changed file is rolled out by bumping that
# variable; confirm against the recipe's conventions.
configs:
  # Mounted as /etc/clickhouse-server/config.d/logging.xml.
  clickhouse-config:
    name: "${STACK_NAME}_clickhouse_config_${CLICKHOUSE_CONF_VERSION}"
    file: clickhouse-config.xml
  # Mounted as /etc/clickhouse-server/users.d/clickhouse-user-config.xml.
  clickhouse-user-config:
    name: "${STACK_NAME}_clickhouse_user_config_${CLICKHOUSE_USER_CONF_VERSION}"
    file: clickhouse-user-config.xml
  # Mounted as /custom-entrypoint.sh and used as the service entrypoint.
  clickhouse_entrypoint:
    name: "${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION}"
    file: entrypoint.clickhouse.sh