From 50a3715caa89ad6b7a28652920c08d6e33d97ef8 Mon Sep 17 00:00:00 2001 From: notplants <@notplants> Date: Tue, 9 Jun 2026 15:46:28 +0000 Subject: [PATCH 1/6] chore: upgrade to 3.1.0+v2.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor bump — no operator action required (Postgres/ClickHouse changes are automatic). - Postgres: use pgautoupgrade/pgautoupgrade:18-alpine in place of the custom pg_upgrade entrypoint. The existing cluster is upgraded in place automatically on deploy; PGDATA pinned to the legacy path; adds a pg_isready healthcheck. Removes entrypoint.postgres.sh.tmpl and DB_ENTRYPOINT_VERSION. - ClickHouse backup fetch: cache the clickhouse-backup binary on the persistent volume and retry with backoff to avoid the download crash-loop. The tool is required — if it can't be installed after retries the entrypoint aborts and the server does not start, rather than coming up without backup/restore. - Add CLICKHOUSE_DATABASE_URL; bump the clickhouse entrypoint config version. - Remove a stray broken link reference in the README. 
--- README.md | 1 - abra.sh | 3 +-- compose.yml | 22 ++++++++---------- entrypoint.clickhouse.sh | 46 +++++++++++++++++++++++++++++-------- entrypoint.postgres.sh.tmpl | 44 ----------------------------------- 5 files changed, 48 insertions(+), 68 deletions(-) delete mode 100644 entrypoint.postgres.sh.tmpl diff --git a/README.md b/README.md index b3f8f53..b7b4e28 100644 --- a/README.md +++ b/README.md @@ -26,4 +26,3 @@ [`abra`]: https://git.coopcloud.tech/coop-cloud/abra [`coop-cloud/traefik`]: https://git.coopcloud.tech/coop-cloud/traefik -p-cloud/traefik diff --git a/abra.sh b/abra.sh index ea81aaa..1a82402 100644 --- a/abra.sh +++ b/abra.sh @@ -1,4 +1,3 @@ export CLICKHOUSE_CONF_VERSION=v2 export CLICKHOUSE_USER_CONF_VERSION=v2 -export DB_ENTRYPOINT_VERSION=v1 -export CLICKHOUSE_ENTRYPOINT_VERSION=v2 +export CLICKHOUSE_ENTRYPOINT_VERSION=v3 diff --git a/compose.yml b/compose.yml index d20575c..bdb4c13 100644 --- a/compose.yml +++ b/compose.yml @@ -12,6 +12,7 @@ services: - BASE_URL=https://$DOMAIN - SECRET_KEY_BASE - DATABASE_URL=postgres://plausible:plausible@${STACK_NAME}_db:5432/plausible + - CLICKHOUSE_DATABASE_URL=http://${STACK_NAME}_plausible_events_db:8123/plausible_events_db - SMTP_HOST_ADDR - MAILER_EMAIL - SMTP_HOST_PORT @@ -32,23 +33,24 @@ services: - "traefik.http.routers.${STACK_NAME}.rule=Host(`${DOMAIN}`${EXTRA_DOMAINS})" - "traefik.http.routers.${STACK_NAME}.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}.tls.certresolver=${LETS_ENCRYPT_ENV}" - - coop-cloud.${STACK_NAME}.version=3.0.1+v2.0.0 + - coop-cloud.${STACK_NAME}.version=3.1.0+v2.0.0 db: - image: postgres:13.12 - configs: - - source: db_entrypoint - target: /docker-entrypoint.sh - mode: 0555 - # Custom docker entrypoint to handle major Postgres version upgrades + image: pgautoupgrade/pgautoupgrade:18-alpine volumes: - db-data:/var/lib/postgresql/data - entrypoint: /docker-entrypoint.sh environment: + # pin legacy PGDATA so the existing cluster on the volume is upgraded in 
place, not re-init'd + - PGDATA=/var/lib/postgresql/data - POSTGRES_USER=plausible - POSTGRES_PASSWORD=plausible - POSTGRES_DB=plausible networks: - internal + healthcheck: + test: ["CMD-SHELL", "pg_isready -U plausible -d plausible"] + interval: 5s + timeout: 5s + retries: 60 deploy: labels: backupbot.backup: "true" @@ -98,10 +100,6 @@ configs: clickhouse-user-config: name: ${STACK_NAME}_clickhouse_user_config_${CLICKHOUSE_USER_CONF_VERSION} file: clickhouse-user-config.xml - db_entrypoint: - name: ${STACK_NAME}_db_entrypoint_${DB_ENTRYPOINT_VERSION} - file: entrypoint.postgres.sh.tmpl - template_driver: golang clickhouse_entrypoint: name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION} file: entrypoint.clickhouse.sh diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh index 48f7ac7..8cebd82 100644 --- a/entrypoint.clickhouse.sh +++ b/entrypoint.clickhouse.sh @@ -1,6 +1,14 @@ #!/bin/bash -set -ex +# clickhouse-backup is a backup tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`). +# It is a 22 MB GitHub download (rate-limit / network), which can fail to download, and lead to crash loop and download throttling. 
+# +# to make the download smoother: +# - cache the binary on the persistent clickhouse data volume (/var/lib/clickhouse) so it is fetched +# at most once and reused on every container restart (no re-download amplification); +# - retry with backoff to ride out transient GitHub failures + +set -e CLICKHOUSE_BACKUP_VERSION=2.4.2 @@ -17,13 +25,33 @@ elif [[ $ARCH =~ "x86_64" ]]; then ARCH="amd64" fi -wget \ - --quiet \ - --continue \ - --no-clobber \ - --output-document=/tmp/clickhouse-backup.tar.gz \ - "https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz" 2>/dev/null +CACHE_DIR=/var/lib/clickhouse/.ccci-bin +CACHED="${CACHE_DIR}/clickhouse-backup" +BIN=/usr/local/bin/clickhouse-backup +URL="https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz" -tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3 +install_clickhouse_backup() { + mkdir -p "$CACHE_DIR" + if [ -x "$CACHED" ]; then + cp -f "$CACHED" "$BIN" + echo "clickhouse-backup: restored from persistent cache ($CACHED)" + return 0 + fi + for attempt in 1 2 3 4 5; do + if wget --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \ + && tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3; then + cp -f "$BIN" "$CACHED" 2>/dev/null || true + echo "clickhouse-backup: downloaded + cached (attempt ${attempt})" + return 0 + fi + echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2 + sleep $((attempt * 10)) + done + echo "clickhouse-backup: fetch FAILED after all retries — aborting; clickhouse-server will NOT start (backup tool is required)" >&2 + return 1 +} -/entrypoint.sh +#if the backup tool cannot be installed after retries, it aborts (set -e) so the deploy fails +install_clickhouse_backup + +exec /entrypoint.sh diff --git 
a/entrypoint.postgres.sh.tmpl b/entrypoint.postgres.sh.tmpl deleted file mode 100644 index 8ecc4fe..0000000 --- a/entrypoint.postgres.sh.tmpl +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -set -e - -MIGRATION_MARKER=$PGDATA/migration_in_progress -OLDDATA=$PGDATA/old_data -NEWDATA=$PGDATA/new_data - -if [ -e $MIGRATION_MARKER ]; then - echo "FATAL: migration was started but did not complete in a previous run. manual recovery necessary" - exit 1 -fi - -if [ -f $PGDATA/PG_VERSION ]; then - DATA_VERSION=$(cat $PGDATA/PG_VERSION) - - if [ -n "$DATA_VERSION" -a "$PG_MAJOR" != "$DATA_VERSION" ]; then - echo "postgres data version $DATA_VERSION found, but need $PG_MAJOR. Starting migration" - echo "Installing postgres $DATA_VERSION" - sed -i "s/$/ $DATA_VERSION/" /etc/apt/sources.list.d/pgdg.list - apt-get update && apt-get install -y --no-install-recommends \ - postgresql-$DATA_VERSION \ - && rm -rf /var/lib/apt/lists/* - echo "shuffling around" - gosu postgres mkdir $OLDDATA $NEWDATA - chmod 700 $OLDDATA $NEWDATA - mv $PGDATA/* $OLDDATA/ || true - touch $MIGRATION_MARKER - echo "running initdb" - # abuse entrypoint script for initdb by making server error out - gosu postgres bash -c "export PGDATA=$NEWDATA ; /usr/local/bin/docker-entrypoint.sh --invalid-arg || true" - echo "running pg_upgrade" - cd /tmp - gosu postgres pg_upgrade --link -b /usr/lib/postgresql/$DATA_VERSION/bin -d $OLDDATA -D $NEWDATA -U $POSTGRES_USER - cp $OLDDATA/pg_hba.conf $NEWDATA/ - mv $NEWDATA/* $PGDATA - rm -rf $OLDDATA - rmdir $NEWDATA - rm $MIGRATION_MARKER - echo "migration complete" - fi -fi - -/usr/local/bin/docker-entrypoint.sh postgres -- 2.49.0 From b90a8c42392566a106124115e113760590ca005f Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Tue, 9 Jun 2026 18:30:18 +0000 Subject: [PATCH 2/6] fix: clickhouse entrypoint - backup download is best-effort (server must start regardless) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous 
entrypoint treated clickhouse-backup as required: a download failure (rate-limit or transient network) caused install_clickhouse_backup to return 1 which with set -e exited the entrypoint before /entrypoint.sh ran. ClickHouse never started, the swarm restarted it, the download was retried, amplifying the throttle -> crash-loop -> deploy timeout (cc-ci Q4.7b). Fix: install_clickhouse_backup || true — the server starts even if the backup tool cannot be fetched. Backup/restore degrades until a later restart fetches it. Also: fix stray trailing quote in backupbot.restore.post-hook; bump CLICKHOUSE_ENTRYPOINT_VERSION v3->v4 (config content changed). --- abra.sh | 2 +- compose.yml | 2 +- entrypoint.clickhouse.sh | 23 ++++++++++++++--------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/abra.sh b/abra.sh index 1a82402..f9a789f 100644 --- a/abra.sh +++ b/abra.sh @@ -1,3 +1,3 @@ export CLICKHOUSE_CONF_VERSION=v2 export CLICKHOUSE_USER_CONF_VERSION=v2 -export CLICKHOUSE_ENTRYPOINT_VERSION=v3 +export CLICKHOUSE_ENTRYPOINT_VERSION=v4 diff --git a/compose.yml b/compose.yml index bdb4c13..a1bb359 100644 --- a/compose.yml +++ b/compose.yml @@ -82,7 +82,7 @@ services: backupbot.backup.path: "/var/lib/clickhouse/backup/events" backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events" backupbot.restore: "true" - backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events" + backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events volumes: db-data: diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh index 8cebd82..861cdbc 100644 --- a/entrypoint.clickhouse.sh +++ b/entrypoint.clickhouse.sh @@ -1,12 +1,17 @@ #!/bin/bash - -# clickhouse-backup is a backup tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`). 
-# It is a 22 MB GitHub download (rate-limit / network), which can fail to download, and lead to crash loop and download throttling. +# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`). +# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it +# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the +# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started -> swarm +# restarted it -> re-downloaded -> amplified the throttle -> crash-loop -> deploy timeout (cc-ci Q4.7). # -# to make the download smoother: -# - cache the binary on the persistent clickhouse data volume (/var/lib/clickhouse) so it is fetched +# Hardening (no behaviour change when the download succeeds first try): +# - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse) so it is fetched # at most once and reused on every container restart (no re-download amplification); -# - retry with backoff to ride out transient GitHub failures +# - retry with backoff; +# - NEVER let a download failure block the server start (best-effort: the server comes up, backup/ +# restore degrade until the next successful fetch); +# - un-silenced so a failure is diagnosable in `docker service logs`. 
set -e @@ -47,11 +52,11 @@ install_clickhouse_backup() { echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2 sleep $((attempt * 10)) done - echo "clickhouse-backup: fetch FAILED after all retries — aborting; clickhouse-server will NOT start (backup tool is required)" >&2 + echo "clickhouse-backup: fetch FAILED after retries — starting clickhouse-server WITHOUT the backup tool (backup/restore unavailable until a later restart fetches it)" >&2 return 1 } -#if the backup tool cannot be installed after retries, it aborts (set -e) so the deploy fails -install_clickhouse_backup +# Best-effort: the server MUST start even if the backup-tool fetch fails (it is not a server dependency). +install_clickhouse_backup || true exec /entrypoint.sh -- 2.49.0 From 9f8bcbc9e302e13ec0ecb7f01bab1b823722449d Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Tue, 9 Jun 2026 19:09:13 +0000 Subject: [PATCH 3/6] fix: clickhouse-backup install must succeed loudly, never silently degrade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the previous best-effort (|| true) approach: a deploy without clickhouse-backup would have silently broken backup/restore, so the entrypoint now hard-fails (visibly, in service logs) if the tool truly cannot be installed — but makes that case effectively unreachable: - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version: downloaded at most once per app; container restarts never re-fetch (kills the re-download amplification that turned a GitHub throttle into a permanent crash-loop) - canonical Altinity release URL (project moved; old path is a redirect) - bounded retries with backoff + wget read timeout (a stalled connection can no longer hang the deploy) - verify the binary executes before trusting or caching it (catches truncated downloads and a corrupt cache) - compose: fix app depends_on to the real service name (plausible_events_db) — 
docker compose config was failing on it, which disabled CI image prepull and pushed pulls into the deploy window - bump CLICKHOUSE_ENTRYPOINT_VERSION v4 -> v5 (swarm configs immutable) Verified on a dev deploy: fresh download path, cached-restart path, clickhouse-backup create/list/delete, and /api/health all green. --- abra.sh | 2 +- compose.yml | 2 +- entrypoint.clickhouse.sh | 58 +++++++++++++++++++++++----------------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/abra.sh b/abra.sh index f9a789f..9e41df0 100644 --- a/abra.sh +++ b/abra.sh @@ -1,3 +1,3 @@ export CLICKHOUSE_CONF_VERSION=v2 export CLICKHOUSE_USER_CONF_VERSION=v2 -export CLICKHOUSE_ENTRYPOINT_VERSION=v4 +export CLICKHOUSE_ENTRYPOINT_VERSION=v5 diff --git a/compose.yml b/compose.yml index a1bb359..037d79e 100644 --- a/compose.yml +++ b/compose.yml @@ -7,7 +7,7 @@ services: command: sh -c "sleep 10 && /entrypoint.sh db createdb && /entrypoint.sh db migrate && /entrypoint.sh run" depends_on: - db - - events_db + - plausible_events_db environment: - BASE_URL=https://$DOMAIN - SECRET_KEY_BASE diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh index 861cdbc..c60f48d 100644 --- a/entrypoint.clickhouse.sh +++ b/entrypoint.clickhouse.sh @@ -1,17 +1,22 @@ #!/bin/bash -# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`). -# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it -# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the -# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started -> swarm -# restarted it -> re-downloaded -> amplified the throttle -> crash-loop -> deploy timeout (cc-ci Q4.7). +# clickhouse-backup powers this recipe's backup/restore (the backupbot pre/post-hooks run +# `clickhouse-backup create/restore`). 
A deploy without it would have silently broken backups, +# so if it truly cannot be installed the deploy must FAIL LOUDLY rather than degrade. # -# Hardening (no behaviour change when the download succeeds first try): -# - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse) so it is fetched -# at most once and reused on every container restart (no re-download amplification); -# - retry with backoff; -# - NEVER let a download failure block the server start (best-effort: the server comes up, backup/ -# restore degrade until the next successful fetch); -# - un-silenced so a failure is diagnosable in `docker service logs`. +# The published recipe fetched it with a single silenced no-retry wget at every container start: +# any transient GitHub failure exited the container (set -e) before clickhouse-server started, +# and the swarm restart loop re-downloaded the 22 MB asset on every restart, amplifying a +# throttle into a permanent crash-loop and a deploy timeout (cc-ci Q4.7). +# +# Hardening (no behaviour change when the fetch succeeds first try): +# - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version, so it is +# downloaded at most once per app and container restarts never re-fetch; +# - canonical Altinity URL (the project moved; the old AlexAkulov path is just a redirect); +# - bounded retries with backoff + a read timeout, so a stalled connection cannot hang the +# deploy and a transient failure cannot kill it; +# - verify the binary actually executes before trusting or caching it (catches truncated +# downloads and a corrupt cache); +# - un-silenced: every attempt and the final verdict are visible in `docker service logs`. 
 set -e @@ -31,32 +36,37 @@ elif [[ $ARCH =~ "x86_64" ]]; then fi CACHE_DIR=/var/lib/clickhouse/.ccci-bin -CACHED="${CACHE_DIR}/clickhouse-backup" +CACHED="${CACHE_DIR}/clickhouse-backup-v${CLICKHOUSE_BACKUP_VERSION}" BIN=/usr/local/bin/clickhouse-backup -URL="https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz" +URL="https://github.com/Altinity/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz" + +binary_ok() { + "$1" --version >/dev/null 2>&1 +} install_clickhouse_backup() { mkdir -p "$CACHE_DIR" - if [ -x "$CACHED" ]; then + if [ -x "$CACHED" ] && binary_ok "$CACHED"; then cp -f "$CACHED" "$BIN" - echo "clickhouse-backup: restored from persistent cache ($CACHED)" + echo "clickhouse-backup: using verified cached binary ($CACHED)" return 0 fi + rm -f "$CACHED" # absent or fails to execute — re-fetch for attempt in 1 2 3 4 5; do - if wget --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \ - && tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3; then + if wget -T 30 --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \ + && tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3 \ + && binary_ok "$BIN"; then cp -f "$BIN" "$CACHED" 2>/dev/null || true - echo "clickhouse-backup: downloaded + cached (attempt ${attempt})" + echo "clickhouse-backup: downloaded, verified + cached (attempt ${attempt})" return 0 fi - echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2 - sleep $((attempt * 10)) + echo "clickhouse-backup: fetch attempt ${attempt}/5 failed" >&2 + [ "$attempt" -lt 5 ] && sleep $((attempt * 10)) done - echo "clickhouse-backup: fetch FAILED after retries — starting clickhouse-server WITHOUT the backup tool (backup/restore unavailable until a later restart fetches it)" >&2 + echo 
"clickhouse-backup: could not install after 5 attempts — failing the deploy (without it backup/restore would be silently broken)" >&2 return 1 } -# Best-effort: the server MUST start even if the backup-tool fetch fails (it is not a server dependency). -install_clickhouse_backup || true +install_clickhouse_backup exec /entrypoint.sh -- 2.49.0 From 4cab6b5146594ebd1425aa1b0aece363d9b31833 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Tue, 9 Jun 2026 21:53:18 +0000 Subject: [PATCH 4/6] fix: backup labels to backup-bot-two v2 volume syntax (restore was a no-op) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit backup-bot-two 2.4.0 snapshots paths INSIDE named volumes (backupbot.backup.volumes.<volume>.path, relative to the volume root) and IGNORES the old backupbot.backup.path label. The db pre-hook wrote /postgres.dump.gz to the container's ephemeral root fs — outside every volume — so the dump never reached the snapshot and the restore post-hook failed on a missing file (gzip: /postgres.dump.gz: No such file). - db: dump into the db-data volume (transient; hooks remove it) and snapshot only that file via backupbot.backup.volumes.db-data.path — same pattern as keycloak, which passes backup/restore on this CI. Also use $POSTGRES_DB in the restore hook: the previous $PLAUSIBLE_DB is defined nowhere and only connected via libpq's username fallback. - clickhouse: snapshot only backup/events (the clickhouse-backup output) inside the event-data volume instead of the whole volume — restoring raw data files under a running server is unsafe; the post-hook performs the logical restore. 
--- compose.yml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/compose.yml b/compose.yml index 037d79e..d8cb158 100644 --- a/compose.yml +++ b/compose.yml @@ -54,11 +54,15 @@ services: deploy: labels: backupbot.backup: "true" - backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > "/postgres.dump.gz"' - backupbot.backup.path: "/postgres.dump.gz" - backupbot.backup.post-hook: "rm -f /postgres.dump.gz" + # backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes..path, + # relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump + # written to the container root fs never reaches the snapshot, so restore finds nothing. + # The dump therefore lives (transiently — the hooks remove it) at the db-data volume root. + backupbot.backup.volumes.db-data.path: "postgres.dump.gz" + backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz' + backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz" backupbot.restore: "true" - backupbot.restore.post-hook: sh -c 'gzip -d /postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$PLAUSIBLE_DB" < /postgres.dump && rm -f /postgres.dump' + backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump' plausible_events_db: image: clickhouse/clickhouse-server:23.4.2.11-alpine @@ -78,8 +82,11 @@ services: deploy: labels: backupbot.backup: "true" + # v2 volumes-include syntax (see db service): snapshot only the clickhouse-backup output + # inside the event-data volume — not the live raw data files (restoring those under a + # running server is unsafe; the restore post-hook performs the logical restore instead). 
+ backupbot.backup.volumes.event-data.path: "backup/events" backupbot.backup.pre-hook: clickhouse-backup create events - backupbot.backup.path: "/var/lib/clickhouse/backup/events" backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events" backupbot.restore: "true" backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events -- 2.49.0 From 270c8404ce481fda2aaac870fd0c26262dac8f25 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Tue, 9 Jun 2026 23:01:24 +0000 Subject: [PATCH 5/6] fix: make restore correct under a live app (CI restore + custom tiers) Three independent bugs made `abra app restore` leave the stack broken: 1. ClickHouse: schema_migrations is a TinyLog table and clickhouse-backup can only FREEZE MergeTree data - it backed up the table schema but not its rows, so a restore emptied the migration ledger. The next app boot re-ran every IngestRepo migration against the fully-built tables and crash-looped (DUPLICATE_COLUMN: utm_medium) - the post-restore 502 in CI build 237. Fix: export the ledger to TSV into the backup dir (rides in the snapshotted backup/events path) in the backup pre-hook, reload it in the restore post-hook. 2. App restart policy: condition was on-failure, but when postgres is disrupted under the app the BEAM supervision tree escalates and Erlang exits GRACEFULLY (status 0) - swarm marks the task Complete and never restarts it (reproduced: app stranded at 0/1). Fix: condition any. 3. pg_restore: --clean without --if-exists exits 1 when a dropped object is absent ("errors ignored"), killing the && chain and leaving the dump behind. Fix: --if-exists, plus pg_terminate_backend afterwards so the app's pooled connections reconnect against the recreated objects. Validated on a dev deploy: marker + truncated ClickHouse events both return on restore, migration ledger intact (17 rows), post-restore event ingestion for a new site works, and an app reboot after restore migrates cleanly. 
Known cosmetic caveat: until the app is restarted, its Postgrex type cache holds stale OIDs and background Oban jobs log "cache lookup failed for type" - ingestion and serving are unaffected; an operator restart after a restore clears it. --- compose.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/compose.yml b/compose.yml index d8cb158..3c1a68a 100644 --- a/compose.yml +++ b/compose.yml @@ -26,7 +26,10 @@ services: - internal deploy: restart_policy: - condition: on-failure + # `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore), + # the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with + # on-failure swarm marks the task Complete and never restarts it, leaving the app down. + condition: any labels: - "traefik.enable=true" - "traefik.http.services.${STACK_NAME}.loadbalancer.server.port=8000" @@ -62,7 +65,12 @@ services: backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz' backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz" backupbot.restore: "true" - backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump' + # --if-exists: without it the DROPs error on objects absent from the live db and + # pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind). + # pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live + # app, so its pooled connections keep stale type-OID caches ('cache lookup failed for + # type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh. 
+ backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"' plausible_events_db: image: clickhouse/clickhouse-server:23.4.2.11-alpine @@ -86,10 +94,15 @@ services: # inside the event-data volume — not the live raw data files (restoring those under a # running server is unsafe; the restore post-hook performs the logical restore instead). backupbot.backup.volumes.event-data.path: "backup/events" - backupbot.backup.pre-hook: clickhouse-backup create events + # schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree + # data, so it backs up that table's SCHEMA but not its rows, and a restore would leave + # the migration ledger empty: the next app boot then re-runs every ClickHouse migration + # against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows + # into the backup dir alongside the clickhouse-backup output, and reload them on restore. 
+ backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv' backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events" backupbot.restore: "true" - backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events + backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv && rm -rf /var/lib/clickhouse/backup/events' volumes: db-data: -- 2.49.0 From 13458fac56a19601267192acc769f3ba8fc55036 Mon Sep 17 00:00:00 2001 From: notplants <@notplants> Date: Wed, 10 Jun 2026 16:55:20 +0000 Subject: [PATCH 6/6] refactor: extract backup/restore into config scripts, trim comments Move the postgres and clickhouse backup/restore hook logic out of inline compose labels into dedicated pg_backup.sh / clickhouse_backup.sh config scripts (the pattern other recipes use), and trim the verbose explanatory comments down to the essential rationale, now living in the scripts. 
--- abra.sh | 4 +++- clickhouse_backup.sh | 30 +++++++++++++++++++++++++ compose.yml | 47 ++++++++++++++++++---------------------- entrypoint.clickhouse.sh | 23 +++++--------------- pg_backup.sh | 29 +++++++++++++++++++++++++ 5 files changed, 88 insertions(+), 45 deletions(-) create mode 100644 clickhouse_backup.sh create mode 100644 pg_backup.sh diff --git a/abra.sh b/abra.sh index 9e41df0..60974a7 100644 --- a/abra.sh +++ b/abra.sh @@ -1,3 +1,5 @@ export CLICKHOUSE_CONF_VERSION=v2 export CLICKHOUSE_USER_CONF_VERSION=v2 -export CLICKHOUSE_ENTRYPOINT_VERSION=v5 +export CLICKHOUSE_ENTRYPOINT_VERSION=v6 +export PG_BACKUP_VERSION=v1 +export CLICKHOUSE_BACKUP_SCRIPT_VERSION=v1 diff --git a/clickhouse_backup.sh b/clickhouse_backup.sh new file mode 100644 index 0000000..ac537b1 --- /dev/null +++ b/clickhouse_backup.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +# clickhouse-backup output lives inside the event-data volume (snapshotted via +# backupbot.backup.volumes.event-data.path). Restoring the raw data files under a +# running server is unsafe, so restore performs a logical restore instead. +BACKUP_DIR=/var/lib/clickhouse/backup/events +MIGRATIONS_TSV="$BACKUP_DIR/schema_migrations.tsv" + +backup() { + clickhouse-backup create events + # schema_migrations is a TinyLog table — clickhouse-backup only FREEZEs MergeTree + # data, so its rows aren't captured. Export them alongside the backup, else a restore + # leaves the ledger empty and the next boot re-runs every migration (DUPLICATE_COLUMN). 
+ clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > "$MIGRATIONS_TSV" +} + +backup_cleanup() { + rm -rf "$BACKUP_DIR" +} + +restore() { + clickhouse-backup restore --rm events + clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" + clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < "$MIGRATIONS_TSV" + rm -rf "$BACKUP_DIR" +} + +"$@" diff --git a/compose.yml b/compose.yml index 3c1a68a..ca1e152 100644 --- a/compose.yml +++ b/compose.yml @@ -26,9 +26,8 @@ services: - internal deploy: restart_policy: - # `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore), - # the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with - # on-failure swarm marks the task Complete and never restarts it, leaving the app down. + # `any`, not `on-failure`: a restore disrupts postgres under the app and Erlang then + # shuts down gracefully (exit 0), which on-failure treats as done and never restarts. condition: any labels: - "traefik.enable=true" @@ -54,23 +53,18 @@ services: interval: 5s timeout: 5s retries: 60 + configs: + - source: pg_backup + target: /pg_backup.sh + mode: 0555 deploy: labels: backupbot.backup: "true" - # backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes..path, - # relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump - # written to the container root fs never reaches the snapshot, so restore finds nothing. - # The dump therefore lives (transiently — the hooks remove it) at the db-data volume root. 
backupbot.backup.volumes.db-data.path: "postgres.dump.gz" - backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz' - backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz" + backupbot.backup.pre-hook: "/pg_backup.sh backup" + backupbot.backup.post-hook: "/pg_backup.sh backup_cleanup" backupbot.restore: "true" - # --if-exists: without it the DROPs error on objects absent from the live db and - # pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind). - # pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live - # app, so its pooled connections keep stale type-OID caches ('cache lookup failed for - # type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh. - backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"' + backupbot.restore.post-hook: "/pg_backup.sh restore" plausible_events_db: image: clickhouse/clickhouse-server:23.4.2.11-alpine @@ -85,24 +79,19 @@ services: - source: clickhouse_entrypoint target: /custom-entrypoint.sh mode: 0555 + - source: clickhouse_backup + target: /clickhouse_backup.sh + mode: 0555 networks: - internal deploy: labels: backupbot.backup: "true" - # v2 volumes-include syntax (see db service): snapshot only the clickhouse-backup output - # inside the event-data volume — not the live raw data files (restoring those under a - # running server is unsafe; the restore post-hook performs the logical restore instead). 
backupbot.backup.volumes.event-data.path: "backup/events" - # schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree - # data, so it backs up that table's SCHEMA but not its rows, and a restore would leave - # the migration ledger empty: the next app boot then re-runs every ClickHouse migration - # against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows - # into the backup dir alongside the clickhouse-backup output, and reload them on restore. - backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv' - backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events" + backupbot.backup.pre-hook: "/clickhouse_backup.sh backup" + backupbot.backup.post-hook: "/clickhouse_backup.sh backup_cleanup" backupbot.restore: "true" - backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv && rm -rf /var/lib/clickhouse/backup/events' + backupbot.restore.post-hook: "/clickhouse_backup.sh restore" volumes: db-data: @@ -123,3 +112,9 @@ configs: clickhouse_entrypoint: name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION} file: entrypoint.clickhouse.sh + pg_backup: + name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION} + file: pg_backup.sh + clickhouse_backup: + name: ${STACK_NAME}_clickhouse_backup_${CLICKHOUSE_BACKUP_SCRIPT_VERSION} + file: clickhouse_backup.sh diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh index c60f48d..5af4d4f 100644 --- a/entrypoint.clickhouse.sh +++ b/entrypoint.clickhouse.sh @@ -1,22 +1,9 @@ #!/bin/bash -# clickhouse-backup powers this recipe's 
backup/restore (the backupbot pre/post-hooks run -# `clickhouse-backup create/restore`). A deploy without it would have silently broken backups, -# so if it truly cannot be installed the deploy must FAIL LOUDLY rather than degrade. -# -# The published recipe fetched it with a single silenced no-retry wget at every container start: -# any transient GitHub failure exited the container (set -e) before clickhouse-server started, -# and the swarm restart loop re-downloaded the 22 MB asset on every restart, amplifying a -# throttle into a permanent crash-loop and a deploy timeout (cc-ci Q4.7). -# -# Hardening (no behaviour change when the fetch succeeds first try): -# - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version, so it is -# downloaded at most once per app and container restarts never re-fetch; -# - canonical Altinity URL (the project moved; the old AlexAkulov path is just a redirect); -# - bounded retries with backoff + a read timeout, so a stalled connection cannot hang the -# deploy and a transient failure cannot kill it; -# - verify the binary actually executes before trusting or caching it (catches truncated -# downloads and a corrupt cache); -# - un-silenced: every attempt and the final verdict are visible in `docker service logs`. +# Install clickhouse-backup (powers this recipe's backup/restore hooks) before starting the +# server. The binary is cached on the persistent volume keyed by version (downloaded at most +# once per app) and fetched with bounded retries + a read timeout; the binary is verified before +# being trusted or cached. If it truly cannot be installed the deploy fails loudly rather than +# silently shipping broken backups. 
set -e
diff --git a/pg_backup.sh b/pg_backup.sh
new file mode 100644
index 0000000..84f5451
--- /dev/null
+++ b/pg_backup.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+set -e
+
+# The dump lives at the db-data volume root: backup-bot-two v2 snapshots paths inside
+# named volumes (backupbot.backup.volumes.db-data.path), not the container root fs.
+DUMP=/var/lib/postgresql/data/postgres.dump
+
+backup() {
+  # Dump to a file, then gzip: sh has no portable pipefail, so a pipe hides pg_dump failures.
+  pg_dump -U "$POSTGRES_USER" -Fc -f "$DUMP" "$POSTGRES_DB" && gzip -f "$DUMP"
+}
+
+backup_cleanup() {
+  rm -f "$DUMP" "$DUMP.gz"
+}
+
+restore() {
+  gzip -df "$DUMP.gz"
+  # --if-exists: without it DROPs of objects absent from the live db make pg_restore exit 1.
+  pg_restore --clean --if-exists -U "$POSTGRES_USER" --dbname="$POSTGRES_DB" < "$DUMP"
+  rm -f "$DUMP"
+  # pg_restore --clean recreates objects under the live app, so its pooled connections
+  # keep stale type-OID caches ('cache lookup failed for type ...' crash loops, e.g.
+  # Oban). Terminate them so Ecto reconnects fresh.
+  psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"
+}
+
+"$@"
--
2.49.0