From 50a3715caa89ad6b7a28652920c08d6e33d97ef8 Mon Sep 17 00:00:00 2001
From: notplants <@notplants>
Date: Tue, 9 Jun 2026 15:46:28 +0000
Subject: [PATCH 1/6] chore: upgrade to 3.1.0+v2.0.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Minor bump — no operator action required (Postgres/ClickHouse changes are automatic).

- Postgres: use pgautoupgrade/pgautoupgrade:18-alpine in place of the custom
  pg_upgrade entrypoint. The existing cluster is upgraded in place automatically
  on deploy; PGDATA pinned to the legacy path; adds a pg_isready healthcheck.
  Removes entrypoint.postgres.sh.tmpl and DB_ENTRYPOINT_VERSION.
- ClickHouse backup fetch: cache the clickhouse-backup binary on the persistent
  volume and retry with backoff to avoid the download crash-loop. The tool is
  required — if it can't be installed after retries the entrypoint aborts and
  the server does not start, rather than coming up without backup/restore.
- Add CLICKHOUSE_DATABASE_URL; bump the clickhouse entrypoint config version.
- Remove a stray broken link reference in the README.
---
 README.md                   |  1 -
 abra.sh                     |  3 +--
 compose.yml                 | 22 ++++++++----------
 entrypoint.clickhouse.sh    | 46 +++++++++++++++++++++++++++++--------
 entrypoint.postgres.sh.tmpl | 44 -----------------------------------
 5 files changed, 48 insertions(+), 68 deletions(-)
 delete mode 100644 entrypoint.postgres.sh.tmpl

diff --git a/README.md b/README.md
index b3f8f53..b7b4e28 100644
--- a/README.md
+++ b/README.md
@@ -26,4 +26,3 @@
 
 [`abra`]: https://git.coopcloud.tech/coop-cloud/abra
 [`coop-cloud/traefik`]: https://git.coopcloud.tech/coop-cloud/traefik
-p-cloud/traefik
diff --git a/abra.sh b/abra.sh
index ea81aaa..1a82402 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1,4 +1,3 @@
 export CLICKHOUSE_CONF_VERSION=v2
 export CLICKHOUSE_USER_CONF_VERSION=v2
-export DB_ENTRYPOINT_VERSION=v1
-export CLICKHOUSE_ENTRYPOINT_VERSION=v2
+export CLICKHOUSE_ENTRYPOINT_VERSION=v3
diff --git a/compose.yml b/compose.yml
index d20575c..bdb4c13 100644
--- a/compose.yml
+++ b/compose.yml
@@ -12,6 +12,7 @@ services:
       - BASE_URL=https://$DOMAIN
       - SECRET_KEY_BASE
       - DATABASE_URL=postgres://plausible:plausible@${STACK_NAME}_db:5432/plausible
+      - CLICKHOUSE_DATABASE_URL=http://${STACK_NAME}_plausible_events_db:8123/plausible_events_db
       - SMTP_HOST_ADDR
       - MAILER_EMAIL
       - SMTP_HOST_PORT
@@ -32,23 +33,24 @@ services:
         - "traefik.http.routers.${STACK_NAME}.rule=Host(`${DOMAIN}`${EXTRA_DOMAINS})"
         - "traefik.http.routers.${STACK_NAME}.entrypoints=web-secure"
         - "traefik.http.routers.${STACK_NAME}.tls.certresolver=${LETS_ENCRYPT_ENV}"
-        - coop-cloud.${STACK_NAME}.version=3.0.1+v2.0.0
+        - coop-cloud.${STACK_NAME}.version=3.1.0+v2.0.0
   db:
-    image: postgres:13.12
-    configs:
-      - source: db_entrypoint
-        target: /docker-entrypoint.sh
-        mode: 0555
-    # Custom docker entrypoint to handle major Postgres version upgrades
+    image: pgautoupgrade/pgautoupgrade:18-alpine
     volumes:
       - db-data:/var/lib/postgresql/data
-    entrypoint: /docker-entrypoint.sh
     environment:
+      # pin legacy PGDATA so the existing cluster on the volume is upgraded in place, not re-init'd
+      - PGDATA=/var/lib/postgresql/data
       - POSTGRES_USER=plausible
       - POSTGRES_PASSWORD=plausible
       - POSTGRES_DB=plausible
     networks:
       - internal
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U plausible -d plausible"]
+      interval: 5s
+      timeout: 5s
+      retries: 60
     deploy:
       labels:
         backupbot.backup: "true"
@@ -98,10 +100,6 @@ configs:
   clickhouse-user-config:
     name: ${STACK_NAME}_clickhouse_user_config_${CLICKHOUSE_USER_CONF_VERSION}
     file: clickhouse-user-config.xml
-  db_entrypoint:
-    name: ${STACK_NAME}_db_entrypoint_${DB_ENTRYPOINT_VERSION}
-    file: entrypoint.postgres.sh.tmpl
-    template_driver: golang
   clickhouse_entrypoint:
     name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION}
     file: entrypoint.clickhouse.sh
diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh
index 48f7ac7..8cebd82 100644
--- a/entrypoint.clickhouse.sh
+++ b/entrypoint.clickhouse.sh
@@ -1,6 +1,14 @@
 #!/bin/bash
 
-set -ex
+# clickhouse-backup is a backup tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`).
+# It is a 22 MB GitHub download (rate-limit / network), which can fail to download, and lead to crash loop and download throttling. 
+#
+# to make the download smoother:
+#   - cache the binary on the persistent clickhouse data volume (/var/lib/clickhouse) so it is fetched
+#     at most once and reused on every container restart (no re-download amplification);
+#   - retry with backoff to ride out transient GitHub failures
+
+set -e
 
 CLICKHOUSE_BACKUP_VERSION=2.4.2
 
@@ -17,13 +25,33 @@ elif [[ $ARCH =~ "x86_64" ]]; then
   ARCH="amd64"
 fi
 
-wget \
-    --quiet \
-    --continue \
-    --no-clobber \
-    --output-document=/tmp/clickhouse-backup.tar.gz \
-    "https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz" 2>/dev/null
+CACHE_DIR=/var/lib/clickhouse/.ccci-bin
+CACHED="${CACHE_DIR}/clickhouse-backup"
+BIN=/usr/local/bin/clickhouse-backup
+URL="https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
 
-tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3
+install_clickhouse_backup() {
+  mkdir -p "$CACHE_DIR"
+  if [ -x "$CACHED" ]; then
+    cp -f "$CACHED" "$BIN"
+    echo "clickhouse-backup: restored from persistent cache ($CACHED)"
+    return 0
+  fi
+  for attempt in 1 2 3 4 5; do
+    if wget --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \
+       && tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3; then
+      cp -f "$BIN" "$CACHED" 2>/dev/null || true
+      echo "clickhouse-backup: downloaded + cached (attempt ${attempt})"
+      return 0
+    fi
+    echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2
+    sleep $((attempt * 10))
+  done
+  echo "clickhouse-backup: fetch FAILED after all retries — aborting; clickhouse-server will NOT start (backup tool is required)" >&2
+  return 1
+}
 
-/entrypoint.sh
+#if the backup tool cannot be installed after retries, it aborts (set -e) so the deploy fails
+install_clickhouse_backup
+
+exec /entrypoint.sh
diff --git a/entrypoint.postgres.sh.tmpl b/entrypoint.postgres.sh.tmpl
deleted file mode 100644
index 8ecc4fe..0000000
--- a/entrypoint.postgres.sh.tmpl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-set -e
-
-MIGRATION_MARKER=$PGDATA/migration_in_progress
-OLDDATA=$PGDATA/old_data
-NEWDATA=$PGDATA/new_data
-
-if [ -e $MIGRATION_MARKER ]; then
-  echo "FATAL: migration was started but did not complete in a previous run. manual recovery necessary"
-  exit 1
-fi
-
-if [ -f $PGDATA/PG_VERSION ]; then
-  DATA_VERSION=$(cat $PGDATA/PG_VERSION)
-
-  if [ -n "$DATA_VERSION" -a "$PG_MAJOR" != "$DATA_VERSION" ]; then
-    echo "postgres data version $DATA_VERSION found, but need $PG_MAJOR. Starting migration"
-    echo "Installing postgres $DATA_VERSION"
-    sed -i "s/$/ $DATA_VERSION/" /etc/apt/sources.list.d/pgdg.list
-    apt-get update && apt-get install -y --no-install-recommends \
-      postgresql-$DATA_VERSION \
-      && rm -rf /var/lib/apt/lists/*
-    echo "shuffling around"
-    gosu postgres mkdir $OLDDATA $NEWDATA
-    chmod 700 $OLDDATA $NEWDATA
-    mv $PGDATA/* $OLDDATA/ || true
-    touch $MIGRATION_MARKER
-    echo "running initdb"
-    # abuse entrypoint script for initdb by making server error out
-    gosu postgres bash -c "export PGDATA=$NEWDATA ; /usr/local/bin/docker-entrypoint.sh --invalid-arg || true"
-    echo "running pg_upgrade"
-    cd /tmp
-    gosu postgres pg_upgrade --link -b /usr/lib/postgresql/$DATA_VERSION/bin -d $OLDDATA -D $NEWDATA -U $POSTGRES_USER
-    cp $OLDDATA/pg_hba.conf $NEWDATA/
-    mv $NEWDATA/* $PGDATA
-    rm -rf $OLDDATA
-    rmdir $NEWDATA
-    rm $MIGRATION_MARKER
-    echo "migration complete"
-  fi
-fi
-
-/usr/local/bin/docker-entrypoint.sh postgres
-- 
2.49.0


From b90a8c42392566a106124115e113760590ca005f Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@autonomic.zone>
Date: Tue, 9 Jun 2026 18:30:18 +0000
Subject: [PATCH 2/6] fix: clickhouse entrypoint - backup download is
 best-effort (server must start regardless)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous entrypoint treated clickhouse-backup as required: a download failure
(rate-limit or transient network) caused install_clickhouse_backup to return 1, which
under set -e exited the entrypoint before /entrypoint.sh ran. ClickHouse never started,
the swarm restarted it, and the download was retried, amplifying the throttle ->
crash-loop -> deploy timeout (cc-ci Q4.7b).

Fix: install_clickhouse_backup || true — the server starts even if the backup tool
cannot be fetched. Backup/restore degrades until a later restart fetches it.

Also: fix stray trailing quote in backupbot.restore.post-hook; bump
CLICKHOUSE_ENTRYPOINT_VERSION v3->v4 (config content changed).
---
 abra.sh                  |  2 +-
 compose.yml              |  2 +-
 entrypoint.clickhouse.sh | 23 ++++++++++++++---------
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/abra.sh b/abra.sh
index 1a82402..f9a789f 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1,3 +1,3 @@
 export CLICKHOUSE_CONF_VERSION=v2
 export CLICKHOUSE_USER_CONF_VERSION=v2
-export CLICKHOUSE_ENTRYPOINT_VERSION=v3
+export CLICKHOUSE_ENTRYPOINT_VERSION=v4
diff --git a/compose.yml b/compose.yml
index bdb4c13..a1bb359 100644
--- a/compose.yml
+++ b/compose.yml
@@ -82,7 +82,7 @@ services:
         backupbot.backup.path: "/var/lib/clickhouse/backup/events"
         backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
         backupbot.restore: "true"
-        backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events"
+        backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events
 
 volumes:
   db-data:
diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh
index 8cebd82..861cdbc 100644
--- a/entrypoint.clickhouse.sh
+++ b/entrypoint.clickhouse.sh
@@ -1,12 +1,17 @@
 #!/bin/bash
-
-# clickhouse-backup is a backup tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`).
-# It is a 22 MB GitHub download (rate-limit / network), which can fail to download, and lead to crash loop and download throttling. 
+# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`).
+# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it
+# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the
+# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started -> swarm
+# restarted it -> re-downloaded -> amplified the throttle -> crash-loop -> deploy timeout (cc-ci Q4.7).
 #
-# to make the download smoother:
-#   - cache the binary on the persistent clickhouse data volume (/var/lib/clickhouse) so it is fetched
+# Hardening (no behaviour change when the download succeeds first try):
+#   - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse) so it is fetched
 #     at most once and reused on every container restart (no re-download amplification);
-#   - retry with backoff to ride out transient GitHub failures
+#   - retry with backoff;
+#   - NEVER let a download failure block the server start (best-effort: the server comes up, backup/
+#     restore degrade until the next successful fetch);
+#   - un-silenced so a failure is diagnosable in `docker service logs`.
 
 set -e
 
@@ -47,11 +52,11 @@ install_clickhouse_backup() {
     echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2
     sleep $((attempt * 10))
   done
-  echo "clickhouse-backup: fetch FAILED after all retries — aborting; clickhouse-server will NOT start (backup tool is required)" >&2
+  echo "clickhouse-backup: fetch FAILED after retries — starting clickhouse-server WITHOUT the backup tool (backup/restore unavailable until a later restart fetches it)" >&2
   return 1
 }
 
-#if the backup tool cannot be installed after retries, it aborts (set -e) so the deploy fails
-install_clickhouse_backup
+# Best-effort: the server MUST start even if the backup-tool fetch fails (it is not a server dependency).
+install_clickhouse_backup || true
 
 exec /entrypoint.sh
-- 
2.49.0


From 9f8bcbc9e302e13ec0ecb7f01bab1b823722449d Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@autonomic.zone>
Date: Tue, 9 Jun 2026 19:09:13 +0000
Subject: [PATCH 3/6] fix: clickhouse-backup install must succeed loudly, never
 silently degrade
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the previous best-effort (|| true) approach: a deploy without
clickhouse-backup would have silently broken backup/restore, so the
entrypoint now hard-fails (visibly, in service logs) if the tool truly
cannot be installed — but makes that case effectively unreachable:

- cache the VERIFIED binary on the persistent clickhouse volume, keyed
  by version: downloaded at most once per app; container restarts never
  re-fetch (kills the re-download amplification that turned a GitHub
  throttle into a permanent crash-loop)
- canonical Altinity release URL (project moved; old path is a redirect)
- bounded retries with backoff + wget read timeout (a stalled connection
  can no longer hang the deploy)
- verify the binary executes before trusting or caching it (catches
  truncated downloads and a corrupt cache)
- compose: fix app depends_on to the real service name
  (plausible_events_db) — docker compose config was failing on it, which
  disabled CI image prepull and pushed pulls into the deploy window
- bump CLICKHOUSE_ENTRYPOINT_VERSION v4 -> v5 (swarm configs immutable)

Verified on a dev deploy: fresh download path, cached-restart path,
clickhouse-backup create/list/delete, and /api/health all green.
---
 abra.sh                  |  2 +-
 compose.yml              |  2 +-
 entrypoint.clickhouse.sh | 58 +++++++++++++++++++++++-----------------
 3 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/abra.sh b/abra.sh
index f9a789f..9e41df0 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1,3 +1,3 @@
 export CLICKHOUSE_CONF_VERSION=v2
 export CLICKHOUSE_USER_CONF_VERSION=v2
-export CLICKHOUSE_ENTRYPOINT_VERSION=v4
+export CLICKHOUSE_ENTRYPOINT_VERSION=v5
diff --git a/compose.yml b/compose.yml
index a1bb359..037d79e 100644
--- a/compose.yml
+++ b/compose.yml
@@ -7,7 +7,7 @@ services:
     command: sh -c "sleep 10 && /entrypoint.sh db createdb && /entrypoint.sh db migrate && /entrypoint.sh run"
     depends_on:
       - db
-      - events_db
+      - plausible_events_db
     environment:
       - BASE_URL=https://$DOMAIN
       - SECRET_KEY_BASE
diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh
index 861cdbc..c60f48d 100644
--- a/entrypoint.clickhouse.sh
+++ b/entrypoint.clickhouse.sh
@@ -1,17 +1,22 @@
 #!/bin/bash
-# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`).
-# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it
-# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the
-# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started -> swarm
-# restarted it -> re-downloaded -> amplified the throttle -> crash-loop -> deploy timeout (cc-ci Q4.7).
+# clickhouse-backup powers this recipe's backup/restore (the backupbot pre/post-hooks run
+# `clickhouse-backup create/restore`). A deploy without it would have silently broken backups,
+# so if it truly cannot be installed the deploy must FAIL LOUDLY rather than degrade.
 #
-# Hardening (no behaviour change when the download succeeds first try):
-#   - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse) so it is fetched
-#     at most once and reused on every container restart (no re-download amplification);
-#   - retry with backoff;
-#   - NEVER let a download failure block the server start (best-effort: the server comes up, backup/
-#     restore degrade until the next successful fetch);
-#   - un-silenced so a failure is diagnosable in `docker service logs`.
+# The published recipe fetched it with a single silenced no-retry wget at every container start:
+# any transient GitHub failure exited the container (set -e) before clickhouse-server started,
+# and the swarm restart loop re-downloaded the 22 MB asset on every restart, amplifying a
+# throttle into a permanent crash-loop and a deploy timeout (cc-ci Q4.7).
+#
+# Hardening (no behaviour change when the fetch succeeds first try):
+#   - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version, so it is
+#     downloaded at most once per app and container restarts never re-fetch;
+#   - canonical Altinity URL (the project moved; the old AlexAkulov path is just a redirect);
+#   - bounded retries with backoff + a read timeout, so a stalled connection cannot hang the
+#     deploy and a transient failure cannot kill it;
+#   - verify the binary actually executes before trusting or caching it (catches truncated
+#     downloads and a corrupt cache);
+#   - un-silenced: every attempt and the final verdict are visible in `docker service logs`.
 
 set -e
 
@@ -31,32 +36,37 @@ elif [[ $ARCH =~ "x86_64" ]]; then
 fi
 
 CACHE_DIR=/var/lib/clickhouse/.ccci-bin
-CACHED="${CACHE_DIR}/clickhouse-backup"
+CACHED="${CACHE_DIR}/clickhouse-backup-v${CLICKHOUSE_BACKUP_VERSION}"
 BIN=/usr/local/bin/clickhouse-backup
-URL="https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
+URL="https://github.com/Altinity/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
+
+binary_ok() {
+  "$1" --version >/dev/null 2>&1
+}
 
 install_clickhouse_backup() {
   mkdir -p "$CACHE_DIR"
-  if [ -x "$CACHED" ]; then
+  if [ -x "$CACHED" ] && binary_ok "$CACHED"; then
     cp -f "$CACHED" "$BIN"
-    echo "clickhouse-backup: restored from persistent cache ($CACHED)"
+    echo "clickhouse-backup: using verified cached binary ($CACHED)"
     return 0
   fi
+  rm -f "$CACHED" # absent or fails to execute — re-fetch
   for attempt in 1 2 3 4 5; do
-    if wget --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \
-       && tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3; then
+    if wget -T 30 --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \
+       && tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3 \
+       && binary_ok "$BIN"; then
       cp -f "$BIN" "$CACHED" 2>/dev/null || true
-      echo "clickhouse-backup: downloaded + cached (attempt ${attempt})"
+      echo "clickhouse-backup: downloaded, verified + cached (attempt ${attempt})"
       return 0
     fi
-    echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2
-    sleep $((attempt * 10))
+    echo "clickhouse-backup: fetch attempt ${attempt}/5 failed" >&2
+    [ "$attempt" -lt 5 ] && sleep $((attempt * 10))
   done
-  echo "clickhouse-backup: fetch FAILED after retries — starting clickhouse-server WITHOUT the backup tool (backup/restore unavailable until a later restart fetches it)" >&2
+  echo "clickhouse-backup: could not install after 5 attempts — failing the deploy (without it backup/restore would be silently broken)" >&2
   return 1
 }
 
-# Best-effort: the server MUST start even if the backup-tool fetch fails (it is not a server dependency).
-install_clickhouse_backup || true
+install_clickhouse_backup
 
 exec /entrypoint.sh
-- 
2.49.0


From 4cab6b5146594ebd1425aa1b0aece363d9b31833 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@autonomic.zone>
Date: Tue, 9 Jun 2026 21:53:18 +0000
Subject: [PATCH 4/6] fix: backup labels to backup-bot-two v2 volume syntax
 (restore was a no-op)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

backup-bot-two 2.4.0 snapshots paths INSIDE named volumes
(backupbot.backup.volumes.<vol>.path, relative to the volume root) and
IGNORES the old backupbot.backup.path label. The db pre-hook wrote
/postgres.dump.gz to the container's ephemeral root fs — outside every
volume — so the dump never reached the snapshot and the restore post-hook
failed on a missing file (gzip: /postgres.dump.gz: No such file).

- db: dump into the db-data volume (transient; hooks remove it) and
  snapshot only that file via backupbot.backup.volumes.db-data.path —
  same pattern as keycloak, which passes backup/restore on this CI.
  Also use $POSTGRES_DB in the restore hook: the previous $PLAUSIBLE_DB
  is defined nowhere, so the connection only worked through libpq's
  fallback of using the user name as the database name.
- clickhouse: snapshot only backup/events (the clickhouse-backup output)
  inside the event-data volume instead of the whole volume — restoring
  raw data files under a running server is unsafe; the post-hook performs
  the logical restore.
---
 compose.yml | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/compose.yml b/compose.yml
index 037d79e..d8cb158 100644
--- a/compose.yml
+++ b/compose.yml
@@ -54,11 +54,15 @@ services:
     deploy:
       labels:
         backupbot.backup: "true"
-        backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > "/postgres.dump.gz"'
-        backupbot.backup.path: "/postgres.dump.gz"
-        backupbot.backup.post-hook: "rm -f /postgres.dump.gz"
+        # backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes.<vol>.path,
+        # relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump
+        # written to the container root fs never reaches the snapshot, so restore finds nothing.
+        # The dump therefore lives (transiently — the hooks remove it) at the db-data volume root.
+        backupbot.backup.volumes.db-data.path: "postgres.dump.gz"
+        backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz'
+        backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz"
         backupbot.restore: "true"
-        backupbot.restore.post-hook: sh -c 'gzip -d /postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$PLAUSIBLE_DB" < /postgres.dump && rm -f /postgres.dump'
+        backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump'
 
   plausible_events_db:
     image: clickhouse/clickhouse-server:23.4.2.11-alpine
@@ -78,8 +82,11 @@ services:
     deploy:
       labels:
         backupbot.backup: "true"
+        # v2 volumes-include syntax (see db service): snapshot only the clickhouse-backup output
+        # inside the event-data volume — not the live raw data files (restoring those under a
+        # running server is unsafe; the restore post-hook performs the logical restore instead).
+        backupbot.backup.volumes.event-data.path: "backup/events"
         backupbot.backup.pre-hook: clickhouse-backup create events
-        backupbot.backup.path: "/var/lib/clickhouse/backup/events"
         backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
         backupbot.restore: "true"
         backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events
-- 
2.49.0


From 270c8404ce481fda2aaac870fd0c26262dac8f25 Mon Sep 17 00:00:00 2001
From: autonomic-bot <autonomic-bot@autonomic.zone>
Date: Tue, 9 Jun 2026 23:01:24 +0000
Subject: [PATCH 5/6] fix: make restore correct under a live app (CI restore +
 custom tiers)

Three independent bugs made `abra app restore` leave the stack broken:

1. ClickHouse: schema_migrations is a TinyLog table and clickhouse-backup
   can only FREEZE MergeTree data - it backed up the table schema but
   not its rows, so a restore emptied the migration ledger. The next app
   boot re-ran every IngestRepo migration against the fully-built tables
   and crash-looped (DUPLICATE_COLUMN: utm_medium) - the post-restore 502
   in CI build 237. Fix: export the ledger to TSV into the backup dir
   (rides in the snapshotted backup/events path) in the backup pre-hook,
   reload it in the restore post-hook.

2. App restart policy: condition was on-failure, but when postgres is
   disrupted under the app the BEAM supervision tree escalates and Erlang
   exits GRACEFULLY (status 0) - swarm marks the task Complete and never
   restarts it (reproduced: app stranded at 0/1). Fix: condition any.

3. pg_restore: --clean without --if-exists exits 1 when a dropped object
   is absent ("errors ignored"), killing the && chain and leaving the
   dump behind. Fix: --if-exists, plus pg_terminate_backend afterwards so
   the app's pooled connections reconnect against the recreated objects.

Validated on a dev deploy: marker + truncated ClickHouse events both
return on restore, migration ledger intact (17 rows), post-restore event
ingestion for a new site works, and an app reboot after restore migrates
cleanly. Known cosmetic caveat: until the app is restarted, its Postgrex
type cache holds stale OIDs and background Oban jobs log "cache lookup
failed for type" - ingestion and serving are unaffected; an operator
restart after a restore clears it.
---
 compose.yml | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/compose.yml b/compose.yml
index d8cb158..3c1a68a 100644
--- a/compose.yml
+++ b/compose.yml
@@ -26,7 +26,10 @@ services:
       - internal
     deploy:
       restart_policy:
-        condition: on-failure
+        # `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore),
+        # the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with
+        # on-failure swarm marks the task Complete and never restarts it, leaving the app down.
+        condition: any
       labels:
         - "traefik.enable=true"
         - "traefik.http.services.${STACK_NAME}.loadbalancer.server.port=8000"
@@ -62,7 +65,12 @@ services:
         backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz'
         backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz"
         backupbot.restore: "true"
-        backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump'
+        # --if-exists: without it the DROPs error on objects absent from the live db and
+        # pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind).
+        # pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live
+        # app, so its pooled connections keep stale type-OID caches ('cache lookup failed for
+        # type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh.
+        backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"'
 
   plausible_events_db:
     image: clickhouse/clickhouse-server:23.4.2.11-alpine
@@ -86,10 +94,15 @@ services:
         # inside the event-data volume — not the live raw data files (restoring those under a
         # running server is unsafe; the restore post-hook performs the logical restore instead).
         backupbot.backup.volumes.event-data.path: "backup/events"
-        backupbot.backup.pre-hook: clickhouse-backup create events
+        # schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree
+        # data, so it backs up that table's SCHEMA but not its rows, and a restore would leave
+        # the migration ledger empty: the next app boot then re-runs every ClickHouse migration
+        # against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows
+        # into the backup dir alongside the clickhouse-backup output, and reload them on restore.
+        backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv'
         backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
         backupbot.restore: "true"
-        backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events
+        backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv && rm -rf /var/lib/clickhouse/backup/events'
 
 volumes:
   db-data:
-- 
2.49.0


From 13458fac56a19601267192acc769f3ba8fc55036 Mon Sep 17 00:00:00 2001
From: notplants <@notplants>
Date: Wed, 10 Jun 2026 16:55:20 +0000
Subject: [PATCH 6/6] refactor: extract backup/restore into config scripts,
 trim comments

Move the postgres and clickhouse backup/restore hook logic out of inline
compose labels into dedicated pg_backup.sh / clickhouse_backup.sh config
scripts (the pattern other recipes use), and trim the verbose explanatory
comments down to the essential rationale, now living in the scripts.
---
 abra.sh                  |  4 +++-
 clickhouse_backup.sh     | 30 +++++++++++++++++++++++++
 compose.yml              | 47 ++++++++++++++++++----------------------
 entrypoint.clickhouse.sh | 23 +++++---------------
 pg_backup.sh             | 29 +++++++++++++++++++++++++
 5 files changed, 88 insertions(+), 45 deletions(-)
 create mode 100644 clickhouse_backup.sh
 create mode 100644 pg_backup.sh

diff --git a/abra.sh b/abra.sh
index 9e41df0..60974a7 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1,3 +1,5 @@
 export CLICKHOUSE_CONF_VERSION=v2
 export CLICKHOUSE_USER_CONF_VERSION=v2
-export CLICKHOUSE_ENTRYPOINT_VERSION=v5
+export CLICKHOUSE_ENTRYPOINT_VERSION=v6
+export PG_BACKUP_VERSION=v1
+export CLICKHOUSE_BACKUP_SCRIPT_VERSION=v1
diff --git a/clickhouse_backup.sh b/clickhouse_backup.sh
new file mode 100644
index 0000000..ac537b1
--- /dev/null
+++ b/clickhouse_backup.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# backup-bot-two hooks for the plausible_events_db (ClickHouse) service.
+# Usage: /clickhouse_backup.sh {backup|backup_cleanup|restore}
+
+set -e
+
+# clickhouse-backup output lives inside the event-data volume (snapshotted via
+# backupbot.backup.volumes.event-data.path). Restoring the raw data files under a
+# running server is unsafe, so restore performs a logical restore instead.
+BACKUP_DIR=/var/lib/clickhouse/backup/events
+MIGRATIONS_TSV="$BACKUP_DIR/schema_migrations.tsv"
+
+# Pre-hook: create the `events` backup and export the migration ledger next to it.
+backup() {
+  clickhouse-backup create events
+  # schema_migrations is a TinyLog table — clickhouse-backup only FREEZEs MergeTree
+  # data, so its rows aren't captured. Export them alongside the backup, else a restore
+  # leaves the ledger empty and the next boot re-runs every migration (DUPLICATE_COLUMN).
+  clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > "$MIGRATIONS_TSV"
+}
+
+# Post-hook: remove the backup output from the volume.
+backup_cleanup() {
+  rm -rf "$BACKUP_DIR"
+}
+
+# Restore post-hook: logical restore of the `events` backup plus the migration ledger.
+restore() {
+  # Check for the ledger export before the destructive `restore --rm`: without it the
+  # ledger cannot be reloaded, and failing here leaves the live tables untouched.
+  if [ ! -f "$MIGRATIONS_TSV" ]; then
+    echo "clickhouse_backup.sh: $MIGRATIONS_TSV not found, refusing to restore" >&2
+    exit 1
+  fi
+  clickhouse-backup restore --rm events
+  clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations"
+  clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < "$MIGRATIONS_TSV"
+  rm -rf "$BACKUP_DIR"
+}
+
+# Dispatch only the known hooks; a missing or unknown subcommand is an error rather
+# than a silent no-op or an arbitrary command execution.
+case "${1:-}" in
+  backup | backup_cleanup | restore) "$1" ;;
+  *)
+    echo "usage: $0 {backup|backup_cleanup|restore}" >&2
+    exit 2
+    ;;
+esac
diff --git a/compose.yml b/compose.yml
index 3c1a68a..ca1e152 100644
--- a/compose.yml
+++ b/compose.yml
@@ -26,9 +26,8 @@ services:
       - internal
     deploy:
       restart_policy:
-        # `any`, not `on-failure`: when postgres is disrupted under the app (e.g. a restore),
-        # the BEAM supervision tree escalates and Erlang shuts down GRACEFULLY (exit 0) — with
-        # on-failure swarm marks the task Complete and never restarts it, leaving the app down.
+        # `any`, not `on-failure`: a restore disrupts postgres under the app and Erlang then
+        # shuts down gracefully (exit 0), which on-failure treats as done and never restarts.
         condition: any
       labels:
         - "traefik.enable=true"
@@ -54,23 +53,18 @@ services:
       interval: 5s
       timeout: 5s
       retries: 60
+    configs:
+      - source: pg_backup
+        target: /pg_backup.sh
+        mode: 0555
     deploy:
       labels:
         backupbot.backup: "true"
-        # backup-bot-two v2 snapshots paths INSIDE named volumes (backupbot.backup.volumes.<vol>.path,
-        # relative to the volume root) and ignores the old `backupbot.backup.path` label — a dump
-        # written to the container root fs never reaches the snapshot, so restore finds nothing.
-        # The dump therefore lives (transiently — the hooks remove it) at the db-data volume root.
         backupbot.backup.volumes.db-data.path: "postgres.dump.gz"
-        backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > /var/lib/postgresql/data/postgres.dump.gz'
-        backupbot.backup.post-hook: "rm -f /var/lib/postgresql/data/postgres.dump.gz"
+        backupbot.backup.pre-hook: "/pg_backup.sh backup"
+        backupbot.backup.post-hook: "/pg_backup.sh backup_cleanup"
         backupbot.restore: "true"
-        # --if-exists: without it the DROPs error on objects absent from the live db and
-        # pg_restore exits 1 ("errors ignored"), killing the && chain (dump left behind).
-        # pg_terminate_backend afterwards: pg_restore --clean recreates objects under the live
-        # app, so its pooled connections keep stale type-OID caches ('cache lookup failed for
-        # type ...' crash loops, e.g. Oban) — terminating them makes Ecto reconnect fresh.
-        backupbot.restore.post-hook: sh -c 'gzip -d /var/lib/postgresql/data/postgres.dump.gz && pg_restore --clean --if-exists -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /var/lib/postgresql/data/postgres.dump && rm -f /var/lib/postgresql/data/postgres.dump && psql -U "$$POSTGRES_USER" -d "$$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"'
+        backupbot.restore.post-hook: "/pg_backup.sh restore"
 
   plausible_events_db:
     image: clickhouse/clickhouse-server:23.4.2.11-alpine
@@ -85,24 +79,19 @@ services:
       - source: clickhouse_entrypoint
         target: /custom-entrypoint.sh
         mode: 0555
+      - source: clickhouse_backup
+        target: /clickhouse_backup.sh
+        mode: 0555
     networks:
       - internal
     deploy:
       labels:
         backupbot.backup: "true"
-        # v2 volumes-include syntax (see db service): snapshot only the clickhouse-backup output
-        # inside the event-data volume — not the live raw data files (restoring those under a
-        # running server is unsafe; the restore post-hook performs the logical restore instead).
         backupbot.backup.volumes.event-data.path: "backup/events"
-        # schema_migrations is a TinyLog table — clickhouse-backup can only FREEZE MergeTree
-        # data, so it backs up that table's SCHEMA but not its rows, and a restore would leave
-        # the migration ledger empty: the next app boot then re-runs every ClickHouse migration
-        # against the fully-built tables and crash-loops (DUPLICATE_COLUMN). Export its rows
-        # into the backup dir alongside the clickhouse-backup output, and reload them on restore.
-        backupbot.backup.pre-hook: sh -c 'clickhouse-backup create events && clickhouse-client --query "SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV" > /var/lib/clickhouse/backup/events/schema_migrations.tsv'
-        backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
+        backupbot.backup.pre-hook: "/clickhouse_backup.sh backup"
+        backupbot.backup.post-hook: "/clickhouse_backup.sh backup_cleanup"
         backupbot.restore: "true"
-        backupbot.restore.post-hook: sh -c 'clickhouse-backup restore --rm events && clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" && clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < /var/lib/clickhouse/backup/events/schema_migrations.tsv && rm -rf /var/lib/clickhouse/backup/events'
+        backupbot.restore.post-hook: "/clickhouse_backup.sh restore"
 
 volumes:
   db-data:
@@ -123,3 +112,9 @@ configs:
   clickhouse_entrypoint:
     name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION}
     file: entrypoint.clickhouse.sh
+  pg_backup:
+    name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION}
+    file: pg_backup.sh
+  clickhouse_backup:
+    name: ${STACK_NAME}_clickhouse_backup_${CLICKHOUSE_BACKUP_SCRIPT_VERSION}
+    file: clickhouse_backup.sh
diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh
index c60f48d..5af4d4f 100644
--- a/entrypoint.clickhouse.sh
+++ b/entrypoint.clickhouse.sh
@@ -1,22 +1,9 @@
 #!/bin/bash
-# clickhouse-backup powers this recipe's backup/restore (the backupbot pre/post-hooks run
-# `clickhouse-backup create/restore`). A deploy without it would have silently broken backups,
-# so if it truly cannot be installed the deploy must FAIL LOUDLY rather than degrade.
-#
-# The published recipe fetched it with a single silenced no-retry wget at every container start:
-# any transient GitHub failure exited the container (set -e) before clickhouse-server started,
-# and the swarm restart loop re-downloaded the 22 MB asset on every restart, amplifying a
-# throttle into a permanent crash-loop and a deploy timeout (cc-ci Q4.7).
-#
-# Hardening (no behaviour change when the fetch succeeds first try):
-#   - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version, so it is
-#     downloaded at most once per app and container restarts never re-fetch;
-#   - canonical Altinity URL (the project moved; the old AlexAkulov path is just a redirect);
-#   - bounded retries with backoff + a read timeout, so a stalled connection cannot hang the
-#     deploy and a transient failure cannot kill it;
-#   - verify the binary actually executes before trusting or caching it (catches truncated
-#     downloads and a corrupt cache);
-#   - un-silenced: every attempt and the final verdict are visible in `docker service logs`.
+# Install clickhouse-backup (powers this recipe's backup/restore hooks) before starting the
+# server. The binary is cached on the persistent volume keyed by version (downloaded at most
+# once per app) and fetched with bounded retries + a read timeout; the binary is verified before
+# being trusted or cached. If it truly cannot be installed the deploy fails loudly rather than
+# silently shipping broken backups.
 
 set -e
 
diff --git a/pg_backup.sh b/pg_backup.sh
new file mode 100644
index 0000000..84f5451
--- /dev/null
+++ b/pg_backup.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+#
+# backup-bot-two hooks for the postgres service.
+# Usage: /pg_backup.sh {backup|backup_cleanup|restore}
+# Reads POSTGRES_USER and POSTGRES_DB from the container environment.
+
+set -e
+
+# The dump lives at the db-data volume root: backup-bot-two v2 snapshots paths inside
+# named volumes (backupbot.backup.volumes.db-data.path), not the container root fs.
+DUMP=/var/lib/postgresql/data/postgres.dump
+
+# Pre-hook: write a gzipped custom-format dump to $DUMP.gz.
+backup() {
+  # Dump to a file, then compress, rather than `pg_dump | gzip`: with plain `set -e`
+  # a pipeline's status is gzip's, so a failing pg_dump would go unnoticed and a
+  # truncated dump would be snapshotted as if it were a good backup.
+  pg_dump -U "$POSTGRES_USER" -Fc -f "$DUMP" "$POSTGRES_DB" || {
+    rc=$?
+    rm -f "$DUMP"
+    exit "$rc"
+  }
+  gzip -f "$DUMP"
+}
+
+# Post-hook: remove the compressed dump from the volume.
+backup_cleanup() {
+  rm -f "$DUMP.gz"
+}
+
+# Restore post-hook: load the restored dump into the live database.
+restore() {
+  # -f: a previous failed restore can leave $DUMP behind; without it gzip refuses to
+  # overwrite and every retry would fail here.
+  gzip -df "$DUMP.gz"
+  # --if-exists: otherwise DROPs on objects absent from the live db error out and
+  # pg_restore exits 1, aborting the script (set -e) and leaving the dump behind.
+  pg_restore --clean --if-exists -U "$POSTGRES_USER" --dbname="$POSTGRES_DB" < "$DUMP"
+  rm -f "$DUMP"
+  # pg_restore --clean recreates objects under the live app, so its pooled connections
+  # keep stale type-OID caches ('cache lookup failed for type ...' crash loops, e.g.
+  # Oban). Terminate them so Ecto reconnects fresh.
+  psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"
+}
+
+# Dispatch only the known hooks; a missing or unknown subcommand is an error rather
+# than a silent no-op or an arbitrary command execution.
+case "${1:-}" in
+  backup | backup_cleanup | restore) "$1" ;;
+  *)
+    echo "usage: $0 {backup|backup_cleanup|restore}" >&2
+    exit 2
+    ;;
+esac
-- 
2.49.0