7 Commits

Author SHA1 Message Date
09730b0e7c feat(db): use pgautoupgrade instead of custom pg_upgrade entrypoint
Some checks failed
cc-ci/testme cc-ci: failure
Replace the hand-rolled entrypoint.postgres.sh.tmpl (which apt-installed
the old PG binaries and ran initdb + pg_upgrade --link by hand) with the
pgautoupgrade/pgautoupgrade:18-alpine image, matching the other recipes.
PGDATA is pinned to the legacy /var/lib/postgresql/data so the existing
cluster on the volume is upgraded in place rather than re-initialised.
Drops the db_entrypoint config and DB_ENTRYPOINT_VERSION.
2026-06-09 15:18:11 +00:00
2ab49fab62 fix(clickhouse): require backup tool — abort if fetch fails after retries
Some checks failed
cc-ci/testme cc-ci: failure
Make the clickhouse-backup install REQUIRED: if it cannot be fetched
after all retries the entrypoint aborts (non-zero exit, set -e) and
clickhouse-server is not started, so the deploy fails loudly rather than
coming up without backup/restore capability.
2026-06-09 15:10:07 +00:00
71234e23e0 chore: upgrade to 4.0.0+v2.0.0
Some checks failed
cc-ci/testme cc-ci: failure
2026-06-05 05:36:21 +00:00
fbe0475ddb chore: upgrade to 4.0.0+v2.0.0
Some checks failed
cc-ci/testme cc-ci: failure
2026-06-05 05:05:50 +00:00
ca89e2024e chore: upgrade to 4.0.0+v2.0.0
Some checks failed
cc-ci/testme cc-ci: failure
2026-06-05 04:36:59 +00:00
0b08d7ed11 chore: upgrade to 4.0.0+v2.1.5
Some checks failed
cc-ci/testme cc-ci: failure
2026-06-02 06:50:21 +00:00
d063f0136e chore: upgrade to 4.0.0+v2.1.5
Some checks failed
cc-ci/testme cc-ci: failure
2026-06-02 05:38:09 +00:00
6 changed files with 42 additions and 109 deletions

View File

@ -26,3 +26,4 @@
[`abra`]: https://git.coopcloud.tech/coop-cloud/abra
[`coop-cloud/traefik`]: https://git.coopcloud.tech/coop-cloud/traefik
p-cloud/traefik

View File

@ -1,5 +1,3 @@
export CLICKHOUSE_CONF_VERSION=v2
export CLICKHOUSE_USER_CONF_VERSION=v2
export CLICKHOUSE_ENTRYPOINT_VERSION=v6
export PG_BACKUP_VERSION=v1
export CLICKHOUSE_BACKUP_SCRIPT_VERSION=v1
export CLICKHOUSE_ENTRYPOINT_VERSION=v3

View File

@ -1,30 +0,0 @@
#!/bin/bash
set -e

# Backup/restore helper for the ClickHouse events database.
#
# `clickhouse-backup` writes its output inside the event-data volume, which is the
# path snapshotted through backupbot.backup.volumes.event-data.path. Copying raw
# data files back underneath a running server is unsafe, so the restore step does
# a logical restore rather than a file-level one.

# Directory holding the "events" backup produced by clickhouse-backup.
BACKUP_DIR="/var/lib/clickhouse/backup/events"
# Side-car export of the schema_migrations rows, stored next to the backup.
MIGRATIONS_TSV="${BACKUP_DIR}/schema_migrations.tsv"
# Create the "events" backup and export the migration ledger next to it.
# Globals:  MIGRATIONS_TSV (read) - file the ledger rows are written to
# Outputs:  writes the schema_migrations rows, TSV-formatted, to "$MIGRATIONS_TSV"
backup() {
  local ledger_query="SELECT * FROM plausible_events_db.schema_migrations FORMAT TSV"

  clickhouse-backup create events

  # schema_migrations is a TinyLog table, and clickhouse-backup only FREEZEs
  # MergeTree data, so the ledger rows never make it into the backup itself.
  # Without this export a restore would come back with an empty ledger and the
  # next boot would replay every migration (DUPLICATE_COLUMN).
  clickhouse-client --query "${ledger_query}" > "${MIGRATIONS_TSV}"
}
# Delete the staged backup directory (and everything in it).
# Globals:  BACKUP_DIR (read)
# Returns:  exit status of rm; a directory that is already gone is not an error (-f).
backup_cleanup() {
  rm -rf "${BACKUP_DIR}"
}
# Restore the "events" backup and re-load the migration ledger exported at backup time.
# Globals:  BACKUP_DIR (read)      - staged backup directory, removed on success
#           MIGRATIONS_TSV (read)  - TSV export of schema_migrations to re-insert
# Returns:  0 on success; non-zero if the ledger export is missing/unreadable or
#           any clickhouse-backup / clickhouse-client step fails.
restore() {
  # Check the ledger export BEFORE doing anything destructive. Previously the
  # table was truncated first, so a missing export left the database restored
  # but with an empty ledger.
  if [[ ! -r "$MIGRATIONS_TSV" ]]; then
    echo "restore: migration ledger export not readable: $MIGRATIONS_TSV" >&2
    return 1
  fi

  # Explicit `|| return`: `set -e` is ignored when a function runs in a
  # conditional context, and a half-finished restore must not fall through.
  clickhouse-backup restore --rm events || return
  clickhouse-client --query "TRUNCATE TABLE plausible_events_db.schema_migrations" || return
  clickhouse-client --query "INSERT INTO plausible_events_db.schema_migrations FORMAT TSV" < "$MIGRATIONS_TSV" || return

  # Only reached when every step above succeeded.
  rm -rf "$BACKUP_DIR"
}

# Dispatch: run the command line given to this script, e.g. `backup`,
# `backup_cleanup` or `restore`.
"$@"

View File

@ -7,7 +7,7 @@ services:
command: sh -c "sleep 10 && /entrypoint.sh db createdb && /entrypoint.sh db migrate && /entrypoint.sh run"
depends_on:
- db
- plausible_events_db
- events_db
environment:
- BASE_URL=https://$DOMAIN
- SECRET_KEY_BASE
@ -26,16 +26,14 @@ services:
- internal
deploy:
restart_policy:
# `any`, not `on-failure`: a restore disrupts postgres under the app and Erlang then
# shuts down gracefully (exit 0), which on-failure treats as done and never restarts.
condition: any
condition: on-failure
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}.loadbalancer.server.port=8000"
- "traefik.http.routers.${STACK_NAME}.rule=Host(`${DOMAIN}`${EXTRA_DOMAINS})"
- "traefik.http.routers.${STACK_NAME}.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}.tls.certresolver=${LETS_ENCRYPT_ENV}"
- coop-cloud.${STACK_NAME}.version=3.1.0+v2.0.0
- coop-cloud.${STACK_NAME}.version=4.0.0+v2.0.0
db:
image: pgautoupgrade/pgautoupgrade:18-alpine
volumes:
@ -53,18 +51,14 @@ services:
interval: 5s
timeout: 5s
retries: 60
configs:
- source: pg_backup
target: /pg_backup.sh
mode: 0555
deploy:
labels:
backupbot.backup: "true"
backupbot.backup.volumes.db-data.path: "postgres.dump.gz"
backupbot.backup.pre-hook: "/pg_backup.sh backup"
backupbot.backup.post-hook: "/pg_backup.sh backup_cleanup"
backupbot.backup.pre-hook: sh -c 'pg_dump -U "$$POSTGRES_USER" -Fc "$$POSTGRES_DB" | gzip > "/postgres.dump.gz"'
backupbot.backup.path: "/postgres.dump.gz"
backupbot.backup.post-hook: "rm -f /postgres.dump.gz"
backupbot.restore: "true"
backupbot.restore.post-hook: "/pg_backup.sh restore"
# Restore into the same database the backup pre-hook dumps ($$POSTGRES_DB).
# NOTE(review): this previously targeted $$PLAUSIBLE_DB, which is not set anywhere
# visible in this file — confirm against the db service's environment.
backupbot.restore.post-hook: sh -c 'gzip -d /postgres.dump.gz && pg_restore --clean -U "$$POSTGRES_USER" --dbname="$$POSTGRES_DB" < /postgres.dump && rm -f /postgres.dump'
plausible_events_db:
image: clickhouse/clickhouse-server:23.4.2.11-alpine
@ -79,19 +73,16 @@ services:
- source: clickhouse_entrypoint
target: /custom-entrypoint.sh
mode: 0555
- source: clickhouse_backup
target: /clickhouse_backup.sh
mode: 0555
networks:
- internal
deploy:
labels:
backupbot.backup: "true"
backupbot.backup.volumes.event-data.path: "backup/events"
backupbot.backup.pre-hook: "/clickhouse_backup.sh backup"
backupbot.backup.post-hook: "/clickhouse_backup.sh backup_cleanup"
backupbot.backup.pre-hook: clickhouse-backup create events
backupbot.backup.path: "/var/lib/clickhouse/backup/events"
backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events"
backupbot.restore: "true"
backupbot.restore.post-hook: "/clickhouse_backup.sh restore"
# The value must be quoted on both sides. With only a closing quote, YAML keeps
# the `"` as a literal character of the plain scalar, so the hook command would
# end in an unterminated shell string.
backupbot.restore.post-hook: "clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events"
volumes:
db-data:
@ -112,9 +103,3 @@ configs:
clickhouse_entrypoint:
name: ${STACK_NAME}_clickhouse_entrypoint_${CLICKHOUSE_ENTRYPOINT_VERSION}
file: entrypoint.clickhouse.sh
pg_backup:
name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION}
file: pg_backup.sh
clickhouse_backup:
name: ${STACK_NAME}_clickhouse_backup_${CLICKHOUSE_BACKUP_SCRIPT_VERSION}
file: clickhouse_backup.sh

View File

@ -1,9 +1,21 @@
#!/bin/bash
# Install clickhouse-backup (powers this recipe's backup/restore hooks) before starting the
# server. The binary is cached on the persistent volume keyed by version (downloaded at most
# once per app) and fetched with bounded retries + a read timeout; the binary is verified before
# being trusted or cached. If it truly cannot be installed the deploy fails loudly rather than
# silently shipping broken backups.
# clickhouse entrypoint (cc-ci Q4.7b hardening — recipe-PR for recipe-maintainers/plausible).
#
# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`).
# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it
# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the
# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started → swarm
# restarted it → re-downloaded → amplified the throttle → crash-loop → deploy timeout (cc-ci Q4.7).
#
# Hardening (no behaviour change when the download succeeds first try):
# - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse) so it is fetched
# at most once and reused on every container restart (no re-download amplification);
# - retry with backoff to ride out transient GitHub failures;
# - un-silenced so a failure is diagnosable in `docker service logs`.
#
# Policy: clickhouse-backup is REQUIRED. If it cannot be installed after all retries the entrypoint
# aborts (non-zero exit) and the server is NOT started — we deliberately fail the deploy loudly rather
# than come up silently without backup/restore capability.
set -e
@ -23,37 +35,33 @@ elif [[ $ARCH =~ "x86_64" ]]; then
fi
CACHE_DIR=/var/lib/clickhouse/.ccci-bin
CACHED="${CACHE_DIR}/clickhouse-backup-v${CLICKHOUSE_BACKUP_VERSION}"
CACHED="${CACHE_DIR}/clickhouse-backup"
BIN=/usr/local/bin/clickhouse-backup
URL="https://github.com/Altinity/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
# Check that a binary can be executed by asking it for its version.
# Arguments: $1 - path (or name) of the binary to probe
# Returns:   the exit status of `"$1" --version`; all output is discarded
binary_ok() {
  local candidate=$1
  "${candidate}" --version > /dev/null 2>&1
}
URL="https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
install_clickhouse_backup() {
mkdir -p "$CACHE_DIR"
if [ -x "$CACHED" ] && binary_ok "$CACHED"; then
if [ -x "$CACHED" ]; then
cp -f "$CACHED" "$BIN"
echo "clickhouse-backup: using verified cached binary ($CACHED)"
echo "clickhouse-backup: restored from persistent cache ($CACHED)"
return 0
fi
rm -f "$CACHED" # absent or fails to execute — re-fetch
for attempt in 1 2 3 4 5; do
if wget -T 30 --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \
&& tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3 \
&& binary_ok "$BIN"; then
if wget --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \
&& tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3; then
cp -f "$BIN" "$CACHED" 2>/dev/null || true
echo "clickhouse-backup: downloaded, verified + cached (attempt ${attempt})"
echo "clickhouse-backup: downloaded + cached (attempt ${attempt})"
return 0
fi
echo "clickhouse-backup: fetch attempt ${attempt}/5 failed" >&2
[ "$attempt" -lt 5 ] && sleep $((attempt * 10))
echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2
sleep $((attempt * 10))
done
echo "clickhouse-backup: could not install after 5 attempts — failing the deploy (without it backup/restore would be silently broken)" >&2
echo "clickhouse-backup: fetch FAILED after all retries — aborting; clickhouse-server will NOT start (backup tool is required)" >&2
return 1
}
# Required: if the backup tool cannot be installed after retries, abort (set -e) so the deploy fails
# loudly instead of coming up without backup/restore capability.
install_clickhouse_backup
exec /entrypoint.sh

View File

@ -1,29 +0,0 @@
#!/bin/sh
set -e

# Backup/restore helper for the PostgreSQL database.
#
# The dump is kept at the root of the db-data volume because backup-bot-two v2
# snapshots paths inside named volumes (backupbot.backup.volumes.db-data.path),
# not the container's root filesystem.
DUMP="/var/lib/postgresql/data/postgres.dump"
# Write a gzip-compressed, custom-format dump of $POSTGRES_DB to "$DUMP.gz".
# Globals:  DUMP (read), POSTGRES_USER (read), POSTGRES_DB (read)
# Returns:  0 on success; 1 if pg_dump or gzip fails. On failure no partial
#           "$DUMP" or "$DUMP.gz" is left behind.
backup() {
  # Dump to a file first instead of piping into gzip. This script runs under
  # plain sh with only `set -e` (no pipefail), so in `pg_dump | gzip` a failed
  # pg_dump was hidden behind gzip's exit 0 and a truncated archive would have
  # been snapshotted as a good backup.
  if ! pg_dump -U "$POSTGRES_USER" -Fc -f "$DUMP" "$POSTGRES_DB"; then
    rm -f "$DUMP"
    return 1
  fi

  # gzip replaces "$DUMP" with "$DUMP.gz"; -f overwrites a stale archive left
  # by an earlier run whose cleanup hook never fired.
  if ! gzip -f "$DUMP"; then
    rm -f "$DUMP" "$DUMP.gz"
    return 1
  fi
}
# Remove the compressed dump once it is no longer needed.
# Globals:  DUMP (read)
# Returns:  exit status of rm; an already-missing file is not an error (-f).
backup_cleanup() {
  rm -f "${DUMP}.gz"
}
# Decompress "$DUMP.gz", load it into $POSTGRES_DB and reset client connections.
# Globals:  DUMP (read), POSTGRES_USER (read), POSTGRES_DB (read)
# Returns:  0 on success; non-zero if decompression, pg_restore or psql fails.
#           The decompressed dump is removed whether or not pg_restore succeeds.
restore() {
  # -f: overwrite a stale "$DUMP" left by an interrupted earlier restore;
  # without it gzip refuses to decompress and every later restore fails.
  gzip -df "$DUMP.gz" || return 1

  # --if-exists: otherwise DROPs on objects absent from the live db error out and
  # pg_restore exits 1, killing the chain.
  if ! pg_restore --clean --if-exists -U "$POSTGRES_USER" --dbname="$POSTGRES_DB" < "$DUMP"; then
    # Do not leave the decompressed dump behind in the data volume on failure.
    rm -f "$DUMP"
    return 1
  fi
  rm -f "$DUMP"

  # pg_restore --clean recreates objects under the live app, so its pooled connections
  # keep stale type-OID caches ('cache lookup failed for type ...' crash loops, e.g.
  # Oban). Terminate them so Ecto reconnects fresh.
  psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = current_database() AND pid <> pg_backend_pid();"
}

# Dispatch: run the command line given to this script, e.g. `backup`,
# `backup_cleanup` or `restore`.
"$@"