From 9f8bcbc9e302e13ec0ecb7f01bab1b823722449d Mon Sep 17 00:00:00 2001
From: autonomic-bot
Date: Tue, 9 Jun 2026 19:09:13 +0000
Subject: [PATCH] fix: clickhouse-backup install must succeed loudly, never silently degrade
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the previous best-effort (|| true) approach: a deploy without
clickhouse-backup would have silently broken backup/restore, so the
entrypoint now hard-fails (visibly, in service logs) if the tool truly
cannot be installed — but makes that case effectively unreachable:

- cache the VERIFIED binary on the persistent clickhouse volume, keyed
  by version: downloaded at most once per app; container restarts never
  re-fetch (kills the re-download amplification that turned a GitHub
  throttle into a permanent crash-loop)
- canonical Altinity release URL (project moved; old path is a redirect)
- bounded retries with backoff + wget read timeout (a stalled connection
  can no longer hang the deploy)
- verify the binary executes before trusting or caching it (catches
  truncated downloads and a corrupt cache)
- compose: fix app depends_on to the real service name
  (plausible_events_db) — docker compose config was failing on it, which
  disabled CI image prepull and pushed pulls into the deploy window
- bump CLICKHOUSE_ENTRYPOINT_VERSION v4 -> v5 (swarm configs immutable)

Verified on a dev deploy: fresh download path, cached-restart path,
clickhouse-backup create/list/delete, and /api/health all green.
---
 abra.sh                  |  2 +-
 compose.yml              |  2 +-
 entrypoint.clickhouse.sh | 80 +++++++++++++++++++++++++++-------------
 3 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/abra.sh b/abra.sh
index f9a789f..9e41df0 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1,3 +1,3 @@
 export CLICKHOUSE_CONF_VERSION=v2
 export CLICKHOUSE_USER_CONF_VERSION=v2
-export CLICKHOUSE_ENTRYPOINT_VERSION=v4
+export CLICKHOUSE_ENTRYPOINT_VERSION=v5
diff --git a/compose.yml b/compose.yml
index a1bb359..037d79e 100644
--- a/compose.yml
+++ b/compose.yml
@@ -7,7 +7,7 @@ services:
     command: sh -c "sleep 10 && /entrypoint.sh db createdb && /entrypoint.sh db migrate && /entrypoint.sh run"
     depends_on:
       - db
-      - events_db
+      - plausible_events_db
     environment:
       - BASE_URL=https://$DOMAIN
       - SECRET_KEY_BASE
diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh
index 861cdbc..c60f48d 100644
--- a/entrypoint.clickhouse.sh
+++ b/entrypoint.clickhouse.sh
@@ -1,17 +1,22 @@
 #!/bin/bash
-# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`).
-# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it
-# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the
-# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started -> swarm
-# restarted it -> re-downloaded -> amplified the throttle -> crash-loop -> deploy timeout (cc-ci Q4.7).
+# clickhouse-backup powers this recipe's backup/restore (the backupbot pre/post-hooks run
+# `clickhouse-backup create/restore`). A deploy without it would have silently broken backups,
+# so if it truly cannot be installed the deploy must FAIL LOUDLY rather than degrade.
 #
-# Hardening (no behaviour change when the download succeeds first try):
-# - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse) so it is fetched
-#   at most once and reused on every container restart (no re-download amplification);
-# - retry with backoff;
-# - NEVER let a download failure block the server start (best-effort: the server comes up, backup/
-#   restore degrade until the next successful fetch);
-# - un-silenced so a failure is diagnosable in `docker service logs`.
+# The published recipe fetched it with a single silenced no-retry wget at every container start:
+# any transient GitHub failure exited the container (set -e) before clickhouse-server started,
+# and the swarm restart loop re-downloaded the 22 MB asset on every restart, amplifying a
+# throttle into a permanent crash-loop and a deploy timeout (cc-ci Q4.7).
+#
+# Hardening (no behaviour change when the fetch succeeds first try):
+# - cache the VERIFIED binary on the persistent clickhouse volume, keyed by version, so it is
+#   downloaded at most once per app and container restarts never re-fetch;
+# - canonical Altinity URL (the project moved; the old AlexAkulov path is just a redirect);
+# - bounded retries with backoff + a read timeout, so a stalled connection cannot hang the
+#   deploy and a transient failure cannot kill it;
+# - verify the binary actually executes before trusting or caching it (catches truncated
+#   downloads and a corrupt cache);
+# - un-silenced: every attempt and the final verdict are visible in `docker service logs`.
 
 set -e
 
@@ -31,32 +36,55 @@ elif [[ $ARCH =~ "x86_64" ]]; then
 fi
 
 CACHE_DIR=/var/lib/clickhouse/.ccci-bin
-CACHED="${CACHE_DIR}/clickhouse-backup"
+CACHED="${CACHE_DIR}/clickhouse-backup-v${CLICKHOUSE_BACKUP_VERSION}"
 BIN=/usr/local/bin/clickhouse-backup
-URL="https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
+URL="https://github.com/Altinity/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
+TARBALL=/tmp/clickhouse-backup.tar.gz
+
+# True only when the file at $1 runs and answers `--version`.
+binary_ok() {
+  "$1" --version >/dev/null 2>&1
+}
 
+# Put a working clickhouse-backup at $BIN: reuse the verified per-version cache when
+# possible, otherwise download, verify and cache it. Returns 1 when all attempts fail.
 install_clickhouse_backup() {
+  local attempt
   mkdir -p "$CACHE_DIR"
-  if [ -x "$CACHED" ]; then
+  if [ -x "$CACHED" ] && binary_ok "$CACHED"; then
     cp -f "$CACHED" "$BIN"
-    echo "clickhouse-backup: restored from persistent cache ($CACHED)"
+    echo "clickhouse-backup: using verified cached binary ($CACHED)"
     return 0
   fi
+  rm -f "$CACHED"  # absent or fails to execute — re-fetch
   for attempt in 1 2 3 4 5; do
-    if wget --continue --output-document=/tmp/clickhouse-backup.tar.gz "$URL" \
-      && tar -xf /tmp/clickhouse-backup.tar.gz --directory=/usr/local/bin --strip-components=3; then
-      cp -f "$BIN" "$CACHED" 2>/dev/null || true
-      echo "clickhouse-backup: downloaded + cached (attempt ${attempt})"
-      return 0
+    # --continue lets a retry resume a partial tarball left by a timed-out attempt.
+    if wget -T 30 --continue --output-document="$TARBALL" "$URL"; then
+      if tar -xf "$TARBALL" --directory=/usr/local/bin --strip-components=3 \
+        && binary_ok "$BIN"; then
+        rm -f "$TARBALL"
+        # Stage then rename, so an interrupted copy never leaves a partial cache entry.
+        if cp -f "$BIN" "${CACHED}.tmp" && mv -f "${CACHED}.tmp" "$CACHED"; then
+          echo "clickhouse-backup: downloaded, verified + cached (attempt ${attempt})"
+        else
+          rm -f "${CACHED}.tmp"
+          echo "clickhouse-backup: downloaded + verified (attempt ${attempt}), but could not write cache ($CACHED)" >&2
+        fi
+        return 0
+      fi
+      # wget succeeded but the payload is unusable: drop it, or --continue would treat
+      # it as already complete and every later attempt would re-test the same bad file.
+      rm -f "$TARBALL"
     fi
-    echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2
-    sleep $((attempt * 10))
+    echo "clickhouse-backup: fetch attempt ${attempt}/5 failed" >&2
+    if [ "$attempt" -lt 5 ]; then
+      sleep $((attempt * 10))
+    fi
   done
-  echo "clickhouse-backup: fetch FAILED after retries — starting clickhouse-server WITHOUT the backup tool (backup/restore unavailable until a later restart fetches it)" >&2
+  echo "clickhouse-backup: could not install after 5 attempts — failing the deploy (without it backup/restore would be silently broken)" >&2
   return 1
 }
 
-# Best-effort: the server MUST start even if the backup-tool fetch fails (it is not a server dependency).
-install_clickhouse_backup || true
+install_clickhouse_backup
 
 exec /entrypoint.sh