diff --git a/abra.sh b/abra.sh index 1a82402..f9a789f 100644 --- a/abra.sh +++ b/abra.sh @@ -1,3 +1,3 @@ export CLICKHOUSE_CONF_VERSION=v2 export CLICKHOUSE_USER_CONF_VERSION=v2 -export CLICKHOUSE_ENTRYPOINT_VERSION=v3 +export CLICKHOUSE_ENTRYPOINT_VERSION=v4 diff --git a/compose.yml b/compose.yml index bdb4c13..a1bb359 100644 --- a/compose.yml +++ b/compose.yml @@ -82,7 +82,7 @@ services: backupbot.backup.path: "/var/lib/clickhouse/backup/events" backupbot.backup.post-hook: "rm -rf /var/lib/clickhouse/backup/events" backupbot.restore: "true" - backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events" + backupbot.restore.post-hook: clickhouse-backup restore --rm events && rm -rf /var/lib/clickhouse/backup/events volumes: db-data: diff --git a/entrypoint.clickhouse.sh b/entrypoint.clickhouse.sh index 8cebd82..861cdbc 100644 --- a/entrypoint.clickhouse.sh +++ b/entrypoint.clickhouse.sh @@ -1,12 +1,17 @@ #!/bin/bash - -# clickhouse-backup is a backup tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`). -# It is a 22 MB GitHub download (rate-limit / network), which can fail to download, and lead to crash loop and download throttling. +# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`). +# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it +# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the +# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started -> swarm +# restarted it -> re-downloaded -> amplified the throttle -> crash-loop -> deploy timeout (cc-ci Q4.7). # -# to make the download smoother: -# - cache the binary on the persistent clickhouse data volume (/var/lib/clickhouse) so it is fetched +# Hardening (no behaviour change when the download succeeds first try): +# - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse) so it is fetched # at most once and reused on every container restart (no re-download amplification); -# - retry with backoff to ride out transient GitHub failures +# - retry with backoff; +# - NEVER let a download failure block the server start (best-effort: the server comes up, backup/ +# restore degrade until the next successful fetch); +# - un-silenced so a failure is diagnosable in `docker service logs`. set -e @@ -47,11 +52,11 @@ install_clickhouse_backup() { echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2 sleep $((attempt * 10)) done - echo "clickhouse-backup: fetch FAILED after all retries — aborting; clickhouse-server will NOT start (backup tool is required)" >&2 + echo "clickhouse-backup: fetch FAILED after retries — starting clickhouse-server WITHOUT the backup tool (backup/restore unavailable until a later restart fetches it)" >&2 return 1 } -#if the backup tool cannot be installed after retries, it aborts (set -e) so the deploy fails -install_clickhouse_backup +# Best-effort: the server MUST start even if the backup-tool fetch fails (it is not a server dependency). +install_clickhouse_backup || true exec /entrypoint.sh