From 7f15367d1f2d0ae777748a0ad2378504312397e3 Mon Sep 17 00:00:00 2001 From: autonomic-bot Date: Sun, 31 May 2026 05:21:23 +0000 Subject: [PATCH] backlog(2): plausible Q4.7b scoped + ready (staged hardened entrypoint.clickhouse.sh; mirror+PR+run steps); queued behind Adversary Q4.6/F2-14c verifies --- machine-docs/BACKLOG-2.md | 22 +++++-- .../plausible-entrypoint.clickhouse.sh.fixed | 64 +++++++++++++++++++ 2 files changed, 81 insertions(+), 5 deletions(-) create mode 100644 machine-docs/plausible-entrypoint.clickhouse.sh.fixed diff --git a/machine-docs/BACKLOG-2.md b/machine-docs/BACKLOG-2.md index cf12a5a..1c4514a 100644 --- a/machine-docs/BACKLOG-2.md +++ b/machine-docs/BACKLOG-2.md @@ -199,11 +199,23 @@ Phase plan: `/srv/cc-ci/cc-ci-plan/plan-phase2-recipe-tests.md` when GitHub answers the first wget (proven: install,custom run + probe). Path to green: GitHub cooldown + ONE clean full run. Test content is correct; this is upstream-recipe fragility. - [ ] **Q4.7b** — plausible recipe PR (DEFERRED robustness, like Q3.2b/immich): harden - `entrypoint.clickhouse.sh` — cache clickhouse-backup on the persistent `/var/lib/clickhouse` - volume (skip-if-present → no re-download amplification), retry-with-backoff, `set +e` so a - download failure never blocks clickhouse-server start. NOTE: only fixes the upgrade tier + FUTURE - installs once released (install tier deploys the prev PUBLISHED version), so it does NOT unblock - this gate's install tier under throttle. Use recipe-create-pr skill; merge rule per Q3.2b. + `entrypoint.clickhouse.sh`. 
**READY-TO-EXECUTE (scoped 2026-05-31):** the fixed file is staged at + `machine-docs/plausible-entrypoint.clickhouse.sh.fixed` — caches clickhouse-backup on the persistent + `event-data:/var/lib/clickhouse/.ccci-bin` volume (skip-if-present → no re-download amplification), + retry×5 w/ backoff, best-effort `install_clickhouse_backup || true` so a download failure NEVER + blocks `exec /entrypoint.sh` (the server start), un-silenced. Root cause confirmed: published + entrypoint is `set -ex` + single silenced no-retry wget of a 22MB GitHub tarball to ephemeral /tmp + → any transient throttle exits before the server starts → swarm restart-storm → amplified throttle. + **Execution steps (node-free except the final run):** (1) mirror `coop-cloud/plausible` → + `recipe-maintainers/plausible` (NOT mirrored yet; gitea API POST /orgs/recipe-maintainers/repos + + `git clone --mirror` upstream → push, incl tags — plan §0b / recipe-create-pr). (2) branch + `ci/clickhouse-backup-resilient`, replace `entrypoint.clickhouse.sh` with the staged file, push, + open PR. (3) on the FRESH-IP Hetzner box the first wget should succeed (no accumulated throttle), + so a single full `RECIPE=plausible PR= REF= SRC=recipe-maintainers/plausible` run should + go green (install+upgrade+backup-restore). NOTE: the install tier deploys the prev PUBLISHED + version (old entrypoint), so its green-ness still depends on the fresh-IP download succeeding; the + PR makes the upgrade-tier head deploy + within-run restarts resilient (cache). Merge rule per Q3.2b. + **QUEUED behind the Adversary's Q4.6 + F2-14c cold-verifies (single node, MAX_TESTS=1).** - [ ] **Q4.7 gate** — full lifecycle (install+upgrade+backup-restore) green via clean run + Adversary. - [x] **Q4.8** — uptime-kuma: enrolled. PARITY.md + recipe_meta.py + 3 functional tests (health_check, socketio_handshake, spa_branding). Cold green (commit `1aaf3bd`). 
diff --git a/machine-docs/plausible-entrypoint.clickhouse.sh.fixed b/machine-docs/plausible-entrypoint.clickhouse.sh.fixed
new file mode 100644
index 0000000..f8c49c4
--- /dev/null
+++ b/machine-docs/plausible-entrypoint.clickhouse.sh.fixed
@@ -0,0 +1,99 @@
+#!/bin/bash
+# clickhouse entrypoint (cc-ci Q4.7b hardening — recipe-PR for recipe-maintainers/plausible).
+#
+# clickhouse-backup is the BACKUP tool (backupbot pre/post-hooks: `clickhouse-backup create/restore`).
+# It is NOT required for clickhouse-SERVER (`/entrypoint.sh`) to run. The published recipe fetched it
+# with `set -ex` + a single silenced no-retry wget to ephemeral /tmp, so ANY transient failure of the
+# 22 MB GitHub download (rate-limit / network) exited the container BEFORE the server started → swarm
+# restarted it → re-downloaded → amplified the throttle → crash-loop → deploy timeout (cc-ci Q4.7).
+#
+# Hardening (no behaviour change when the download succeeds first try):
+#   - cache the binary on the PERSISTENT clickhouse data volume (/var/lib/clickhouse), keyed by
+#     version + arch, so each release is fetched at most once and reused on every container
+#     restart (no re-download amplification; a version bump re-fetches, never reuses a stale binary);
+#   - retry with backoff, discarding a tarball that fails to unpack so `wget --continue` cannot
+#     keep reusing a corrupt file;
+#   - NEVER let a download failure block the server start (best-effort: the server comes up, backup/
+#     restore degrade until the next successful fetch);
+#   - un-silenced so a failure is diagnosable in `docker service logs`.
+
+set -e
+
+CLICKHOUSE_BACKUP_VERSION=2.4.2
+
+# Map `uname -m` onto the architecture suffix used in the release asset name.
+ARCH=$(uname -m)
+if [[ $ARCH =~ "aarch64" ]]; then
+  ARCH="arm64"
+elif [[ $ARCH =~ "armv5l" ]]; then
+  ARCH="armv5"
+elif [[ $ARCH =~ "armv6l" ]]; then
+  ARCH="armv6"
+elif [[ $ARCH =~ "armv7l" ]]; then
+  ARCH="armv7"
+elif [[ $ARCH =~ "x86_64" ]]; then
+  ARCH="amd64"
+fi
+
+CACHE_DIR=/var/lib/clickhouse/.ccci-bin
+# The cache key carries version + arch: a bare "clickhouse-backup" name would keep restoring the
+# old binary after CLICKHOUSE_BACKUP_VERSION is bumped.
+CACHED="${CACHE_DIR}/clickhouse-backup-${CLICKHOUSE_BACKUP_VERSION}-${ARCH}"
+BIN=/usr/local/bin/clickhouse-backup
+TARBALL=/tmp/clickhouse-backup.tar.gz
+URL="https://github.com/AlexAkulov/clickhouse-backup/releases/download/v${CLICKHOUSE_BACKUP_VERSION}/clickhouse-backup-linux-${ARCH}.tar.gz"
+MAX_ATTEMPTS=5
+
+# Install clickhouse-backup into $BIN: from the persistent cache when present, otherwise by
+# downloading the release tarball (up to MAX_ATTEMPTS tries, linear backoff between tries).
+# Globals: CACHE_DIR, CACHED, BIN, TARBALL, URL, MAX_ATTEMPTS (read)
+# Returns: 0 once $BIN is installed, 1 when every attempt failed.
+install_clickhouse_backup() {
+  local attempt staged
+
+  # Losing the cache only costs a re-download on a later restart, so never fail on it.
+  mkdir -p "$CACHE_DIR" \
+    || echo "clickhouse-backup: cannot create ${CACHE_DIR}; continuing without the persistent cache" >&2
+
+  if [ -x "$CACHED" ]; then
+    if cp -f "$CACHED" "$BIN"; then
+      echo "clickhouse-backup: restored from persistent cache ($CACHED)"
+      return 0
+    fi
+    echo "clickhouse-backup: restoring from cache ($CACHED) failed; falling back to download" >&2
+  fi
+
+  for ((attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)); do
+    if wget --continue --output-document="$TARBALL" "$URL"; then
+      if tar -xf "$TARBALL" --directory=/usr/local/bin --strip-components=3 && [ -x "$BIN" ]; then
+        rm -f "$TARBALL"
+        # Publish to the cache atomically (copy, then rename) so an interrupted or out-of-space
+        # copy can never leave a truncated executable that later restarts would restore.
+        staged="${CACHED}.partial.$$"
+        if cp -f "$BIN" "$staged" && mv -f "$staged" "$CACHED"; then
+          echo "clickhouse-backup: downloaded + cached (attempt ${attempt})"
+        else
+          rm -f "$staged"
+          echo "clickhouse-backup: downloaded (attempt ${attempt}) but caching to ${CACHED} failed" >&2
+        fi
+        return 0
+      fi
+      # wget reported success but the archive is unusable: drop it, otherwise --continue would
+      # keep reusing the bad file and every remaining attempt would fail the same way.
+      rm -f "$TARBALL"
+    fi
+    if ((attempt < MAX_ATTEMPTS)); then
+      echo "clickhouse-backup: fetch attempt ${attempt} failed; backing off $((attempt * 10))s" >&2
+      sleep $((attempt * 10))
+    else
+      echo "clickhouse-backup: fetch attempt ${attempt} failed" >&2
+    fi
+  done
+  echo "clickhouse-backup: fetch FAILED after retries — starting clickhouse-server WITHOUT the backup tool (backup/restore unavailable until a later restart fetches it)" >&2
+  return 1
+}
+
+# Best-effort: the server MUST start even if the backup-tool fetch fails (it is not a server dependency).
+install_clickhouse_backup || true
+
+exec /entrypoint.sh