From 13da216f8d62daf1006c8b5a1517ca5608a99fa6 Mon Sep 17 00:00:00 2001
From: autonomic-bot
Date: Sat, 30 May 2026 05:23:47 +0100
Subject: [PATCH] =?UTF-8?q?fix(2):=20ghost=20healthcheck=20start=5Fperiod?=
 =?UTF-8?q?=20overlay=20=E2=80=94=20fixes=20fresh-migration=20lock=20deadl?=
 =?UTF-8?q?ock?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause: Ghost's fresh-DB first boot runs a ~6-9min schema migration (round-trip-bound, not
CPU); the recipe healthcheck start_period:1m (~6min grace) kills the still-migrating task, leaving
a stale migrations_lock → every later task deadlocks (MigrationsAreLockedError). Hit on both 2- and
4-vCPU.

Fix (cc-ci deploy overlay, NOT a recipe/test change): compose.ccci-health.yml raises app
healthcheck start_period to 900s, wired via recipe_meta COMPOSE_FILE + install_steps.sh
(+ CHAOS_BASE_DEPLOY for the untracked overlay). No assertion weakened. Budget 1200s = migration +
convergence. Only the install tier needs it (upgrade redeploys on the populated DB → fast boot).
---
 tests/ghost/compose.ccci-health.yml | 18 ++++++++++++++++++
 tests/ghost/install_steps.sh        | 26 ++++++++++++++++++++++++++
 tests/ghost/recipe_meta.py          | 25 ++++++++++++++++---------
 3 files changed, 60 insertions(+), 9 deletions(-)
 create mode 100644 tests/ghost/compose.ccci-health.yml
 create mode 100755 tests/ghost/install_steps.sh

diff --git a/tests/ghost/compose.ccci-health.yml b/tests/ghost/compose.ccci-health.yml
new file mode 100644
index 0000000..7ec2b1a
--- /dev/null
+++ b/tests/ghost/compose.ccci-health.yml
@@ -0,0 +1,18 @@
+# cc-ci deploy overlay (NOT a recipe change) — raises ONLY the app healthcheck start_period.
+#
+# Ghost's first-boot runs a full schema migration (dozens of CREATE TABLEs, each a separate MySQL
+# round-trip → ~6-9min on cc-ci) against the fresh `ghost` DB. The upstream recipe healthcheck uses
+# `start_period: 1m` (+ 10×30s retries ≈ 6min grace); on cc-ci the migration regularly exceeds that,
+# so swarm marks the still-migrating task unhealthy and KILLS it mid-migration — which leaves a stale
+# `migrations_lock` row, and every later task then refuses to boot (`MigrationsAreLockedError`
+# deadlock). This is round-trip-bound, so more vCPU does not close the gap.
+#
+# Raising the START_PERIOD (failures ignored during it; a PASS still marks healthy immediately) lets
+# the fresh migration finish + release the lock, after which Ghost serves and the (unchanged) check
+# passes. This is DEPLOY/infra tuning, not a test change — no assertion is weakened, and the app's
+# real healthcheck still gates readiness. Applied via recipe_meta COMPOSE_FILE; only the install
+# tier's fresh migration needs it (the upgrade redeploy boots on the already-populated DB → fast).
+services:
+  app:
+    healthcheck:
+      start_period: 900s
diff --git a/tests/ghost/install_steps.sh b/tests/ghost/install_steps.sh
new file mode 100755
index 0000000..d816bce
--- /dev/null
+++ b/tests/ghost/install_steps.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# ghost — INSTALL-TIME hook (Phase 2 Q4.4). Runs during the install tier AFTER `abra app new` +
+# EXTRA_ENV + `abra app secret generate` and BEFORE the single `abra app deploy`
+# (lifecycle.py::_run_install_steps), with CCCI_RECIPE / CCCI_APP_DOMAIN / CCCI_APP_ENV in env.
+#
+# Purpose: provide the cc-ci deploy overlay `compose.ccci-health.yml` (app healthcheck start_period
+# bump) into the recipe checkout so recipe_meta's COMPOSE_FILE (compose.yml:compose.ccci-health.yml)
+# resolves. Without the larger start_period, Ghost's ~6-9min fresh-DB migration is killed mid-flight
+# by the recipe's 1m-start_period healthcheck, leaving a stale migrations_lock → deadlock (see the
+# overlay file header). The overlay is an UNTRACKED file in the recipe repo, so `git checkout -f`
+# (the upgrade tier's re-checkout to PR head) preserves it — COMPOSE_FILE keeps resolving across
+# install AND upgrade deploys. CHAOS_BASE_DEPLOY=True (recipe_meta) lets the pinned base deploy
+# proceed despite this untracked file (abra's clean-tree check would otherwise FATA).
+set -euo pipefail
+
+: "${CCCI_RECIPE:?missing CCCI_RECIPE}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}"
+
+if [ ! -d "$RECIPE_DIR" ]; then
+  echo " ghost install_steps: recipe dir $RECIPE_DIR missing — cannot provide health overlay" >&2
+  exit 1
+fi
+
+cp "$SCRIPT_DIR/compose.ccci-health.yml" "$RECIPE_DIR/compose.ccci-health.yml"
+echo " ghost install_steps: provided compose.ccci-health.yml (healthcheck start_period bump) to ${CCCI_RECIPE}"
diff --git a/tests/ghost/recipe_meta.py b/tests/ghost/recipe_meta.py
index d5a5861..e007239 100644
--- a/tests/ghost/recipe_meta.py
+++ b/tests/ghost/recipe_meta.py
@@ -8,14 +8,21 @@
 # mysqldump pre-hook; P4 (ops.py + test_{backup,restore,upgrade}.py) seeds a `ci_marker` row there.
 HEALTH_PATH = "/" # Ghost serves a themed site HTML at root (200)
 HEALTH_OK = (200,)
-DEPLOY_TIMEOUT = 900 # subprocess timeout for `abra app deploy`
+DEPLOY_TIMEOUT = 1200 # subprocess timeout for `abra app deploy`
 HTTP_TIMEOUT = 900
 
-# Ghost's first-boot does a full schema migration (dozens of tables) against a fresh MySQL `ghost`
-# DB. The migration must finish within the recipe healthcheck grace (start_period 1m + 10×30s ≈ 6min)
-# — otherwise swarm kills the still-migrating task, which leaves a stale `migrations_lock` row and
-# every later task then refuses to boot (`MigrationsAreLockedError` deadlock). On the cc-ci node with
-# 4 dedicated vCPU the migration completes well inside that grace and the app converges in a few
-# minutes, so 900s is an ample-but-bounded budget (fails fast if the deadlock ever recurs, rather
-# than a long blackout). See DECISIONS (ghost MySQL cold-boot).
-EXTRA_ENV = {"TIMEOUT": "900"}
+# Ghost's fresh-DB first boot runs a full schema migration (dozens of CREATE TABLEs, each a separate
+# MySQL round-trip → ~6-9min on cc-ci, round-trip-bound so more vCPU doesn't help). The upstream
+# recipe healthcheck (`start_period: 1m` + 10×30s ≈ 6min grace) is too tight: swarm kills the still-
+# migrating task, leaving a stale `migrations_lock` → every later task deadlocks
+# (`MigrationsAreLockedError`). cc-ci provides a DEPLOY overlay `compose.ccci-health.yml` (raises the
+# app healthcheck start_period to 900s; failures ignored during it, a PASS still marks healthy at
+# once) via COMPOSE_FILE + install_steps.sh, so the fresh migration finishes + releases the lock.
+# This is infra/deploy tuning — NO test/assertion is weakened. CHAOS_BASE_DEPLOY lets the pinned base
+# deploy proceed with the untracked overlay present. TIMEOUT 1200s = migration (≤9min) + convergence,
+# bounded so a genuine failure still fails (not a long blackout). See DECISIONS (ghost MySQL cold-boot).
+CHAOS_BASE_DEPLOY = True
+EXTRA_ENV = {
+    "TIMEOUT": "1200",
+    "COMPOSE_FILE": "compose.yml:compose.ccci-health.yml",
+}