Compare commits
4 Commits
fix/lint-a
...
test/plaus
| Author | SHA1 | Date | |
|---|---|---|---|
| | 79c652ddd3 | | |
| | 68ef0f84fb | | |
| | c828f6cdd0 | | |
| | 1ba0d961a3 | | |
@ -340,18 +340,22 @@ def _stack_name(domain: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def services_converged(domain: str) -> bool:
|
def services_converged(domain: str) -> bool:
|
||||||
"""True when every service in the stack reports replicas N/N (N>0)."""
|
"""True when every service in the stack reports replicas N/N (N>0) AND no service is
|
||||||
|
mid-rolling-update (swarm UpdateStatus settled)."""
|
||||||
stack = _stack_name(domain)
|
stack = _stack_name(domain)
|
||||||
proc = subprocess.run(
|
proc = subprocess.run(
|
||||||
["docker", "stack", "services", stack, "--format", "{{.Replicas}}"],
|
["docker", "stack", "services", stack, "--format", "{{.Name}} {{.Replicas}}"],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
)
|
)
|
||||||
rows = [r for r in proc.stdout.split("\n") if r.strip()]
|
rows = [r for r in proc.stdout.split("\n") if r.strip()]
|
||||||
if not rows:
|
if not rows:
|
||||||
return False
|
return False
|
||||||
|
names = []
|
||||||
for r in rows:
|
for r in rows:
|
||||||
cur, _, want = r.partition("/")
|
name, _, replicas = r.partition(" ")
|
||||||
|
names.append(name)
|
||||||
|
cur, _, want = replicas.partition("/")
|
||||||
# A service at its DESIRED replica count is converged — including a `replicas: 0`
|
# A service at its DESIRED replica count is converged — including a `replicas: 0`
|
||||||
# on-demand one-shot (e.g. lasuite-drive's `minio-createbuckets`, which is scaled up
|
# on-demand one-shot (e.g. lasuite-drive's `minio-createbuckets`, which is scaled up
|
||||||
# manually only when buckets need (re)creating), which reports "0/0". The earlier
|
# manually only when buckets need (re)creating), which reports "0/0". The earlier
|
||||||
@ -360,6 +364,28 @@ def services_converged(domain: str) -> bool:
|
|||||||
# still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
|
# still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
|
||||||
if not want or cur != want:
|
if not want or cur != want:
|
||||||
return False
|
return False
|
||||||
|
# N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that changes
|
||||||
|
# a non-app service image (e.g. immich's db pin) registers the update immediately, but swarm may
|
||||||
|
# not have cycled that service's task yet — the OLD task still shows 1/1, then dies seconds later
|
||||||
|
# (immich CI 238: backupbot exec'd the db pre-hook into the just-killed container → 409). Require
|
||||||
|
# every service's UpdateStatus to be settled too, so the wait spans the whole rolling update.
|
||||||
|
proc = subprocess.run(
|
||||||
|
[
|
||||||
|
"docker",
|
||||||
|
"service",
|
||||||
|
"inspect",
|
||||||
|
*names,
|
||||||
|
"--format",
|
||||||
|
"{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}",
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
if proc.returncode != 0:
|
||||||
|
return False # a service vanished mid-check — not settled
|
||||||
|
for state in proc.stdout.split("\n"):
|
||||||
|
if state.strip() not in ("", "completed", "rollback_completed"):
|
||||||
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
@ -572,6 +598,16 @@ def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None:
|
|||||||
|
|
||||||
def backup_app(domain: str) -> str:
|
def backup_app(domain: str) -> str:
|
||||||
"""Create a backup; return the abra/restic output (carries the produced snapshot_id)."""
|
"""Create a backup; return the abra/restic output (carries the produced snapshot_id)."""
|
||||||
|
# Never back up a stack that is still converging/rolling-updating: backupbot resolves each
|
||||||
|
# service's hook container ONCE up front, so a task that cycles between that lookup and the
|
||||||
|
# pre-hook exec crashes the whole backup with a 409 (immich CI 238). Bounded wait — on timeout
|
||||||
|
# we still attempt the backup and let the tier's assertion deliver the verdict.
|
||||||
|
deadline = time.time() + 300
|
||||||
|
while time.time() < deadline and not services_converged(domain):
|
||||||
|
print(
|
||||||
|
f" backup: {domain} stack not settled yet — waiting before backup create", flush=True
|
||||||
|
)
|
||||||
|
time.sleep(5)
|
||||||
return abra.backup_create(domain)
|
return abra.backup_create(domain)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -59,7 +59,7 @@ def _register_site(domain: str, site: str) -> None:
|
|||||||
f"SELECT domain FROM sites WHERE domain = '{site}';"
|
f"SELECT domain FROM sites WHERE domain = '{site}';"
|
||||||
)
|
)
|
||||||
out = lifecycle.exec_in_app(
|
out = lifecycle.exec_in_app(
|
||||||
domain, ["psql", "-U", "plausible", "-d", "plausible", "-tAc", sql], service="db"
|
domain, ["psql", "-q", "-U", "plausible", "-d", "plausible", "-tAc", sql], service="db"
|
||||||
).strip()
|
).strip()
|
||||||
assert out == site, f"site {site!r} not registered in postgres (got {out!r})"
|
assert out == site, f"site {site!r} not registered in postgres (got {out!r})"
|
||||||
|
|
||||||
|
|||||||
@ -20,3 +20,12 @@ EXTRA_ENV = {
|
|||||||
# 64-char stable value for CI — plausible (Phoenix) requires >= 64 chars
|
# 64-char stable value for CI — plausible (Phoenix) requires >= 64 chars
|
||||||
"SECRET_KEY_BASE": "ccciplausibletestkeybase64charsexactlyforCIephemeral4567890123",
|
"SECRET_KEY_BASE": "ccciplausibletestkeybase64charsexactlyforCIephemeral4567890123",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# The upgrade tier defaults its base to recipe_versions[-2]. For the 3.1.0 upgrade PR the
|
||||||
|
# published tags end [..., 3.0.0+v2.0.0, 3.0.1+v2.0.0], so [-2] picks 3.0.0 — whose clickhouse
|
||||||
|
# entrypoint has no x86_64 ARCH mapping (added in 3.0.1): on amd64 it wgets the nonexistent
|
||||||
|
# clickhouse-backup-linux-x86_64.tar.gz (HTTP 404), exits 1 silently (set -e + silenced wget)
|
||||||
|
# and crash-loops, so the base deploy can NEVER converge on this host. The PR adds its version
|
||||||
|
# ABOVE the newest published tag — the documented case where the correct base is [-1], the
|
||||||
|
# newest published version. Pin it.
|
||||||
|
UPGRADE_BASE_VERSION = "3.0.1+v2.0.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user