cc-ci/tests/lasuite-drive/ops.py

"""lasuite-drive — pre-op seed hooks (Phase 1e HC3). The orchestrator runs these BEFORE the op; the
matching test_<op>.py asserts post-op (assertion-only). The marker is a dedicated `ci_marker` row in
postgres (independent of the app's Django migrations — CREATE TABLE IF NOT EXISTS), written via psql
in the `db` service. The backup path exercises the recipe's pg_backup.sh DB-dump hook (postgres is
backupbot-labelled)."""

import os
import subprocess
import sys
import time

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
from harness import lifecycle  # noqa: E402


def pre_install(ctx):
    """Post-deploy seed for the custom tier (the former setup_custom_tests.sh, moved here in rcust
    P2b — install_steps.sh runs PRE-deploy and cannot touch the live stack). The deploy alone does
    NOT create the MinIO bucket: `minio-createbuckets` is a `replicas:0` one-shot (restart_policy:
    none) that must be triggered. The MinIO storage test asserts the bucket exists, so trigger it
    here and poll. `--detach` is REQUIRED: the job creates the bucket then EXITS 0, so it never
    holds a steady 1/1 replica — a blocking scale would wait forever.

    BEST-EFFORT, like the setup_custom_tests.sh it replaced: on poll timeout we WARN and continue
    (the one-shot often lands just after the window). The custom-tier MinIO storage test is the
    real gate for a genuinely missing bucket — failing the install op here was an rcust M2
    regression (the original hook fell through on timeout by design).

    A non-zero exit from the scale command is also only WARNED about (with docker's stderr), never
    raised: the bucket may already exist, in which case the first poll succeeds anyway.

    `ctx` only needs a `domain` attribute; the stack name is the domain with dots replaced by
    underscores. Returns None in every case."""
    stack = ctx.domain.replace(".", "_")
    one_shot = f"{stack}_minio-createbuckets"
    polls, interval = 30, 3  # 30 polls x 3s = the 90s window quoted in the timeout warning
    print("  pre_install: creating MinIO bucket via the minio-createbuckets one-shot", flush=True)
    scaled = subprocess.run(
        ["docker", "service", "scale", "--detach", f"{one_shot}=1"],
        capture_output=True,
        text=True,
        check=False,
    )
    if scaled.returncode != 0:
        # Surface the failure instead of silently polling for 90s; stay best-effort (no raise).
        print(
            f"  !! pre_install: scaling {one_shot} to 1 failed (rc={scaled.returncode}): "
            f"{scaled.stderr.strip()} — continuing (best-effort)",
            flush=True,
        )
    # Runs inside the minio container: register an mc alias from the mounted secrets, then list
    # the bucket; the whole chain exits 0 only when the bucket is listable.
    check = (
        'mc alias set _c http://localhost:9000 "$(cat /run/secrets/minio_ru)" '
        '"$(cat /run/secrets/minio_rp)" >/dev/null 2>&1 && '
        "mc ls _c/drive-media-storage >/dev/null 2>&1"
    )
    for attempt in range(1, polls + 1):
        # Re-resolve the container id on every poll: the minio task may not be up yet.
        cid = subprocess.run(
            ["docker", "ps", "-q", "-f", f"name={stack}_minio.1"],
            capture_output=True,
            text=True,
            check=False,
        ).stdout.split()
        if cid:
            probe = subprocess.run(
                ["docker", "exec", cid[0], "sh", "-c", check], capture_output=True, check=False
            )
            if probe.returncode == 0:
                print(
                    f"  pre_install: bucket drive-media-storage present after {attempt} poll(s)",
                    flush=True,
                )
                return
        if attempt < polls:  # no point sleeping after the final poll
            time.sleep(interval)
    print(
        "  !! pre_install: minio-createbuckets one-shot did not create drive-media-storage in "
        f"{polls * interval}s "
        "— continuing (best-effort, as the pre-restructure hook did); the custom-tier MinIO test "
        "gates a genuinely missing bucket",
        flush=True,
    )


def _wait_collabora_ready(domain, timeout=420):
    """Gate the upgrade op on collabora being FULLY ready (WOPI discovery endpoint → 200), not just
    container 1/1 'running'. coolwsd takes ~2min to boot (pre-reads 1300+ l10n files + RSA keygen);
    the install wait_healthy returns on container 1/1 while coolwsd is still loading. An in-place
    `abra app deploy --chaos` upgrade that lands on a still-booting collabora SIGTERMs it mid-init
    ("Shutdown requested while starting up", forced exit 70) → abra aborts the deploy (Q3.2a run 1,
    JOURNAL 2026-05-29). Waiting for discovery=200 first makes the redeploy replace a ready collabora
    cleanly. collabora routes on the COLLABORA_DOMAIN sibling (collabora-<domain>); /hosting/discovery
    is the WOPI discovery endpoint celery's configure_wopi calls."""
    host = f"collabora-{domain}"
    deadline = time.time() + timeout
    last = 0
    while time.time() < deadline:
        last = lifecycle.http_get(host, "/hosting/discovery", timeout=15)
        if last == 200:
            print(f"  pre_upgrade: collabora WOPI discovery ready (200) on {host}", flush=True)
            return
        time.sleep(5)
    raise AssertionError(
        f"collabora WOPI discovery not ready on {host} (last status {last}) within {timeout}s"
    )


def _psql(domain, sql):
    """Run `sql` with psql inside the app's `db` service and return the stripped result.

    psql connects as user `drive` to database `drive`, taking the password from the
    /run/secrets/postgres_p secret, with `-tA` (tuples only, unaligned) so a single-value query
    yields just that value. The statement is shell-quoted before being handed to `sh -c`, so
    characters such as `"`, `$`, backticks or backslashes inside the SQL reach psql verbatim
    instead of being interpreted by the shell. Whatever `lifecycle.exec_in_app` returns is
    stripped and returned (presumably the command's stdout as text — confirm in the harness)."""
    import shlex  # local import: only this helper needs it

    cmd = (
        "PGPASSWORD=$(cat /run/secrets/postgres_p) "
        f"psql -U drive -d drive -tAc {shlex.quote(sql)}"
    )
    return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()


def _seed(domain, value):
    """Write `value` as the only row of the `ci_marker` table and verify it reads back.

    The table is created if missing (CREATE TABLE IF NOT EXISTS) and emptied first, so after a
    successful call it holds exactly one row. Single quotes in `value` are doubled so the value
    forms a valid SQL string literal.

    Raises AssertionError if the value read back differs from `value`. The check is an explicit
    raise rather than an `assert` statement so it still runs under `python -O`."""
    literal = value.replace("'", "''")
    _psql(
        domain,
        "CREATE TABLE IF NOT EXISTS ci_marker(v text); DELETE FROM ci_marker; "
        f"INSERT INTO ci_marker VALUES('{literal}');",
    )
    stored = _psql(domain, "SELECT v FROM ci_marker;")
    if stored != value:
        raise AssertionError(
            f"ci_marker seed did not take: expected {value!r}, read back {stored!r}"
        )


def pre_upgrade(ctx):
    """Pre-upgrade hook: wait for collabora, then plant the data-integrity marker.

    The chaos redeploy must only start once collabora is fully ready — otherwise it kills a
    still-booting coolwsd and abra aborts the upgrade deploy (Q3.2a run 1). Only after that gate
    passes is the `upgrade-survives` marker seeded."""
    domain = ctx.domain
    _wait_collabora_ready(domain)
    _seed(domain, "upgrade-survives")


def pre_backup(ctx):
    """Pre-backup hook: seed the `original` marker row so it exists when the backup runs."""
    marker = "original"
    _seed(ctx.domain, marker)


def pre_restore(ctx):
    """Pre-restore hook: drop the marker table so a successful restore is observable.

    Dropping `ci_marker` makes the live database diverge from the backup. The drop is then
    verified with to_regclass, which yields NULL once the table is gone.

    Raises AssertionError (including the value that was returned) if the table still resolves.
    The check is an explicit raise rather than an `assert` statement so it still runs under
    `python -O`."""
    _psql(ctx.domain, "DROP TABLE ci_marker;")
    remaining = _psql(ctx.domain, "SELECT to_regclass('public.ci_marker');")
    # psql prints NULL as an empty string by default; a literal "NULL" is accepted as well.
    if remaining not in ("", "NULL"):
        raise AssertionError(f"drop did not take (to_regclass returned {remaining!r})")