# cc-ci/runner/harness/deps.py

"""Dependency-resolver harness primitive (Phase 2 §4.2 / Q2.3).

A Phase-2 recipe may declare a set of OTHER recipes it requires to run its tests (e.g.
lasuite-docs requires keycloak as its SSO provider). The orchestrator reads the deps list,
deploys each one BEFORE the recipe-under-test, persists their per-run identity to a JSON file
the recipe's tests can read, and tears them down at the end of the run.

Per Phase-2 DECISIONS:
- Deps are declared on the cc-ci side in `tests/<recipe>/recipe_meta.py` as
  `DEPS = ["keycloak", ...]` (a list of recipe names). This keeps the cc-ci surface authoritative
  per plan §1.4 (cc-ci is self-contained at runtime).
- Each dep is deployed at a unique per-run domain `<dep[:4]>-<6hex>` (the same naming scheme as
  the recipe under test, but the 6hex is derived from `recipe + pr + ref + dep_name` so that the
  same dep declared by two different recipes never collides on a host).
- Dep deploys are SEQUENTIAL, never concurrent (per plan §4.2 — heavy deps + recipe under test
  must share the single node's MAX_TESTS budget without exceeding it).
- Each dep is undeployed in the orchestrator's `finally`, in **reverse** order so a recipe-under-
  test can depend on multiple deps with a dependency chain (a → b → c teardown is c → b → a).
- Dep deploys DO count toward the DG4.1 deploy-count invariant. The formula in run_recipe_ci.py is
  `expected_deploy_count = 1 + deps_deployed_count`, so each dep deploy increments the counter.

Run state:
- `$CCCI_DEPS_FILE` — JSON file written by the orchestrator after each dep deploys; each entry is
  `{"recipe": "<dep-recipe>", "domain": "<dep-domain>", "version": null}`. Tests access via the
  `deps` pytest fixture defined in `tests/conftest.py`.
"""

from __future__ import annotations

import contextlib
import json
import os
from collections.abc import Iterable

from . import lifecycle, naming
from . import meta as meta_mod


def dep_domain(parent_recipe: str, pr: str, ref: str | None, dep_recipe: str) -> str:
    """Return the per-run domain at which `dep_recipe` is deployed on behalf of `parent_recipe`.

    The domain is produced by `naming.app_domain`, called with `dep_recipe` as the recipe, the
    unchanged `pr`, and a synthetic ref of the form `<parent_recipe>|<ref or ''>|dep|<dep_recipe>`.
    A missing `ref` (None or empty) contributes an empty string to the synthetic ref.
    """
    # Fold both the parent and the dep name into the ref that gets hashed, so the same dep
    # requested by two different parents yields two different domains. dep_recipe is passed as
    # the recipe argument so the visible `<recipe[:4]>` prefix names the dep (e.g. `keyc-...`).
    hashed_ref = "{}|{}|dep|{}".format(parent_recipe, ref or "", dep_recipe)
    return naming.app_domain(dep_recipe, pr, hashed_ref)


def write_run_state(deps_state) -> None:
    """Write the deps state file ($CCCI_DEPS_FILE). Two shapes supported (canonical=keyed dict):

    1. **Legacy list-of-entries:** `[{"recipe": "<dep>", "domain": "<d>"}, ...]` (Q2.3 original).
       Still accepted by `load_run_state` for backwards compat — the `deps` fixture flattens.
    2. **NEW per-spec dict (operator-2026-05-28 SSO-dep plan §3.2):**
       `{"<dep_recipe>": {"recipe": "<dep>", "domain": "<d>", "realm": "...",
       "client_id": "...", "client_secret": "...", "admin_user": "...", "admin_password": "..."}}`.
       The per-recipe `install_steps.sh` hook reads this via `jq` to wire OIDC env.

    No-op if `$CCCI_DEPS_FILE` isn't set.

    Raises whatever `json.dumps` raises (TypeError/ValueError) when `deps_state` is not
    JSON-serializable; in that case the existing file is left untouched."""
    path = os.environ.get("CCCI_DEPS_FILE")
    if not path:
        return
    # Serialize BEFORE opening: open(..., "w") truncates immediately, so encoding straight into
    # the handle would leave an empty/partial file behind if serialization failed midway. A
    # corrupt state file is read back as "no deps", which would hide deployed deps from teardown.
    payload = json.dumps(deps_state)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)


def deploy_deps(
    parent_recipe: str,
    pr: str,
    ref: str | None,
    deps: Iterable[str],
    meta_for: dict | None = None,
) -> list[dict]:
    """Deploy each declared dep, sequentially, at its per-run domain.

    `meta_for` maps dep_recipe -> RecipeMeta (HEALTH_PATH/HEALTH_OK/timeouts) so the readiness
    wait uses per-dep config; a missing dep meta is loaded via meta.load()
    (defaults: /, 200/301/302, 600s).

    The run-state file is rewritten after EVERY dep that becomes ready (not just once at the
    end), so if a later dep fails the already-deployed deps are still recorded in
    `$CCCI_DEPS_FILE` and can be recovered via `load_run_state()` for teardown.

    Returns the list of state dicts (`{"recipe": ..., "domain": ...}`), one per dep, in deploy
    order. Re-raises whatever the deploy or readiness wait raised; a dep that fails readiness is
    torn down (best effort, unverified) before the exception propagates.
    """
    meta_for = meta_for or {}
    state: list[dict] = []
    for dep in deps:
        domain = dep_domain(parent_recipe, pr, ref, dep)
        print(f"  dep: deploying {dep} -> {domain}", flush=True)
        # Dep deploys count toward DG4.1 — the check expects (1 + len(cold-deps)), so each
        # dep that deploys here MUST be counted. The formula is authoritative in run_recipe_ci.py:
        #   expected_deploy_count = 1 + deps_deployed_count
        dm = meta_for.get(dep) or meta_mod.load(dep)
        # NOTE(review): a failure inside deploy_app itself is not followed by a teardown here —
        # confirm lifecycle.deploy_app cleans up after its own failures.
        lifecycle.deploy_app(
            dep,
            domain,
            secrets=True,
            deploy_timeout=int(dm.DEPLOY_TIMEOUT),
            meta=dm,
        )
        try:
            lifecycle.wait_healthy(
                domain,
                ok_codes=tuple(dm.HEALTH_OK),
                path=dm.HEALTH_PATH,
                deploy_timeout=int(dm.DEPLOY_TIMEOUT),
                http_timeout=int(dm.HTTP_TIMEOUT),
            )
        except Exception:
            # If a dep fails to converge, abort the whole resolve. The failing dep is removed
            # here; deps deployed earlier stay recorded in the state file for the caller.
            print(f"  dep: {dep} ({domain}) failed readiness; tearing down", flush=True)
            with contextlib.suppress(Exception):
                lifecycle.teardown_app(domain, verify=False)
            raise
        state.append({"recipe": dep, "domain": domain})
        # Persist immediately: the local `state` list is lost if a later dep raises, and an
        # unrecorded dep cannot be torn down by anyone.
        write_run_state(state)
        print(f"  dep: {dep} ready @ {domain}", flush=True)
    if not state:
        # No deps declared: still write the (empty) state so the file reflects this run.
        write_run_state(state)
    return state


def teardown_deps(state: list[dict] | dict[str, dict]) -> None:
    """Undeploy each dep in reverse order. **VERIFY=True (F2-5 fix)**: per plan §9 teardown is
    sacred — a dep that leaks containers/volumes/secrets corrupts the next run that uses the same
    deterministic dep domain.

    `state` may be either run-state shape: the legacy list of entries, or the keyed dict
    (`{"<dep_recipe>": {...entry...}}`), whose values are taken in insertion order. Entries that
    are not dicts, or that carry no `domain`, are skipped — there is nothing to tear down for them.

    Failures are LOGGED LOUDLY (not silently suppressed) so a leak is visible in the run output;
    we continue to teardown other deps so one failure doesn't strand the rest; after all attempts
    we **raise** if any dep failed to fully teardown — the orchestrator's outer `finally` then
    decides whether the leak is a run-failure (it should be, mirroring lifecycle.teardown_app's
    own raise-on-residual behaviour at `verify=True`).

    Raises:
        lifecycle.TeardownError: if at least one dep teardown raised; the message joins every
            individual failure.
    """
    # Normalise both supported shapes to a list of entries. Iterating a dict directly would
    # yield its keys (plain strings), and `.get` on those would abort the whole teardown.
    entries = list(state.values()) if isinstance(state, dict) else list(state or [])
    errors: list[str] = []
    for entry in reversed(entries):
        if not isinstance(entry, dict):
            continue
        domain = entry.get("domain")
        if not domain:
            continue
        recipe = entry.get("recipe", "?")
        print(f"  dep: tearing down {recipe} @ {domain}", flush=True)
        try:
            lifecycle.teardown_app(domain, verify=True)
        except Exception as e:  # noqa: BLE001 — every failure must be visible, but we want to try the rest first
            msg = f"dep {recipe} @ {domain} teardown failed: {e}"
            print(f"  !! {msg}", flush=True)
            errors.append(msg)
    if errors:
        raise lifecycle.TeardownError("dep teardown failures: " + " ; ".join(errors))


def load_run_state():
    """Load the deps state recorded for the current run from `$CCCI_DEPS_FILE`.

    Returns the decoded JSON content, which may be a list or a dict (both shapes are supported,
    see write_run_state). Returns [] when the variable is unset or empty, when the file is
    missing or unreadable, when it does not contain valid JSON, or when the decoded value is
    falsy (e.g. `null`, `[]`, `{}`)."""
    state_path = os.environ.get("CCCI_DEPS_FILE")
    if not state_path:
        return []
    try:
        # A missing file surfaces as FileNotFoundError (an OSError), so no separate
        # existence check is needed.
        with open(state_path) as handle:
            loaded = json.load(handle)
    except (OSError, ValueError):
        return []
    if not loaded:
        return []
    return loaded


def deps_as_dict(state) -> dict[str, dict]:
    """Normalise run state into a mapping of recipe name -> entry.

    A dict is returned as-is (the same object, not a copy). Anything else is treated as an
    iterable of entries (None/empty counts as no entries): every entry that is a dict with a
    truthy "recipe" value is keyed by that value; other entries are dropped. When two entries
    share a recipe name, the later one wins. Used by the `deps` fixture and dependent tests."""
    if isinstance(state, dict):
        return state
    return {
        item["recipe"]: item
        for item in (state or ())
        if isinstance(item, dict) and item.get("recipe")
    }