diff --git a/DECISIONS.md b/DECISIONS.md index f1361cb..8df9f76 100644 --- a/DECISIONS.md +++ b/DECISIONS.md @@ -69,6 +69,27 @@ Architecture decisions and dead-ends. One line of rationale each. (§0, §8) matter: polling makes it irrelevant; the operator was whitelisting `ci.commoninternet.net` in Gitea's `ALLOWED_HOST_LIST`, but D1 no longer depends on that.) +- **Resource safety: bound live test apps — SETTLED (orchestrator design change 2026-05-27, + plan §4.2/§4.3).** Do NOT keep multiple test apps deployed at once. Three layers, all configurable: + - **MAX_TESTS = `DRONE_RUNNER_CAPACITY` = 1** (`modules/drone-runner.nix`, `maxTests` let-binding). + Drone runs at most MAX_TESTS builds at once and **auto-queues the rest in its native pending + queue** — no custom queue. Kept at 1 (single 28GiB node, heavy recipes). At capacity=1 there is + never a concurrent in-flight run, so the bound "at most 1 test app live" holds exactly. + - **Per-build TIMEOUT = 60 min** (`modules/drone.nix`, `buildTimeoutMinutes`; reconciled + best-effort via `PATCH /api/repos/recipe-maintainers/cc-ci {"timeout":60}` using the bridge's + Drone admin token, local `--resolve`, non-fatal). A build over the limit is cancelled by Drone → + the exec runner kills it → the MAX_TESTS slot frees → the queue advances. Satisfies "continue + once a test finishes OR times out". + - **Teardown + janitor backstop.** Each build deploys → runs the 3 stages → undeploys + (guaranteed `try/finally` in `conftest`/orchestrator). A SIGKILL'd/timed-out build can't run its + own teardown, so the **run-start janitor** (`lifecycle.janitor`, called before every deploy in + both fixtures + `run_recipe_ci`) reaps orphaned run apps as the backstop. At capacity=1 the CI + path will set `CCCI_JANITOR_MAX_AGE=0` (reap any orphan immediately — safe with no concurrent + runs) in the recipe-CI Drone pipeline; with capacity>1 the janitor MUST stay age-based (default + 2h) to avoid reaping a live concurrent run. 
Net: at most MAX_TESTS apps ever live. + - Optional `concurrency: {limit: 1}` in the recipe-CI `.drone.yml` is a redundant belt — primary + mechanism is `DRONE_RUNNER_CAPACITY`. (Wired when the recipe-CI pipeline lands — see backlog.) + ## Open (defaults from §8, to confirm as reality lands) - **Deploy mechanism — SETTLED (M0):** `nixos-rebuild switch --flake /root/cc-ci#cc-ci` run *on diff --git a/modules/drone-runner.nix b/modules/drone-runner.nix index a6000b5..1d707c4 100644 --- a/modules/drone-runner.nix +++ b/modules/drone-runner.nix @@ -6,6 +6,17 @@ # DECISIONS.md "CI engine"). It connects to the server over RPC at drone.ci.commoninternet.net, # sharing DRONE_RPC_SECRET with the server via the sops-rendered EnvironmentFile. { pkgs, config, lib, ... }: +let + # MAX_TESTS (plan §4.2/§4.3 resource safety): max CI builds the exec runner runs at once. Drone + # queues the rest in its native pending-build queue (no custom queue). THE concurrency cap that + # bounds how many test apps can be live at once — kept LOW (1) on this single 28GiB node since + # recipes are heavy (immich/matrix large volumes). With capacity=1 there is never a concurrent + # in-flight run, so the run-start janitor can safely reap *any* orphan (a SIGKILL'd build runs no + # teardown) and the "at most MAX_TESTS apps live" bound holds exactly. Raise to 2 only if the node + # is shown to handle two light recipes at once (then the janitor MUST stay age-based to avoid + # reaping a concurrent run — see DECISIONS.md "Resource safety"). + maxTests = "1"; +in { # Drone ships under the Polyform Small Business license (nixpkgs marks it unfree); # permitted for our internal CI use. Allow only this package. 
@@ -20,7 +31,7 @@ environment = { DRONE_RPC_PROTO = "https"; DRONE_RPC_HOST = "drone.ci.commoninternet.net"; - DRONE_RUNNER_CAPACITY = "2"; # concurrency cap (plan §4.2) + DRONE_RUNNER_CAPACITY = maxTests; # MAX_TESTS concurrency cap (see let-binding above) DRONE_RUNNER_NAME = "cc-ci-exec"; # exec runner needs a writable root for build workspaces DRONE_RUNNER_ROOT = "/var/lib/drone-runner"; diff --git a/modules/drone.nix b/modules/drone.nix index 2ba2f05..ebe6dff 100644 --- a/modules/drone.nix +++ b/modules/drone.nix @@ -8,9 +8,14 @@ { pkgs, ... }: let giteaClientId = "ab4cdb9d-ee96-4867-875f-87384505fc52"; + # Per-build TIMEOUT (plan §4.2/§4.3 resource safety): if a CI build runs longer than this, Drone + # cancels it (the exec runner kills the process), freeing the MAX_TESTS slot so the queue advances. + # The killed build can't run its own teardown — the run-start janitor reaps its orphaned app + # (modules/drone-runner.nix MAX_TESTS note). Configurable here; reconciled best-effort below. + buildTimeoutMinutes = "60"; reconcile = pkgs.writeShellApplication { name = "cc-ci-reconcile-drone"; - runtimeInputs = with pkgs; [ abra docker jq gnused gnugrep coreutils git ]; + runtimeInputs = with pkgs; [ abra docker jq gnused gnugrep coreutils git curl ]; text = '' DRONE_DOMAIN="drone.ci.commoninternet.net" ENV_FILE="$HOME/.abra/servers/default/$DRONE_DOMAIN.env" @@ -44,6 +49,19 @@ let have_secret client_secret || abra app secret insert "$DRONE_DOMAIN" client_secret v1 /run/secrets/drone_gitea_client_secret -f -n abra app deploy "$DRONE_DOMAIN" -n -C + + # Best-effort: set the cc-ci repo's build timeout (resource safety). Non-fatal — never break + # the core server reconcile if Drone/token isn't ready. Uses the bridge's Drone admin token and + # hits the local traefik (hairpin-free) keeping SNI=drone... so the wildcard cert validates. 
+ if [ -r /run/secrets/bridge_drone_token ]; then + DT="$(cat /run/secrets/bridge_drone_token)" + curl -fsS --connect-timeout 5 --max-time 30 --resolve "$DRONE_DOMAIN:443:127.0.0.1" \ + -X PATCH -H "Authorization: Bearer $DT" -H "Content-Type: application/json" \ + -d '{"timeout": ${buildTimeoutMinutes}}' \ + "https://$DRONE_DOMAIN/api/repos/recipe-maintainers/cc-ci" >/dev/null \ + && echo "set cc-ci build timeout = ${buildTimeoutMinutes}m" \ + || echo "WARN: could not set build timeout (non-fatal)" >&2 + fi ''; }; in