Files
cc-ci/modules/drone.nix
autonomic-bot c8bbd35f2a
All checks were successful
continuous-integration/drone/push Build is passing
1c/E2E-TESTME finding+fix: inject bridge_drone_token as Drone bot MACHINE TOKEN (DRONE_USER_CREATE token:)
Clean-room finding caught by the e2e: DRONE_USER_CREATE had no token: => a fresh-DB rebuild's Drone
auto-generates a random bot token, so the committed (sops) bridge_drone_token gets 401 and the bridge
can't trigger builds. The original cc-ci only matched because its token was captured out-of-band. Now
the bot's machine token == bridge_drone_token deterministically on every rebuild. (Evolves the toplevel
again; re-establish byte-identical on cc-ci after the e2e + Adversary re-verifies C1.)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 19:27:00 +01:00

91 lines
4.8 KiB
Nix

# Drone CI server = coop-cloud `drone` recipe via abra (swarm, traefik-routed at
# drone.ci.commoninternet.net, Gitea SSO, wildcard cert / no ACME). The exec *runner* is a
# separate host systemd service (modules/drone-runner.nix). See DECISIONS.md "CI engine"/"Drone
# deployment shape".
#
# Idempotent-RECONCILE oneshot (same pattern as proxy/swarm-init): converges every boot/activation.
# RPC + OAuth-client secrets come from sops (/run/secrets), inserted as swarm secrets here.
{ pkgs, ... }:
let
# OAuth client ID that the reconcile script writes into the Drone app env as GITEA_CLIENT_ID.
# Only the ID lives here; the matching client secret is read at runtime from
# /run/secrets/drone_gitea_client_secret and inserted as a swarm secret.
# NOTE(review): presumably the OAuth application registered on git.autonomic.zone — confirm.
giteaClientId = "ab4cdb9d-ee96-4867-875f-87384505fc52";
# Per-build TIMEOUT (plan §4.2/§4.3 resource safety): if a CI build runs longer than this, Drone
# cancels it (the exec runner kills the process), freeing the MAX_TESTS slot so the queue advances.
# The killed build can't run its own teardown — the run-start janitor reaps its orphaned app
# (modules/drone-runner.nix MAX_TESTS note). Configurable here; reconciled best-effort below.
# Kept as a string because it is spliced verbatim (unquoted) into the JSON body of the PATCH
# request and into a log message, so it must contain only decimal digits.
buildTimeoutMinutes = "60";
# Idempotent reconcile script for the Drone server app: ensures the abra server/recipe/app exist,
# rewrites the managed keys of the app .env, inserts the swarm secrets if absent, deploys, and
# finally (best-effort) sets the cc-ci repo build timeout through the Drone API.
# writeShellApplication runs the text under errexit/nounset/pipefail, which matters for the
# have_secret pipeline and the token handling below.
reconcile = pkgs.writeShellApplication {
  name = "cc-ci-reconcile-drone";
  runtimeInputs = with pkgs; [ abra docker jq gnused gnugrep coreutils git curl ];
  text = ''
    DRONE_DOMAIN="drone.ci.commoninternet.net"
    ENV_FILE="$HOME/.abra/servers/default/$DRONE_DOMAIN.env"
    BOT_TOKEN_FILE="/run/secrets/bridge_drone_token"

    # Preflight: the RPC and OAuth-client secrets are mandatory; without them the deploy is useless.
    if [ ! -r /run/secrets/drone_rpc_secret ] || [ ! -r /run/secrets/drone_gitea_client_secret ]; then
      echo "FATAL: drone sops secrets missing at /run/secrets (rebuild ordering?)" >&2
      exit 1
    fi

    abra server ls -m -n >/dev/null 2>&1 || abra server add --local -n || true
    abra recipe fetch drone -n >/dev/null
    [ -f "$ENV_FILE" ] || abra app new drone -s default -D "$DRONE_DOMAIN" -n

    # set_env KEY VALUE: drop every existing (active or commented-out) KEY= line from the app
    # env file, then append a fresh KEY=VALUE line.
    set_env() {
      sed -i -E "/^[[:space:]]*#?[[:space:]]*$1=/d" "$ENV_FILE"
      # Ensure a trailing newline before appending (a recipe .env.sample may end without one,
      # which would glue the var onto the last line; see modules/backupbot.nix for the bite).
      if [ -s "$ENV_FILE" ] && [ -n "$(tail -c1 "$ENV_FILE")" ]; then printf '\n' >> "$ENV_FILE"; fi
      printf '%s=%s\n' "$1" "$2" >> "$ENV_FILE"
    }

    set_env LETS_ENCRYPT_ENV ""
    set_env EXTRA_DOMAINS ""

    # Inject the bridge's Drone token as the bot's MACHINE TOKEN so it is reproducible on a fresh
    # Drone DB. Without `token:`, Drone auto-generates a random token that the committed (sops)
    # bridge_drone_token can't match on a clean-room rebuild: the bridge gets 401 and can't
    # trigger builds (the original only matched because its token was captured out-of-band
    # post-hoc). Caught by the E2E-TESTME acceptance test. With `token:`, every rebuild's bot
    # carries the sops token.
    # The token file is read once, explicitly: a failing $(cat ...) inside a command argument
    # does not trip errexit, so a missing secret would otherwise write an empty `token:` silently.
    BOT_TOKEN=""
    if [ -r "$BOT_TOKEN_FILE" ]; then
      BOT_TOKEN="$(cat "$BOT_TOKEN_FILE")"
    fi
    if [ -n "$BOT_TOKEN" ]; then
      set_env DRONE_USER_CREATE "username:autonomic-bot,admin:true,token:$BOT_TOKEN"
    else
      echo "WARN: $BOT_TOKEN_FILE missing or empty; bot is created without a fixed machine token (bridge will get 401 on a fresh Drone DB)" >&2
      set_env DRONE_USER_CREATE "username:autonomic-bot,admin:true"
    fi

    set_env GITEA_DOMAIN "git.autonomic.zone"
    set_env GITEA_CLIENT_ID "${giteaClientId}"
    set_env RPC_SECRET_VERSION "v1"
    set_env CLIENT_SECRET_VERSION "v1"
    set_env DRONE_ENV_VERSION "v1"
    set_env COMPOSE_FILE '"compose.yml:compose.gitea.yml"'

    # have_secret NAME: succeeds when a swarm secret whose name ends in _NAME_v1 exists.
    # grep reads the whole listing (no -q): with pipefail, an early grep exit could SIGPIPE
    # `docker secret ls` and make an existing secret look absent.
    have_secret() { docker secret ls --format '{{.Name}}' | grep "_$1_v1$" >/dev/null; }
    have_secret rpc_secret || abra app secret insert "$DRONE_DOMAIN" rpc_secret v1 /run/secrets/drone_rpc_secret -f -n
    have_secret client_secret || abra app secret insert "$DRONE_DOMAIN" client_secret v1 /run/secrets/drone_gitea_client_secret -f -n

    abra app deploy "$DRONE_DOMAIN" -n -C

    # Best-effort: set the cc-ci repo's build timeout (resource safety). Non-fatal: never break
    # the core server reconcile if Drone/token isn't ready. Uses the bridge's Drone admin token and
    # hits the local traefik (hairpin-free) via --resolve, keeping SNI=drone... on the request.
    # The Authorization header is fed on stdin (-H @-) so the admin token never shows up in
    # curl's argv; the timeouts keep a hung request from blocking this oneshot unit.
    # NOTE(review): -k disables certificate verification, so the wildcard cert is not actually
    # validated by this call; confirm whether -k is still required.
    if [ -n "$BOT_TOKEN" ]; then
      if printf 'Authorization: Bearer %s\n' "$BOT_TOKEN" \
        | curl -fsS -k --connect-timeout 5 --max-time 30 \
            --resolve "$DRONE_DOMAIN:443:127.0.0.1" \
            -X PATCH -H @- -H "Content-Type: application/json" \
            -d '{"timeout": ${buildTimeoutMinutes}}' \
            "https://$DRONE_DOMAIN/api/repos/recipe-maintainers/cc-ci" >/dev/null
      then
        echo "set cc-ci build timeout = ${buildTimeoutMinutes}m"
      else
        echo "WARN: could not set build timeout (non-fatal)" >&2
      fi
    fi
  '';
};
in
{
  # Oneshot unit that runs the Drone reconcile script on every boot/activation. It is ordered
  # after the proxy, swarm-init, docker and network-online units, hard-requires swarm-init and
  # docker, and stays "active" after the script exits so dependants can order against it.
  systemd.services.deploy-drone = {
    description = "Reconcile the Drone CI server (coop-cloud recipe, Gitea SSO) via abra";

    wantedBy = [ "multi-user.target" ];
    wants = [ "network-online.target" ];
    requires = [ "swarm-init.service" "docker.service" ];
    after = [ "deploy-proxy.service" "swarm-init.service" "docker.service" "network-online.target" ];

    # The script resolves its abra env file relative to $HOME, so pin it for the unit.
    environment = {
      HOME = "/root";
    };

    serviceConfig.Type = "oneshot";
    serviceConfig.RemainAfterExit = true;
    serviceConfig.ExecStart = "${reconcile}/bin/cc-ci-reconcile-drone";
  };
}