claim(2w): W0.10a traefik WC1.1 migrated onto shared health-gated reconciler — no-op converge proven; destructive rollback = Adversary cold proof
warm_reconcile.py: per-spec setup hook + health_domain; SPECS[traefik] (stateful=False, version-rollback-only, _traefik_setup preserves wildcard-cert/ file-provider config, health on routed dashboard host). keycloak path unchanged. proxy.nix: deploy-proxy.service now execs warm_reconcile.py traefik. ZERO-disruption migration (traefik already at latest 5.1.1+v3.6.15; pre-seeded TYPE+last_good → clean no-op converge; traefik 200 + keycloak-through-traefik 200 + 0 failed). 65 unit pass. Per operator out: code+converge delivered; destructive rollback (brief TLS blip) = Adversary's required cold proof. Closes the W0.10a tracked-open. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@ -4,55 +4,31 @@
|
||||
# Phase-1c: the cert at CERT_DIR is sops-decrypted from git (cc-ci-secrets) at activation
|
||||
# (modules/secrets.nix wildcard_cert/wildcard_key), NOT an out-of-band operator file drop.
|
||||
#
|
||||
# Declared as an idempotent-RECONCILE systemd oneshot (like swarm-init): it inspects current
|
||||
# state and converges every activation/boot, self-healing drift (redeploys if the stack is gone,
|
||||
# re-inserts secrets if missing). No run-once sentinel. So a from-scratch install is just
|
||||
# `nixos-rebuild switch` + operator preconditions (D8) — no manual post-steps.
|
||||
# Phase-2w / WC1.1: traefik is now UNPINNED + health-gated like keycloak — the deploy is driven by
|
||||
# the shared `runner/warm_reconcile.py traefik` (STATELESS = version-rollback-only, NO snapshot):
|
||||
# record last-good version → deploy latest tag → health-gate (a ROUTED host, the dashboard
|
||||
# ci.commoninternet.net, returns 200) → healthy commits last-good / unhealthy rolls back to last-good
|
||||
# + alert. traefik's wildcard-cert/file-provider config (ssl_cert/ssl_key secrets, WILDCARDS_ENABLED,
|
||||
# COMPOSE_FILE) is preserved EXACTLY by the spec's `setup` (warm_reconcile._traefik_setup). The
|
||||
# runner/ tree is copied into the nix store → D8-clean; recipe fetched at runtime → closure stable.
|
||||
#
|
||||
# Idempotent-RECONCILE systemd oneshot (unchanged unit name `deploy-proxy` — other modules order
|
||||
# after it): converges every activation/boot, self-healing drift. No run-once sentinel.
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
runnerSrc = ../../runner;
|
||||
reconcile = pkgs.writeShellApplication {
|
||||
name = "cc-ci-reconcile-proxy";
|
||||
runtimeInputs = with pkgs; [ abra docker jq gnused gnugrep coreutils git ];
|
||||
runtimeInputs = with pkgs; [ abra docker git curl jq gnused gnugrep gnutar coreutils ];
|
||||
text = ''
|
||||
PROXY_DOMAIN="traefik.ci.commoninternet.net"
|
||||
CERT_DIR="/var/lib/ci-certs/live"
|
||||
ENV_FILE="$HOME/.abra/servers/default/$PROXY_DOMAIN.env"
|
||||
|
||||
# Fail visibly (failed unit) if the cert is missing — do NOT silently skip. It is
|
||||
# sops-decrypted from git (cc-ci-secrets) at activation; a miss here means the sops decrypt
|
||||
# path is broken (e.g. age identity not present), which must surface, not be papered over.
|
||||
if [ ! -r "$CERT_DIR/fullchain.pem" ] || [ ! -r "$CERT_DIR/privkey.pem" ]; then
|
||||
echo "FATAL: wildcard cert missing at $CERT_DIR (sops decrypt from cc-ci-secrets failed?)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
abra server ls -m -n >/dev/null 2>&1 || abra server add --local -n || true
|
||||
abra recipe fetch traefik -n >/dev/null
|
||||
|
||||
[ -f "$ENV_FILE" ] || abra app new traefik -s default -D "$PROXY_DOMAIN" -n
|
||||
|
||||
set_env() {
|
||||
sed -i -E "/^[[:space:]]*#?[[:space:]]*$1=/d" "$ENV_FILE"
|
||||
printf '%s=%s\n' "$1" "$2" >> "$ENV_FILE"
|
||||
}
|
||||
set_env LETS_ENCRYPT_ENV ""
|
||||
set_env WILDCARDS_ENABLED "1"
|
||||
set_env SECRET_WILDCARD_CERT_VERSION "v1"
|
||||
set_env SECRET_WILDCARD_KEY_VERSION "v1"
|
||||
set_env COMPOSE_FILE '"compose.yml:compose.wildcard.yml"'
|
||||
|
||||
have_secret() { docker secret ls --format '{{.Name}}' | grep -q "_$1_v1$"; }
|
||||
have_secret ssl_cert || abra app secret insert "$PROXY_DOMAIN" ssl_cert v1 "$CERT_DIR/fullchain.pem" -f -n
|
||||
have_secret ssl_key || abra app secret insert "$PROXY_DOMAIN" ssl_key v1 "$CERT_DIR/privkey.pem" -f -n
|
||||
|
||||
# Converge the stack (idempotent: no-op if already at desired state).
|
||||
abra app deploy "$PROXY_DOMAIN" -n -C
|
||||
export HOME=/root
|
||||
exec ${pkgs.python3}/bin/python3 ${runnerSrc}/warm_reconcile.py traefik
|
||||
'';
|
||||
};
|
||||
in
|
||||
{
|
||||
systemd.services.deploy-proxy = {
|
||||
description = "Reconcile the Co-op Cloud traefik proxy (wildcard/no-ACME) via abra";
|
||||
description = "Reconcile the Co-op Cloud traefik proxy (wildcard/no-ACME, health-gated) via abra";
|
||||
after = [ "swarm-init.service" "docker.service" "network-online.target" ];
|
||||
requires = [ "swarm-init.service" "docker.service" ];
|
||||
wants = [ "network-online.target" ];
|
||||
@ -61,6 +37,8 @@ in
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
RemainAfterExit = true;
|
||||
# Generous: a traefik (re)deploy + health-gate; rollback on an unhealthy upgrade.
|
||||
TimeoutStartSec = "900";
|
||||
ExecStart = "${reconcile}/bin/cc-ci-reconcile-proxy";
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user