claim(2w): W0.10a traefik WC1.1 migrated onto shared health-gated reconciler — no-op converge proven; destructive rollback = Adversary cold proof

warm_reconcile.py: per-spec setup hook + health_domain; SPECS[traefik]
(stateful=False, version-rollback-only, _traefik_setup preserves wildcard-cert/
file-provider config, health on routed dashboard host). keycloak path unchanged.
proxy.nix: deploy-proxy.service now execs warm_reconcile.py traefik. ZERO-disruption
migration (traefik already at latest 5.1.1+v3.6.15; pre-seeded TYPE+last_good →
clean no-op converge; traefik 200 + keycloak-through-traefik 200 + 0 failed).
65 unit tests pass. Per operator out: code+converge delivered; destructive rollback
(brief TLS blip) = Adversary's required cold proof. Closes the W0.10a tracked-open.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-29 03:50:32 +01:00
parent aec6911c68
commit e678d2e006
5 changed files with 145 additions and 45 deletions

View File

@@ -4,55 +4,31 @@
# Phase-1c: the cert at CERT_DIR is sops-decrypted from git (cc-ci-secrets) at activation
# (modules/secrets.nix wildcard_cert/wildcard_key), NOT an out-of-band operator file drop.
#
# Declared as an idempotent-RECONCILE systemd oneshot (like swarm-init): it inspects current
# state and converges every activation/boot, self-healing drift (redeploys if the stack is gone,
# re-inserts secrets if missing). No run-once sentinel. So a from-scratch install is just
# `nixos-rebuild switch` + operator preconditions (D8) — no manual post-steps.
# Phase-2w / WC1.1: traefik is now UNPINNED + health-gated like keycloak — the deploy is driven by
# the shared `runner/warm_reconcile.py traefik` (STATELESS = version-rollback-only, NO snapshot):
# record last-good version → deploy latest tag → health-gate (a ROUTED host, the dashboard
# ci.commoninternet.net, returns 200) → healthy commits last-good / unhealthy rolls back to last-good
# + alert. traefik's wildcard-cert/file-provider config (ssl_cert/ssl_key secrets, WILDCARDS_ENABLED,
# COMPOSE_FILE) is preserved EXACTLY by the spec's `setup` (warm_reconcile._traefik_setup). The
# runner/ tree is copied into the nix store → D8-clean; recipe fetched at runtime → closure stable.
#
# Idempotent-RECONCILE systemd oneshot (unchanged unit name `deploy-proxy` — other modules order
# after it): converges every activation/boot, self-healing drift. No run-once sentinel.
{ pkgs, ... }:
let
runnerSrc = ../../runner;
reconcile = pkgs.writeShellApplication {
name = "cc-ci-reconcile-proxy";
runtimeInputs = with pkgs; [ abra docker jq gnused gnugrep coreutils git ];
runtimeInputs = with pkgs; [ abra docker git curl jq gnused gnugrep gnutar coreutils ];
text = ''
PROXY_DOMAIN="traefik.ci.commoninternet.net"
CERT_DIR="/var/lib/ci-certs/live"
ENV_FILE="$HOME/.abra/servers/default/$PROXY_DOMAIN.env"
# Fail visibly (failed unit) if the cert is missing — do NOT silently skip. It is
# sops-decrypted from git (cc-ci-secrets) at activation; a miss here means the sops decrypt
# path is broken (e.g. age identity not present), which must surface, not be papered over.
if [ ! -r "$CERT_DIR/fullchain.pem" ] || [ ! -r "$CERT_DIR/privkey.pem" ]; then
echo "FATAL: wildcard cert missing at $CERT_DIR (sops decrypt from cc-ci-secrets failed?)" >&2
exit 1
fi
abra server ls -m -n >/dev/null 2>&1 || abra server add --local -n || true
abra recipe fetch traefik -n >/dev/null
[ -f "$ENV_FILE" ] || abra app new traefik -s default -D "$PROXY_DOMAIN" -n
set_env() {
sed -i -E "/^[[:space:]]*#?[[:space:]]*$1=/d" "$ENV_FILE"
printf '%s=%s\n' "$1" "$2" >> "$ENV_FILE"
}
set_env LETS_ENCRYPT_ENV ""
set_env WILDCARDS_ENABLED "1"
set_env SECRET_WILDCARD_CERT_VERSION "v1"
set_env SECRET_WILDCARD_KEY_VERSION "v1"
set_env COMPOSE_FILE '"compose.yml:compose.wildcard.yml"'
have_secret() { docker secret ls --format '{{.Name}}' | grep -q "_$1_v1$"; }
have_secret ssl_cert || abra app secret insert "$PROXY_DOMAIN" ssl_cert v1 "$CERT_DIR/fullchain.pem" -f -n
have_secret ssl_key || abra app secret insert "$PROXY_DOMAIN" ssl_key v1 "$CERT_DIR/privkey.pem" -f -n
# Converge the stack (idempotent: no-op if already at desired state).
abra app deploy "$PROXY_DOMAIN" -n -C
export HOME=/root
exec ${pkgs.python3}/bin/python3 ${runnerSrc}/warm_reconcile.py traefik
'';
};
in
{
systemd.services.deploy-proxy = {
description = "Reconcile the Co-op Cloud traefik proxy (wildcard/no-ACME) via abra";
description = "Reconcile the Co-op Cloud traefik proxy (wildcard/no-ACME, health-gated) via abra";
after = [ "swarm-init.service" "docker.service" "network-online.target" ];
requires = [ "swarm-init.service" "docker.service" ];
wants = [ "network-online.target" ];
@@ -61,6 +37,8 @@ in
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
# Generous: a traefik (re)deploy + health-gate; rollback on an unhealthy upgrade.
TimeoutStartSec = "900";
ExecStart = "${reconcile}/bin/cc-ci-reconcile-proxy";
};
};