Files
cc-ci-orchestrator/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix
autonomic-bot ee58027c3e feat(nix): weekly /upgrade-all as a reboot-safe systemd timer (Sun 02:00 UTC)
Replace the boot-fragile busybox-crond-in-tmux (phase 5 §4) with a
systemd service+timer. Service is timer-triggered only (not wantedBy
multi-user.target) so it never runs on boot/activation; mirrors the
cc-ci-loops env fix (CLAUDE_BIN + /home/loops/.local/bin on PATH).
Timer fires Sundays 02:00 UTC, Persistent=true so a missed run (box
down) fires once on next boot. Runs launch-upgrader.py start ->
cc-ci-upgrader agent -> /upgrade-all DEFAULT (opens recipe PRs, never
merges). Activate via nixos-rebuild + retire the old Monday crond after
the phase-5 T0-fire verification completes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-01 22:54:52 +00:00

216 lines
9.6 KiB
Nix

# cc-ci-orchestrator-hetzner — NixOS config for the Hetzner loops runtime host.
#
# Purpose: run the cc-ci Builder/Adversary/Watchdog loops + orchestrator/assistant sessions
# on a Hetzner cpx11 (2 vCPU / 2 GB dedicated AMD / 40 GB NVMe), replacing the slow b1 Incus VM.
#
# Provision with terraform/ then converge with: nixos-rebuild switch --flake .#cc-ci-orchestrator-hetzner
# See terraform/README.md for the full Stage 2 procedure.
{ config, pkgs, lib, ... }:
{
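# hardware.nix is the nixos-infect generated hardware-configuration.nix (see README Stage 2a).
# NOTE(review): no `imports` entry for hardware.nix is visible in this file — presumably it is
# wired in through the flake's module list; confirm.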
# SSH daemon. Root login is allowed with public keys only: the root keys are provisioned
# declaratively (users.users.root.openssh.authorizedKeys.keys), and port 22 is open on the
# public interface, so password-based root login is not left enabled.
services.openssh = {
  enable = true;
  # "prohibit-password" still permits key-based root login but rejects password and
  # keyboard-interactive authentication for root.
  settings.PermitRootLogin = "prohibit-password";
};
# Public keys accepted for root over SSH — the same set as the current orchestrator VM's
# /root/.ssh/authorized_keys.
users.users.root = {
  openssh.authorizedKeys.keys = [
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOk8NaeBdPbS2gfUvbny8h0AkZlVjGYHzx4QPXSJ38gd claude@claude-vm"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJVlfoLBPseQ9fA9534KmRg2KWcksKZGzAJIpHJ2JpsI mfowler.email@protonmail.com"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAcyTGb/wVgdhg5oBCZZvBaR1RuUQRY/3WHnOQpNDCsp claude-cc-ci-sandbox@20260526"
  ];
};
# Network: DHCP addressing, fixed public resolvers, and a firewall that exposes only SSH
# publicly while trusting everything arriving on the tailscale interface.
networking = {
  useDHCP = true;
  nameservers = [
    "1.1.1.1"
    "8.8.8.8"
  ];
  firewall = {
    enable = true;
    # Only SSH is opened on all interfaces.
    allowedTCPPorts = [ 22 ];
    # Port 80 is reachable only via the trusted tailscale interface — nginx binds there
    # for oc.commoninternet.net.
    trustedInterfaces = [ "tailscale0" ];
  };
};
# Enable the `nix` CLI and flakes (the host is converged with `nixos-rebuild switch --flake`).
nix.settings = {
  experimental-features = [
    "nix-command"
    "flakes"
  ];
};
# NixOS release whose stateful defaults this host was installed with.
system = { stateVersion = "24.11"; };
# Tailscale client. The auth key lives at /etc/ts-auth-key, placed by hand during Stage 2
# and deliberately kept out of git.
services.tailscale.enable = true;
services.tailscale.authKeyFile = "/etc/ts-auth-key";
# Join the tailnet under a fixed hostname.
services.tailscale.extraUpFlags = [ "--hostname=cc-ci-orchestrator" ];
# 4 GB swap file on disk — a memory safety net for claude sessions (2 GB RAM is tight for
# 3+ sessions). `size` is in MiB.
swapDevices = [
  {
    device = "/swapfile";
    size = 4096;
  }
];
# nix-ld lets the standalone Claude Code CLI (a foreign dynamically-linked ELF / Bun binary)
# run on NixOS; the libraries below are what is exposed to such binaries.
programs.nix-ld = {
  enable = true;
  libraries = [
    pkgs.stdenv.cc.cc.lib
    pkgs.zlib
    pkgs.openssl
    pkgs.curl
    pkgs.glibc
  ];
};
# System-wide tooling available to every user.
environment.systemPackages = with pkgs; [
  # general tooling
  git
  tmux
  python3
  jq
  curl
  cacert
  # core text/file utilities
  gnused
  gawk
  coreutils
  gnugrep
  findutils
  util-linux
  # networking
  nettools
  openssh
  # key management (same toolchain as cc-ci server)
  age
  sops
];
# Unprivileged `loops` account: claude sessions must run as non-root because
# --dangerously-skip-permissions is blocked for root.
users.users.loops.isNormalUser = true;
users.users.loops.home = "/home/loops";
users.users.loops.shell = pkgs.bash;
# wheel membership grants sudo.
users.users.loops.extraGroups = [ "wheel" ];
# Passwordless sudo: for the wheel group as a whole, plus an explicit NOPASSWD ALL rule
# for the loops user.
security.sudo = {
  wheelNeedsPassword = false;
  extraRules = [
    {
      users = [ "loops" ];
      commands = [
        {
          command = "ALL";
          options = [ "NOPASSWD" ];
        }
      ];
    }
  ];
};
# Put /home/loops/.local/bin (claude + opencode) at the front of PATH.
# NOTE(review): environment.variables is a system-wide setting, so this presumably affects
# every user's shell environment, not only loops — confirm that is intended.
# opencode binary is installed there manually (not yet in nixpkgs); re-install if missing:
# curl -sL https://github.com/anomalyco/opencode/releases/download/v1.15.13/opencode-linux-x64.tar.gz \
# | tar -xz -C /home/loops/.local/bin opencode && chmod +x /home/loops/.local/bin/opencode
environment.variables.PATH = lib.mkForce (lib.concatStringsSep ":" [
  "/home/loops/.local/bin"
  "/run/current-system/sw/bin"
  "/run/wrappers/bin"
  "/usr/bin"
  "/bin"
]);
# SSH client config for the loops user — defines a `cc-ci` host alias pointing at the cc-ci
# Hetzner server via the tailnet. HostName is a placeholder that is updated post-cutover to
# the Hetzner cc-ci tailnet IP. The file is seeded only once, so later manual edits survive
# re-activation.
system.activationScripts.loopsSshConfig = {
  # The script chowns to loops:users, so it must run after the `users` activation step has
  # created the account. Declared explicitly rather than relying on incidental ordering.
  deps = [ "users" ];
  text = ''
    mkdir -p /home/loops/.ssh && chown loops:users /home/loops/.ssh && chmod 700 /home/loops/.ssh
    # Only write if not already present (preserves manual customisation).
    if [ ! -f /home/loops/.ssh/config ]; then
    cat > /home/loops/.ssh/config <<'SSHCFG'
    Host cc-ci
    HostName REPLACE_WITH_CC_CI_HETZNER_TAILNET_IP
    User root
    IdentityFile /home/loops/.ssh/cc-ci-root-ed25519
    IdentitiesOnly yes
    StrictHostKeyChecking accept-new
    ServerAliveInterval 30
    SSHCFG
      chmod 600 /home/loops/.ssh/config
      chown loops:users /home/loops/.ssh/config
    fi
  '';
};
# claude-install — fetch the standalone Claude Code CLI for the loops user if it is missing.
# Runs once at boot (multi-user.target) after the network is online; it does nothing when
# $HOME/.local/bin/claude is already executable. Installation is best-effort: a failure is
# logged but does not fail the unit.
systemd.services.claude-install = {
  description = "Install Claude Code CLI for loops user (idempotent)";
  wantedBy = [ "multi-user.target" ];
  after = [ "network-online.target" ];
  wants = [ "network-online.target" ];
  serviceConfig = {
    # oneshot + RemainAfterExit: the unit stays "active" after the script exits.
    Type = "oneshot";
    RemainAfterExit = true;
    User = "loops";
    Group = "users";
  };
  environment = { HOME = "/home/loops"; };
  path = [ pkgs.curl pkgs.bash pkgs.coreutils pkgs.gnutar pkgs.gzip ];
  # pipefail: without it the pipeline's exit status is bash's alone, so a failed download
  # (curl -f emits nothing, bash reads empty stdin and exits 0) would be swallowed silently
  # and the failure message below would never print.
  script = ''
    set -o pipefail
    if [ ! -x "$HOME/.local/bin/claude" ]; then
      echo "installing Claude Code CLI for loops user..."
      curl -fsSL https://claude.ai/install.sh | bash || echo "install failed retry on next activation"
    fi
  '';
};
# opencode web server — a single shared instance that all agent sessions attach to.
# It listens on loopback only; the web UI at http://oc.commoninternet.net is exposed by the
# nginx reverse proxy (tailscale-only). TINFOIL_API_KEY and other creds are read from
# /srv/cc-ci/.testenv at startup.
# NOTE(review): WorkingDirectory is /srv/cc-ci-orch/cc-ci whereas the other units in this
# file use /srv/cc-ci/... — presumably a separate orchestrator checkout; confirm.
systemd.services.opencode-web =
  let
    loopsHome = "/home/loops";
  in
  {
    description = "opencode web server for cc-ci agents (tinfoil/deepseek backend)";
    wantedBy = [ "multi-user.target" ];
    wants = [ "network-online.target" ];
    after = [
      "network-online.target"
      "tailscaled.service"
    ];
    environment.HOME = loopsHome;
    path = with pkgs; [
      bash
      coreutils
      git
      python3
      openssh
      tmux
      nettools
    ];
    serviceConfig = {
      Type = "simple";
      User = "loops";
      Group = "users";
      WorkingDirectory = "/srv/cc-ci-orch/cc-ci";
      EnvironmentFile = "/srv/cc-ci/.testenv";
      # Remove /tmp/opencode before each start.
      ExecStartPre = "${pkgs.coreutils}/bin/rm -rf /tmp/opencode";
      ExecStart = "${loopsHome}/.local/bin/opencode serve --hostname 127.0.0.1 --port 4096";
      # Restart after a failure, waiting 5 seconds first.
      Restart = "on-failure";
      RestartSec = "5s";
    };
  };
# nginx — reverse proxy from oc.commoninternet.net to the opencode web server on loopback.
# It binds the tailscale IP rather than the public one, so it is reachable only on the tailnet.
# DNS: add A record oc.commoninternet.net → 100.84.190.30 (operator step).
# NOTE(review): nginx binds a specific tailnet address; presumably that address must already
# be assigned when nginx starts — confirm startup ordering relative to tailscale.
services.nginx = {
  enable = true;
  recommendedProxySettings = true;
  virtualHosts = {
    "oc.commoninternet.net" = {
      # Plain HTTP on the tailscale address only — not the public IP.
      listen = [
        {
          addr = "100.84.190.30";
          port = 80;
          ssl = false;
        }
      ];
      locations = {
        "/" = {
          proxyPass = "http://127.0.0.1:4096";
          proxyWebsockets = true;
        };
      };
    };
  };
};
# cc-ci-loops supervisor — starts the Builder/Adversary loops + watchdog via launch.sh.
# Workspace staged 2026-05-31, so the unit is ENABLED (wantedBy multi-user.target) for
# reboot-resilience. If launch.sh is absent the start is skipped with a log line; if
# launch.sh is present but fails, the unit fails so the problem is visible in systemd.
systemd.services.cc-ci-loops = {
  description = "cc-ci Builder/Adversary loops + watchdog (launch.sh start)";
  wantedBy = [ "multi-user.target" ]; # enabled after workspace staged (Hetzner cutover)
  after = [ "network-online.target" "tailscaled.service" "claude-install.service" ];
  wants = [ "network-online.target" ];
  serviceConfig = {
    Type = "oneshot";
    RemainAfterExit = true;
    User = "loops";
    Group = "users";
    WorkingDirectory = "/srv/cc-ci/cc-ci";
    # Append one line to REBOOTS.md per genuine reboot (boot_id-gated; not on manual restart).
    ExecStartPre = "${pkgs.bash}/bin/bash /srv/cc-ci/cc-ci-plan/reboot-log.sh";
  };
  # CLAUDE_BIN points at the standalone CLI installed by claude-install.service; the loops
  # backend defaults to claude (persisted in .loop-backend). Without this, launch.py's preflight
  # `which(claude)` fails because the systemd `path` below has no /home/loops/.local/bin.
  environment = {
    RESUME_PHASE = "1";
    HOME = "/home/loops";
    CLAUDE_BIN = "/home/loops/.local/bin/claude";
  };
  path = [ pkgs.bash pkgs.tmux pkgs.git pkgs.python3 pkgs.openssh pkgs.nettools ];
  script = ''
    # Put the standalone claude/opencode binaries on PATH. On a cold boot this is the env the
    # tmux server (and thus every agent session) inherits, so bare `claude` resolves everywhere.
    export PATH="/home/loops/.local/bin:$PATH"
    # Explicit if/else instead of `A && B || C`: with the chained form a failing
    # `launch.sh start` also printed the "not staged" message and the unit still succeeded.
    if [ -x /srv/cc-ci/cc-ci-plan/launch.sh ]; then
      /srv/cc-ci/cc-ci-plan/launch.sh start
    else
      echo "workspace not staged yet skipping loop start"
    fi
  '';
};
# Weekly recipe upgrade — runs /upgrade-all over every enrolled recipe (opens recipe PRs
# verified by !testme, never merges). Replaces the boot-fragile busybox-crond-in-tmux from
# phase 5 §4 with a reboot-safe systemd timer. The service is timer-triggered only (NOT
# wantedBy multi-user.target) so it never runs on boot/activation — only on the schedule.
systemd.services.cc-ci-upgrade-all = {
  description = "cc-ci weekly /upgrade-all run (recipe upgrade survey + PRs, never merges)";
  after = [ "network-online.target" "tailscaled.service" "claude-install.service" ];
  wants = [ "network-online.target" ];
  serviceConfig = {
    Type = "oneshot"; # launch-upgrader.py spawns the cc-ci-upgrader tmux session and returns
    User = "loops";
    Group = "users";
    WorkingDirectory = "/srv/cc-ci";
  };
  # Same environment fix as cc-ci-loops: CLAUDE_BIN plus /home/loops/.local/bin on PATH.
  environment = {
    HOME = "/home/loops";
    CLAUDE_BIN = "/home/loops/.local/bin/claude";
  };
  path = [ pkgs.bash pkgs.tmux pkgs.git pkgs.python3 pkgs.openssh pkgs.nettools ];
  script = ''
    export PATH="/home/loops/.local/bin:$PATH"
    # Ensure the log directory exists; otherwise the redirect below fails before the
    # launcher even runs.
    ${pkgs.coreutils}/bin/mkdir -p /srv/cc-ci/.cc-ci-logs
    python3 /srv/cc-ci/cc-ci-plan/launch-upgrader.py start >> /srv/cc-ci/.cc-ci-logs/upgrader-cron.log 2>&1
  '';
};
# Schedule for the cc-ci-upgrade-all service: once a week, Sunday 02:00 UTC.
systemd.timers.cc-ci-upgrade-all = {
  wantedBy = [ "timers.target" ];
  description = "Weekly trigger for cc-ci-upgrade-all (Sundays 02:00 UTC)";
  # Catch up on a missed run: if the box was down at the scheduled time, run once on next boot.
  timerConfig.Persistent = true;
  timerConfig.OnCalendar = "Sun *-*-* 02:00:00 UTC";
};
}