diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..ae4d17c
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,29 @@
+{
+  description = "cc-ci-orchestrator — NixOS host for the cc-ci loops runtime (Builder/Adversary/Watchdog)";
+
+  inputs = {
+    # Pinned to the same revision as the cc-ci server for ecosystem consistency.
+    nixpkgs.url = "github:NixOS/nixpkgs/50ab793786d9de88ee30ec4e4c24fb4236fc2674";
+
+    # Same pin as cc-ci server (buildGo125Module compatibility with nixpkgs 24.11).
+    sops-nix.url = "github:Mic92/sops-nix/77c423a03b9b2b79709ea2cb63336312e78b72e2";
+    sops-nix.inputs.nixpkgs.follows = "nixpkgs";
+  };
+
+  outputs = { nixpkgs, sops-nix, ... }:
+    let
+      system = "x86_64-linux";
+    in
+    {
+      # Hetzner cpx11 host (nixos-infect generated hardware.nix + orchestrator config).
+      # Provision with terraform/ then run Stage 2 per terraform/README.md.
+      nixosConfigurations.cc-ci-orchestrator-hetzner = nixpkgs.lib.nixosSystem {
+        inherit system;
+        modules = [
+          sops-nix.nixosModules.sops
+          ./nix/hosts/cc-ci-orchestrator-hetzner/hardware.nix
+          ./nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix
+        ];
+      };
+    };
+}
diff --git a/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix b/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix
new file mode 100644
index 0000000..0d5ca61
--- /dev/null
+++ b/nix/hosts/cc-ci-orchestrator-hetzner/configuration.nix
@@ -0,0 +1,140 @@
+# cc-ci-orchestrator-hetzner — NixOS config for the Hetzner loops runtime host.
+#
+# Purpose: run the cc-ci Builder/Adversary/Watchdog loops + orchestrator/assistant sessions
+# on a Hetzner cpx11 (2 vCPU / 2 GB shared AMD / 40 GB NVMe), replacing the slow b1 Incus VM.
+#
+# Provision with terraform/ then converge with: nixos-rebuild switch --flake .#cc-ci-orchestrator-hetzner
+# See terraform/README.md for the full Stage 2 procedure.
+{ config, pkgs, lib, ... }:
+{
+  # hardware.nix is the nixos-infect generated hardware-configuration.nix (see README Stage 2a).
+
+  # Root may log in over SSH with a key only; the Stage 2 procedure never needs a root password.
+  # NOTE(review): no root authorized key is declared in this config — confirm key access
+  # survives the switch away from the nixos-infect config (tailscale --ssh stays available).
+  services.openssh = { enable = true; settings.PermitRootLogin = "prohibit-password"; };
+  networking.useDHCP = true;
+  networking.nameservers = [ "1.1.1.1" "8.8.8.8" ];
+  networking.firewall = {
+    enable = true;
+    trustedInterfaces = [ "tailscale0" ];
+    allowedTCPPorts = [ 22 ];
+  };
+  nix.settings.experimental-features = [ "nix-command" "flakes" ];
+  system.stateVersion = "24.11";
+
+  # Tailscale — auth key at /etc/ts-auth-key (placed manually in Stage 2, not in git).
+  services.tailscale = {
+    enable = true;
+    authKeyFile = "/etc/ts-auth-key";
+    extraUpFlags = [ "--hostname=cc-ci-orchestrator" "--ssh" ];
+  };
+
+  # 4 GB disk swap — claude session memory safety net (2 GB RAM is tight for 3+ sessions).
+  swapDevices = [ { device = "/swapfile"; size = 4096; } ];
+
+  # nix-ld — lets the standalone Claude Code CLI (foreign dynamic ELF / Bun) run on NixOS.
+  programs.nix-ld.enable = true;
+  programs.nix-ld.libraries = with pkgs; [
+    stdenv.cc.cc.lib
+    zlib
+    openssl
+    curl
+    glibc
+  ];
+
+  environment.systemPackages = with pkgs; [
+    git tmux python3 jq curl cacert
+    gnused gawk coreutils gnugrep findutils util-linux
+    nettools openssh
+    age sops  # key management (same toolchain as cc-ci server)
+  ];
+
+  # loops user — claude sessions run as non-root (--dangerously-skip-permissions blocked for root).
+  users.users.loops = {
+    isNormalUser = true;
+    home = "/home/loops";
+    shell = pkgs.bash;
+    extraGroups = [ "wheel" ];
+  };
+  security.sudo.wheelNeedsPassword = false;
+  security.sudo.extraRules = [{
+    users = [ "loops" ];
+    commands = [{ command = "ALL"; options = [ "NOPASSWD" ]; }];
+  }];
+
+  # Ensure /home/loops/.local/bin (claude) is on the loops user PATH.
+  environment.variables.PATH = lib.mkForce
+    "/home/loops/.local/bin:/run/current-system/sw/bin:/run/wrappers/bin:/usr/bin:/bin";
+
+  # SSH config for the loops user — points to the cc-ci Hetzner server via tailnet.
+  # HostName is updated post-cutover to the Hetzner cc-ci tailnet IP.
+  system.activationScripts.loopsSshConfig = {
+    # Ordered after the "users" activation step: on first activation the loops
+    # account must already exist, otherwise the chown calls below fail.
+    deps = [ "users" ];
+    text = ''
+      mkdir -p /home/loops/.ssh && chown loops:users /home/loops/.ssh && chmod 700 /home/loops/.ssh
+      # Only write if not already present (preserves manual customisation).
+      if [ ! -f /home/loops/.ssh/config ]; then
+        cat > /home/loops/.ssh/config <<'SSHCFG'
+Host cc-ci
+  HostName REPLACE_WITH_CC_CI_HETZNER_TAILNET_IP
+  User root
+  IdentityFile /home/loops/.ssh/cc-ci-root-ed25519
+  IdentitiesOnly yes
+  StrictHostKeyChecking accept-new
+  ServerAliveInterval 30
+SSHCFG
+        chmod 600 /home/loops/.ssh/config
+        chown loops:users /home/loops/.ssh/config
+      fi
+    '';
+  };
+
+  # claude-install — fetch the standalone Claude Code CLI for the loops user if missing.
+  systemd.services.claude-install = {
+    description = "Install Claude Code CLI for loops user (idempotent)";
+    wantedBy = [ "multi-user.target" ];
+    after = [ "network-online.target" ];
+    wants = [ "network-online.target" ];
+    serviceConfig = {
+      Type = "oneshot"; RemainAfterExit = true;
+      User = "loops"; Group = "users";
+    };
+    environment = { HOME = "/home/loops"; };
+    path = [ pkgs.curl pkgs.bash pkgs.coreutils pkgs.gnutar pkgs.gzip ];
+    script = ''
+      if [ ! -x "$HOME/.local/bin/claude" ]; then
+        echo "installing Claude Code CLI for loops user..."
+        curl -fsSL https://claude.ai/install.sh | bash || echo "install failed — retry on next activation"
+      fi
+    '';
+  };
+
+  # cc-ci-loops supervisor — defined but NOT enabled until workspace is staged.
+  # Enable by adding wantedBy after staging (Stage 2e) for reboot-resilience.
+  systemd.services.cc-ci-loops = {
+    description = "cc-ci Builder/Adversary loops + watchdog (launch.sh start)";
+    # wantedBy = [ "multi-user.target" ];  # uncomment after workspace is staged
+    after = [ "network-online.target" "tailscaled.service" "claude-install.service" ];
+    wants = [ "network-online.target" ];
+    serviceConfig = {
+      Type = "oneshot"; RemainAfterExit = true;
+      User = "loops"; Group = "users";
+      WorkingDirectory = "/srv/cc-ci";
+    };
+    environment = { RESUME_PHASE = "1"; HOME = "/home/loops"; };
+    path = [ pkgs.bash pkgs.tmux pkgs.git pkgs.python3 pkgs.openssh pkgs.nettools ];
+    script = ''
+      # Explicit if/else instead of a chained "test && start || echo": with the
+      # chained form a failing launch.sh was reported as "workspace not staged"
+      # and the unit still exited 0. A launch failure now fails the unit.
+      if [ -x /srv/cc-ci/cc-ci-plan/launch.sh ]; then
+        /srv/cc-ci/cc-ci-plan/launch.sh start
+      else
+        echo "workspace not staged yet — skipping loop start"
+      fi
+    '';
+  };
+}
diff --git a/nix/hosts/cc-ci-orchestrator-hetzner/hardware.nix b/nix/hosts/cc-ci-orchestrator-hetzner/hardware.nix
new file mode 100644
index 0000000..a6d0a46
--- /dev/null
+++ b/nix/hosts/cc-ci-orchestrator-hetzner/hardware.nix
@@ -0,0 +1,21 @@
+# PLACEHOLDER — replace with the output of:
+#   ssh root@<server_ipv4> 'cat /etc/nixos/hardware-configuration.nix'
+# after nixos-infect completes. See terraform/README.md Stage 2a.
+#
+# A typical Hetzner cpx11 nixos-infect hardware.nix looks like:
+#
+# { config, lib, pkgs, modulesPath, ... }: {
+#   imports = [ (modulesPath + "/profiles/qemu-guest.nix") ];
+#   boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
+#   boot.initrd.kernelModules = [ ];
+#   boot.kernelModules = [ "kvm-amd" ];
+#   boot.extraModulePackages = [ ];
+#   fileSystems."/" = { device = "/dev/sda1"; fsType = "ext4"; };
+#   boot.loader.grub.enable = true;
+#   boot.loader.grub.device = "/dev/sda";
+#   swapDevices = [ ];
+#   nixpkgs.hostPlatform = "x86_64-linux";
+# }
+#
+# Do not commit this placeholder — replace it with the real hardware-configuration.nix.
+throw "Replace this placeholder with the real nixos-infect hardware-configuration.nix"
diff --git a/terraform/README.md b/terraform/README.md
new file mode 100644
index 0000000..b597b16
--- /dev/null
+++ b/terraform/README.md
@@ -0,0 +1,123 @@
+# terraform — Hetzner cc-ci-orchestrator server
+
+Provisions a Hetzner **cpx11** (2 vCPU / 2 GB shared AMD / 40 GB NVMe) for the cc-ci loops
+runtime (Builder + Adversary + Watchdog + Orchestrator sessions), replacing the slow b1 Incus VM.
+Uses nixos-infect to convert Debian → NixOS, then converges via the cc-ci-orchestrator flake.
+
+---
+
+## Stage 1 — provision the server
+
+```bash
+# from /srv/cc-ci/terraform/
+source /srv/cc-ci/.testenv  # loads HCLOUD_TOKEN
+export TF_VAR_ssh_public_key="$(cat /home/loops/.ssh/cc-ci-root-ed25519.pub)"
+
+tofu init
+tofu plan
+tofu apply
+```
+
+Note the `server_ipv4` output. nixos-infect runs on first boot — wait ~5 min, then:
+
+```bash
+# confirm NixOS is up (may need to retry while infect reboots)
+ssh root@<server_ipv4> 'nixos-version'
+```
+
+---
+
+## Stage 2 — converge to cc-ci-orchestrator-hetzner
+
+### 2a. Capture hardware config
+
+```bash
+ssh root@<server_ipv4> 'cat /etc/nixos/hardware-configuration.nix'
+```
+
+Copy the output to `nix/hosts/cc-ci-orchestrator-hetzner/hardware.nix` in this repo, commit, push.
+
+### 2b. Stage workspace on the new server
+
+```bash
+ssh root@<server_ipv4>
+
+# Install Tailscale auth key (from .testenv TS_AUTH_KEY)
+echo "<TS_AUTH_KEY>" > /etc/ts-auth-key && chmod 600 /etc/ts-auth-key
+
+# Clone this repo as the loops user workspace
+git clone --recursive \
+  https://autonomic-bot:<GITEA_TOKEN>@git.autonomic.zone/recipe-maintainers/cc-ci-orchestrator.git \
+  /srv/cc-ci-orch
+ln -sfn /srv/cc-ci-orch /srv/cc-ci  # loops expect /srv/cc-ci
+
+# Place master age key (copied from current VM .sops/master-age.txt)
+mkdir -p /srv/cc-ci/.sops
+scp loops@<current-vm>:/srv/cc-ci/.sops/master-age.txt /srv/cc-ci/.sops/master-age.txt
+chmod 600 /srv/cc-ci/.sops/master-age.txt
+```
+
+### 2c. Run nixos-rebuild
+
+```bash
+# on the new server
+cd /srv/cc-ci
+nixos-rebuild switch --flake .#cc-ci-orchestrator-hetzner
+```
+
+### 2d. Stage credentials (not in git — placed once)
+
+```bash
+# SSH key for reaching cc-ci
+mkdir -p /home/loops/.ssh && chmod 700 /home/loops/.ssh
+# scp cc-ci-root-ed25519 from current VM or copy content
+chmod 600 /home/loops/.ssh/cc-ci-root-ed25519
+
+# .testenv (GITEA creds, etc.)
+cp /path/to/.testenv /srv/cc-ci/.testenv && chmod 600 /srv/cc-ci/.testenv
+```
+
+### 2e. Auth claude and start loops
+
+```bash
+# as loops user on new server
+sudo -u loops /home/loops/.local/bin/claude auth login  # device code — operator step
+
+# start the loops
+cd /srv/cc-ci && sudo -u loops ./cc-ci-plan/launch.sh start
+```
+
+### 2f. Verify
+
+```bash
+sudo -u loops tmux ls  # should show cc-ci-builder, cc-ci-adv, cc-ci-watchdog
+```
+
+---
+
+## Cutover
+
+Once the new server is running and the loops are verified:
+
+1. Update the `Host cc-ci` entry in the current VM's `/home/loops/.ssh/config` if needed
+2. Stop the old Incus VM (or just leave it idle — it costs nothing in disk)
+
+---
+
+## Variables
+
+| Variable | Default | Notes |
+|---|---|---|
+| `location` | `nbg1` | Nuremberg |
+| `server_type` | `cpx11` | 2 vCPU / 2 GB shared AMD. Upgrade to `cpx21` (4 GB) if OOM. |
+| `image` | `debian-12` | nixos-infect base |
+| `server_name` | `cc-ci-orchestrator` | |
+| `ssh_public_key` | required | Pass via `TF_VAR_ssh_public_key` |
+
+---
+
+## State
+
+`terraform.tfstate` and `terraform.tfstate.backup` are gitignored. Keep the state file locally or
+in a remote backend — losing it means `tofu destroy` can't find the server (use `tofu import` to
+recover, or delete directly via the Hetzner console).
diff --git a/terraform/main.tf b/terraform/main.tf
new file mode 100644
index 0000000..0fddafa
--- /dev/null
+++ b/terraform/main.tf
@@ -0,0 +1,32 @@
+resource "hcloud_ssh_key" "cc_ci_orch" {
+  name       = "cc-ci-orchestrator-deploy"
+  public_key = var.ssh_public_key
+
+  labels = {
+    project = "cc-ci-orchestrator"
+    managed = "terraform"
+  }
+}
+
+resource "hcloud_server" "cc_ci_orch" {
+  name        = var.server_name
+  server_type = var.server_type
+  image       = var.image
+  location    = var.location
+  ssh_keys    = [hcloud_ssh_key.cc_ci_orch.id]
+
+  # Stage 1: cloud-init runs nixos-infect on first boot, converting Debian to NixOS, then reboots.
+  # Wait ~5 min after apply, then SSH in and run Stage 2 per README.md.
+  user_data = file("${path.module}/user-data.sh")
+
+  public_net {
+    ipv4_enabled = true
+    ipv6_enabled = false
+  }
+
+  labels = {
+    project = "cc-ci-orchestrator"
+    managed = "terraform"
+    stage   = "infect"
+  }
+}
diff --git a/terraform/outputs.tf b/terraform/outputs.tf
new file mode 100644
index 0000000..de5c861
--- /dev/null
+++ b/terraform/outputs.tf
@@ -0,0 +1,19 @@
+output "server_ipv4" {
+  description = "Public IPv4 address of the cc-ci-orchestrator Hetzner server"
+  value       = hcloud_server.cc_ci_orch.ipv4_address
+}
+
+output "server_id" {
+  description = "Hetzner internal server ID"
+  value       = hcloud_server.cc_ci_orch.id
+}
+
+output "ssh_connect" {
+  description = "SSH command to connect as root (after nixos-infect)"
+  value       = "ssh root@${hcloud_server.cc_ci_orch.ipv4_address}"
+}
+
+output "nixos_infect_log" {
+  description = "Check infect progress"
+  value       = "ssh root@${hcloud_server.cc_ci_orch.ipv4_address} 'cat /var/log/nixos-infect.log'"
+}
diff --git a/terraform/user-data.sh b/terraform/user-data.sh
new file mode 100644
index 0000000..5068fb2
--- /dev/null
+++ b/terraform/user-data.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Stage 1 — convert Debian 12 → NixOS via nixos-infect (pinned revision).
+#
+# nixos-infect generates /etc/nixos/{configuration.nix,hardware-configuration.nix,networking.nix}
+# with Hetzner-correct bootloader (GRUB) and networking, then reboots into NixOS.
+#
+# After the reboot SSH as root is available. Run Stage 2 per terraform/README.md.
+# Logs: /var/log/nixos-infect.log
+
+set -euo pipefail
+
+# Same pinned revision as the cc-ci server terraform (2026-03-22).
+INFECT_SHA="40f62a680bb0e8f2f607d79abfaaecd99d59401c"
+
+export NIX_CHANNEL="nixos-24.11"
+# NOTE(review): upstream nixos-infect documents PROVIDER=hetznercloud for Hetzner
+# Cloud — confirm that "hetzner" is recognised at the pinned revision.
+export PROVIDER="hetzner"
+export NIXOS_IMPORT=""
+
+curl -fsSL "https://raw.githubusercontent.com/elitak/nixos-infect/${INFECT_SHA}/nixos-infect" \
+  | bash -x 2>&1 | tee /var/log/nixos-infect.log
diff --git a/terraform/variables.tf b/terraform/variables.tf
new file mode 100644
index 0000000..93ece2b
--- /dev/null
+++ b/terraform/variables.tf
@@ -0,0 +1,38 @@
+variable "location" {
+  description = "Hetzner datacenter (nbg1=Nuremberg, fsn1=Falkenstein, hel1=Helsinki)"
+  type        = string
+  default     = "nbg1"
+}
+
+variable "server_type" {
+  description = <<-EOT
+    Hetzner server type. Must be x86 — the flake is x86_64-linux; NEVER use cax* (ARM).
+    cpx11 = AMD 2 vCPU / 2 GB (default; shared vCPU, NVMe — the orchestrator loops runtime).
+    cpx21 = AMD 3 vCPU / 4 GB (upgrade if claude sessions OOM under cpx11).
+    cx22 = 2 vCPU / 4 GB (shared vCPU, cheaper alternative with more RAM).
+  EOT
+  type        = string
+  default     = "cpx11"
+
+  validation {
+    condition     = !startswith(var.server_type, "cax")
+    error_message = "ARM server types (cax*) are not supported — the flake is x86_64-linux only."
+  }
+}
+
+variable "image" {
+  description = "Base OS image. nixos-infect supports debian-12 and ubuntu-24.04. debian-12 preferred."
+  type        = string
+  default     = "debian-12"
+}
+
+variable "ssh_public_key" {
+  description = "SSH public key content (the full line). Registered with Hetzner for root access post-infect. Pass via TF_VAR_ssh_public_key."
+  type        = string
+}
+
+variable "server_name" {
+  description = "Hetzner server name and initial NixOS hostname"
+  type        = string
+  default     = "cc-ci-orchestrator"
+}
diff --git a/terraform/versions.tf b/terraform/versions.tf
new file mode 100644
index 0000000..875220b
--- /dev/null
+++ b/terraform/versions.tf
@@ -0,0 +1,16 @@
+terraform {
+  # >= 1.3 because variables.tf calls startswith() in a validation block.
+  required_version = ">= 1.3"
+  required_providers {
+    hcloud = {
+      source  = "hetznercloud/hcloud"
+      version = "1.64.0"
+    }
+  }
+}
+
+# The hcloud provider reads HCLOUD_TOKEN from the environment automatically.
+# Never put the token value in any .tf file or .tfvars — keep it in the shell
+# environment (export HCLOUD_TOKEN=...). No hcloud_token variable is declared
+# in this module, so TF_VAR_hcloud_token would have no effect.
+provider "hcloud" {}