From 9158a0f93b1459664defcb545477654aad14f7b9 Mon Sep 17 00:00:00 2001 From: Hermes Date: Fri, 15 May 2026 21:12:53 -0400 Subject: [PATCH] staging-vm-full-module --- .hermes/plans/staging-vm-ci-cd-plan.md | 136 ++++++++++++ assets/compose | 2 +- flake.nix | 1 + hosts/lazyworkhorse/configuration.nix | 4 +- modules/nixos/services/staging-vm.nix | 275 +++++++++++++++++++++++++ 5 files changed, 415 insertions(+), 3 deletions(-) create mode 100644 .hermes/plans/staging-vm-ci-cd-plan.md create mode 100644 modules/nixos/services/staging-vm.nix diff --git a/.hermes/plans/staging-vm-ci-cd-plan.md b/.hermes/plans/staging-vm-ci-cd-plan.md new file mode 100644 index 0000000..aa265d2 --- /dev/null +++ b/.hermes/plans/staging-vm-ci-cd-plan.md @@ -0,0 +1,136 @@ +# Infrastructure CI/CD + Staging Plan + +Date: 2026-05-12 +Status: Draft for review (updated) + +## Current State + +- Gitea Actions workflows exist (PR #21: build-ollama, build-hermes; PR #39: build-nixos) +- act_runner blocked by env var typo (GITEA_RUNNER_REGIS_TOKEN → GITEA_RUNNER_REGISTRATION_TOKEN) +- KVM unavailable currently (VT-x possibly disabled in BIOS) +- NixOS 26.05 on bare metal (Intel Xeon E5-2697 v4, 18 cores, 125GB RAM) +- Docker running: gitea, act_runner, nextcloud, synapse, traefik, etc. + +## Architecture Decision: KVM VM (after enabling VT-x in BIOS) + +Once Intel VT-x is enabled in BIOS, we run a proper KVM/QEMU virtual machine: + +``` +┌─────────────────────────────────────────────────┐ +│ Bare Metal Host (lazyworkhorse) │ +│ │ +│ ┌─────────────────┐ ┌─────────────────────┐ │ +│ │ Production │ │ Staging VM │ │ +│ │ Docker Compose │ │ KVM/QEMU │ │ +│ │ (gitea, nc, ...) 
│ │ 4 vCPU, 16GB RAM │ │ +│ │ /mnt/HoardCow/ │ │ 50GB virtual disk │ │ +│ └─────────────────┘ │ Own NixOS + Docker │ │ +│ │ Own volumes (isolated) │ │ +│ └─────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────┐ │ +│ │ act_runner (Docker) │ │ +│ │ → SSH deploy to staging VM │ │ +│ │ → Run tests against staging │ │ +│ └─────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────┘ +``` + +## Data Isolation (Critical) + +**Production data is NEVER exposed to staging.** + +- Staging VM gets its own 50GB virtual disk (QCOW2 image) +- All Docker volumes (DB data, uploads, config) live inside the VM's disk +- Host paths like `/mnt/HoardingCow_docker_data/` are NOT bind-mounted +- VM snapshots before major tests for fast rollback +- Even catastrophic staging failure cannot touch production data + +NixOS config approach: +```nix +# In hosts/staging/configuration.nix +let + dataRoot = "/var/lib/staging-docker"; # Inside VM disk +in { + virtualisation.oci-containers.containers = { + nextcloud = { + volumes = [ "${dataRoot}/nextcloud:/var/www/html" ]; + # Same image, same config, different volume path + }; + }; +} +``` + +## Implementation Phases + +### Phase 0: Enable KVM +1. Reboot server, enter BIOS, enable Intel Virtualization Technology (VT-x) +2. Boot into NixOS +3. Add to lazyworkhorse configuration.nix: + ```nix + boot.kernelModules = [ "kvm-intel" "kvm" ]; + virtualisation.libvirtd.enable = true; + users.users.ai-worker.extraGroups = [ "libvirtd" ]; + ``` +4. nixos-rebuild switch → reboot → verify `ls /dev/kvm` + +### Phase 1: Fix CI Runner +1. Fix env var typo in act_runner config +2. Merge PR #21 (workflows), #22 (runner), #39 (nixos CI) +3. Verify runner processes PR builds + +### Phase 2: Create Staging VM +1. Define VM with virsh: + - 4 vCPU, 16GB RAM, 50GB QCOW2 disk + - Bridge network (192.168.122.0/24 via libvirt default NAT) + - Install NixOS via nixos-anywhere or ISO +2. 
Deploy NixOS config to staging (imports same modules as production)
+3. Verify Docker and services come up in staging
+
+### Phase 3: CI Deploys to Staging
+1. CI builds config (`nix build .#nixosConfigurations.staging.config.system.build.toplevel`)
+2. CI deploys: `nixos-rebuild switch --flake .#staging --target-host root@192.168.122.X`
+3. CI runs health checks against staging services
+
+### Phase 4: Accumulate Tests
+1. Create `tests/` directory in infra repo
+2. Each new feature adds its test(s)
+3. All tests run on every PR
+4. Test categories:
+   - Container health (are all services running?)
+   - HTTP response (do endpoints return 200?)
+   - Integration (does feature X still work?)
+   - Regression (did change Y break Z?)
+
+### Phase 5: Auto-Rollback & Deploy
+1. Add auto-rollback to nixos-rebuild (TODO: confirm the option below exists in the targeted NixOS release; it is not a documented systemd-boot option as far as we know):
+   ```nix
+   boot.loader.systemd-boot.autoRollback = true;
+   ```
+2. Or script: switch → health check → rollback on failure
+3. Cron job for automatic nixos-rebuild on merged PRs
+4. Only deploy commits that passed staging CI
+
+## Test Suite Examples
+
+```bash
+# tests/containers_running.sh
+for container in gitea nextcloud synapse traefik; do
+  if ! ssh staging "docker ps --format '{{.Names}}' | grep -q $container"; then
+    echo "FAIL: $container not running"
+    exit 1
+  fi
+done
+
+# tests/endpoints.sh
+curl -sf http://192.168.122.50:3000 > /dev/null || exit 1  # Gitea
+curl -sf http://192.168.122.50:8080 > /dev/null || exit 1  # Nextcloud
+```
+
+## To Be Decided
+
+1. **VM resources**: 4 vCPU / 16GB RAM sufficient?
+2. **Network**: libvirt default NAT (192.168.122.0/24) or dedicated bridge?
+3. **VM disk**: 50GB enough for NixOS + Docker images + volumes?
+4. **Auto-merge**: full auto or with "safe-to-merge" label gate?
+5. **Test runner**: inline bash in Gitea Actions, or separate test script repo?
diff --git a/assets/compose b/assets/compose
index 6b82a26..f9fb28d 160000
--- a/assets/compose
+++ b/assets/compose
@@ -1 +1 @@
-Subproject commit 6b82a26c25f1592a2d1c9bea4f941864362fe001
+Subproject commit f9fb28d56078e7503516ac69307e862f3929c92b
diff --git a/flake.nix b/flake.nix
index 8f8b51a..6276626 100644
--- a/flake.nix
+++ b/flake.nix
@@ -61,6 +61,7 @@
           ./modules/nixos/services/open_code_server.nix
           ./modules/nixos/services/ollama_init_custom_models.nix
           ./modules/nixos/services/openclaw_node.nix
+          ./modules/nixos/services/staging-vm.nix
           ./modules/nixos/security/ai-worker-restricted.nix
           ./users/gortium.nix
           ./users/ai-worker.nix
diff --git a/hosts/lazyworkhorse/configuration.nix b/hosts/lazyworkhorse/configuration.nix
index 13baa96..8419b2e 100644
--- a/hosts/lazyworkhorse/configuration.nix
+++ b/hosts/lazyworkhorse/configuration.nix
@@ -340,9 +340,9 @@
   };
 
   # KVM/libvirt for staging VM
-  virtualisation.libvirtd.enable = true;
+  services.stagingVm.enable = true;
 
-# Open ports in the firewall.
+  # Open ports in the firewall.
   # networking.firewall.allowedTCPPorts = [ ... ];
   # networking.firewall.allowedUDPPorts = [ ... ];
   # Or disable the firewall altogether.
diff --git a/modules/nixos/services/staging-vm.nix b/modules/nixos/services/staging-vm.nix
new file mode 100644
index 0000000..91bf667
--- /dev/null
+++ b/modules/nixos/services/staging-vm.nix
@@ -0,0 +1,296 @@
+{ config, pkgs, lib, ... }:
+
+# Staging VM support for the host: libvirtd with the default NAT network and
+# storage pool, plus the `pr-test-vm` helper that builds and runs a throwaway
+# NixOS guest for compose PR testing.
+
+with lib;
+
+let
+  cfg = config.services.stagingVm;
+
+  # Host TCP port forwarded to sshd inside the guest.
+  sshPort = 2223;
+
+  # Host file whose public keys are installed for root and testrunner.
+  hostKeys = "/root/.ssh/authorized_keys";
+
+  virsh = "${pkgs.libvirt}/bin/virsh";
+
+  # Guest configuration. It is rendered into the store at evaluation time, so
+  # no shell quoting or predictable /tmp path is involved.
+  vmConfig = pkgs.writeText "staging-vm-config.nix" ''
+    { config, pkgs, lib, ... }: {
+      # Pins the runner name to bin/run-staging-vm.
+      networking.hostName = "staging";
+
+      # No boot loader device or file systems: the vm build supplies both.
+      boot.loader.timeout = 0;
+
+      # Minimal kernel
+      boot.kernelParams = [ "console=ttyS0" ];
+      boot.initrd.kernelModules = [ "virtio_blk" "virtio_net" "virtio_pci" ];
+
+      # Settings consumed by the QEMU runner of the vm build.
+      virtualisation.vmVariant.virtualisation = {
+        cores = ${toString cfg.vcpus};
+        memorySize = ${toString (toInt cfg.memory)}; # MB
+        diskSize = 20 * 1024; # MB
+        forwardPorts = [
+          { from = "host"; host.port = ${toString sshPort}; guest.port = 22; }
+        ];
+      };
+
+      # SSH access
+      services.openssh = {
+        enable = true;
+        settings.PasswordAuthentication = false;
+        settings.PermitRootLogin = "prohibit-password";
+      };
+
+      # Docker for compose testing
+      virtualisation.docker.enable = true;
+
+      # Network (DHCP from QEMU user-mode networking)
+      networking.useDHCP = true;
+      networking.firewall.enable = false;
+
+      # Users. The key file is read on the host when the guest is built.
+      users.users.root.openssh.authorizedKeys.keyFiles = [ ${hostKeys} ];
+      users.users.testrunner = {
+        isNormalUser = true;
+        extraGroups = [ "docker" ];
+        openssh.authorizedKeys.keyFiles = [ ${hostKeys} ];
+      };
+
+      # Git + compose tools
+      environment.systemPackages = with pkgs; [ git docker-compose curl ];
+
+      system.stateVersion = "24.11";
+    }
+  '';
+
+  # pr-test-vm: build, start, stop, destroy or SSH into the staging guest.
+  prTestVm = pkgs.writeShellScriptBin "pr-test-vm" ''
+    set -euo pipefail
+
+    DATA="${cfg.dataPath}"
+    VM_NAME="${cfg.vmName}"
+    VM_IMAGE="$DATA/$VM_NAME.qcow2"
+    VM_PORT=${toString sshPort}
+    PIDFILE="$DATA/$VM_NAME.pid"
+    CONSOLE_LOG="$DATA/$VM_NAME.console.log"
+    RUNNER="$DATA/vm-result/bin/run-staging-vm"
+
+    # True when the pidfile names a live process.
+    vm_running() {
+      [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null
+    }
+
+    build_vm() {
+      echo "==> Building NixOS staging VM for compose testing..."
+      if [ ! -r "${hostKeys}" ]; then
+        echo "error: ${hostKeys} is not readable; the guest needs SSH keys" >&2
+        exit 1
+      fi
+      ${config.nix.package}/bin/nix-build '<nixpkgs/nixos>' -A vm \
+        -I nixpkgs=channel:nixos-unstable \
+        -I nixos-config=${vmConfig} \
+        --out-link "$DATA/vm-result"
+      echo "==> VM built. Run 'pr-test-vm start' to boot."
+    }
+
+    start_vm() {
+      if vm_running; then
+        echo "==> VM already running."
+        exit 0
+      fi
+      if [ ! -x "$RUNNER" ]; then
+        echo "error: no built VM found; run 'pr-test-vm build' first" >&2
+        exit 1
+      fi
+      if [ -f "$VM_IMAGE" ]; then
+        echo "==> Booting existing VM..."
+      else
+        echo "==> Creating VM image..."
+      fi
+
+      # The runner creates the disk image on first boot. -daemonize cannot be
+      # combined with -nographic or a stdio console, so the serial console is
+      # logged to a file. QEMU_OPTS is word-split by the runner: dataPath and
+      # vmName must not contain whitespace.
+      rm -f "$PIDFILE"
+      NIX_DISK_IMAGE="$VM_IMAGE" \
+      QEMU_OPTS="-display none -serial file:$CONSOLE_LOG -pidfile $PIDFILE -daemonize" \
+        "$RUNNER"
+
+      echo "==> VM booting... SSH on port $VM_PORT"
+      echo "==> Wait for it: ssh -p $VM_PORT testrunner@localhost"
+    }
+
+    stop_vm() {
+      if [ -f "$PIDFILE" ]; then
+        kill "$(cat "$PIDFILE")" 2>/dev/null || true
+        rm -f "$PIDFILE"
+        echo "==> VM stopped."
+      else
+        echo "==> VM not running."
+      fi
+    }
+
+    ssh_vm() {
+      exec ssh -p "$VM_PORT" -o StrictHostKeyChecking=no \
+        -o UserKnownHostsFile=/dev/null "testrunner@localhost" "$@"
+    }
+
+    # Main dispatch
+    case "''${1:-help}" in
+      build) build_vm ;;
+      start) start_vm ;;
+      stop) stop_vm ;;
+      destroy) stop_vm; rm -f "$VM_IMAGE"; echo "==> VM deleted." ;;
+      ssh) shift; ssh_vm "$@" ;;
+      *)
+        echo "Usage: pr-test-vm {build|start|stop|destroy|ssh}"
+        echo ""
+        echo "  build   — build the NixOS VM derivation"
+        echo "  start   — boot the VM (create image if needed)"
+        echo "  stop    — graceful VM shutdown"
+        echo "  destroy — stop + delete VM image"
+        echo "  ssh     — SSH into the running VM"
+        ;;
+    esac
+  '';
+in
+{
+  options.services.stagingVm = {
+    enable = mkOption {
+      type = types.bool;
+      default = false;
+      description = "Enable KVM/libvirt staging VM for compose PR testing";
+    };
+
+    vmName = mkOption {
+      type = types.str;
+      default = "compose-test-vm";
+      description = "Name of the staging VM";
+    };
+
+    memory = mkOption {
+      type = types.str;
+      default = "4096";
+      description = "RAM allocated to the staging VM (MB)";
+    };
+
+    vcpus = mkOption {
+      type = types.int;
+      default = 2;
+      description = "Number of vCPUs for the staging VM";
+    };
+
+    storagePath = mkOption {
+      type = types.str;
+      default = "/var/lib/libvirt/images";
+      description = "Path for libvirt storage pool";
+    };
+
+    dataPath = mkOption {
+      type = types.str;
+      default = "/var/lib/staging-vm";
+      description = "Path for compose test data (PR checkouts, test results)";
+    };
+  };
+
+  config = mkIf cfg.enable {
+    # Enable libvirt daemon
+    virtualisation.libvirtd = {
+      enable = true;
+      qemu = {
+        package = pkgs.qemu_kvm;
+        runAsRoot = true;
+        swtpm.enable = true;
+        # NOTE(review): the qemu.ovmf settings were dropped; recent NixOS
+        # releases removed that option because OVMF images ship with QEMU.
+        # Confirm against the release this host tracks.
+      };
+    };
+
+    # Kernel modules + groups already handled in configuration.nix
+
+    # libvirt NAT network (192.168.122.0/24). mode makes /etc hold a regular
+    # file rather than a store symlink. No <mac> element is given, so libvirt
+    # picks the bridge MAC address itself.
+    environment.etc."libvirt/qemu/networks/default.xml" = {
+      text = ''
+        <network>
+          <name>default</name>
+          <uuid>2b8f7a3c-9e5d-4a1f-bc3d-6e7a8f9b0c1d</uuid>
+          <forward mode="nat">
+            <nat>
+              <port start="1024" end="65535"/>
+            </nat>
+          </forward>
+          <bridge name="virbr0" stp="on" delay="0"/>
+          <ip address="192.168.122.1" netmask="255.255.255.0">
+            <dhcp>
+              <range start="192.168.122.2" end="192.168.122.254"/>
+            </dhcp>
+          </ip>
+        </network>
+      '';
+      mode = "0644";
+    };
+
+    # Define, autostart and start the default network and storage pool once
+    # libvirtd is up. Every step is best effort so that an already defined or
+    # already active object does not fail the unit.
+    systemd.services.libvirtd.postStart = mkAfter ''
+      ${virsh} net-define /etc/libvirt/qemu/networks/default.xml 2>/dev/null || true
+      ${virsh} net-autostart default 2>/dev/null || true
+      ${virsh} net-start default 2>/dev/null || true
+      ${virsh} pool-define-as default dir --target "${cfg.storagePath}" 2>/dev/null || true
+      ${virsh} pool-autostart default 2>/dev/null || true
+      ${virsh} pool-start default 2>/dev/null || true
+    '';
+
+    # Storage directory for VM images, plus the helper's data directories
+    systemd.tmpfiles.rules = [
+      "d ${cfg.storagePath} 0755 root root -"
+      "d ${cfg.dataPath} 0755 root root -"
+      "d ${cfg.dataPath}/scripts 0755 root root -"
+    ];
+
+    # Firewall: DHCP/DNS from guests on virbr0 to the host, and forwarding
+    # for guest traffic. extraCommands runs again on every firewall reload,
+    # so each rule is inserted only when it is not present yet.
+    networking.firewall = {
+      interfaces.virbr0 = {
+        allowedUDPPorts = [ 53 67 68 ];
+        allowedTCPPorts = [ 53 ];
+      };
+      extraCommands = ''
+        for rule in "-i virbr0" "-o virbr0"; do
+          iptables -C FORWARD $rule -j ACCEPT 2>/dev/null \
+            || iptables -I FORWARD $rule -j ACCEPT
+        done
+      '';
+      extraStopCommands = ''
+        iptables -D FORWARD -i virbr0 -j ACCEPT 2>/dev/null || true
+        iptables -D FORWARD -o virbr0 -j ACCEPT 2>/dev/null || true
+      '';
+    };
+
+    # Packages needed for VM management, plus the pr-test-vm helper
+    environment.systemPackages = with pkgs; [
+      libvirt
+      qemu_kvm
+      virt-manager # optional GUI for manual management
+      OVMFFull
+      swtpm
+      prTestVm
+    ];
+
+    # Enable docker in the host (already enabled, but ensure for compose testing)
+    virtualisation.docker.enable = true;
+  };
+}