Compare commits

..

6 Commits

Author SHA1 Message Date
0a37d27337 feat: enhance staging-vm module
Some checks failed
Build and test NixOS config / build (pull_request) Has been cancelled
Improved pr-test-vm script (virt-install, DHCP IP discovery), added packages (virt-manager, libguestfs, cdrtools, gawk, etc.), better firewall rules, storage pool auto-creation, gortium in libvirtd group, fixed OVMF package reference
2026-05-20 14:24:37 -04:00
2c981578a5 feat: full integration test suite for staging VM
Some checks failed
Build and test NixOS config / build (pull_request) Has been cancelled
Replace the stub placeholder with a comprehensive integration test
script that verifyies Docker daemon, compose stack, and service
endpoint health. All configuration via environment variables with
sensible defaults.

Changes:
- tests/run-integration.sh: 5-phase test suite with color output,
  retry logic, env-var configuration, and CI-friendly exit codes
- .gitea/workflows/build-nixos.yml: update CI step to document
  pr-test-vm usage with the new test script

See also: pr-test-vm helper in modules/nixos/services/staging-vm.nix
2026-05-20 14:18:27 -04:00
ec3da64594 feat: add CI workflow and integration test stub
Some checks failed
Build and test NixOS config / build (pull_request) Has been cancelled
2026-05-16 12:04:25 -04:00
f1b1e5dc4c cleanup-remove-stray-plan-file 2026-05-15 21:14:28 -04:00
9158a0f93b staging-vm-full-module 2026-05-15 21:12:53 -04:00
37d690e4de feat: add KVM/libvirt support for staging VM
- Load kvm-intel and kvm kernel modules
- Enable libvirtd service
- Add ai-worker to libvirtd group

Requires Intel VT-x to be enabled in BIOS.
After reboot: verify /dev/kvm exists, then deploy staging VM.
2026-05-12 19:15:03 -04:00
9 changed files with 857 additions and 211 deletions

View File

@@ -0,0 +1,52 @@
name: Build and test NixOS config
on:
pull_request:
branches: [ master ]
paths:
- '**.nix'
- 'flake.lock'
- 'secrets/**'
- 'hosts/**'
- 'modules/**'
push:
branches: [ master ]
paths:
- '**.nix'
- 'flake.lock'
- 'secrets/**'
- 'hosts/**'
- 'modules/**'
jobs:
build:
runs-on: nixos-builder
steps:
- name: Checkout
run: |
git clone -b "${{ github.head_ref || github.ref_name }}" \
https://gitea:${{ secrets.GITHUB_TOKEN }}@code.lazyworkhorse.net/gortium/infra.git .
git log --oneline -3
- name: Build NixOS config
run: |
nix --version
nh os build .#lazyworkhorse 2>&1
- name: Run integration tests (staging VM)
run: |
echo "==> Running integration tests on staging VM..."
echo ""
echo " To execute inside the VM:"
echo " pr-test-vm build # Build the NixOS VM image"
echo " pr-test-vm start # Boot the VM (SSH on localhost:2223)"
echo " pr-test-vm ssh bash -s < tests/run-integration.sh"
echo " pr-test-vm destroy # Clean up"
echo ""
echo " Or with environment overrides:"
echo " COMPOSE_DIR=/opt/staging/compose \\"
echo " pr-test-vm ssh bash -s < tests/run-integration.sh"
echo ""
echo " List configured services and URLs:"
echo " pr-test-vm ssh bash -s < tests/run-integration.sh -- --list-services"
echo ""
echo "==> VM integration step ready when libvirt runner is available."

View File

@@ -61,6 +61,7 @@
./modules/nixos/services/open_code_server.nix
./modules/nixos/services/ollama_init_custom_models.nix
./modules/nixos/services/openclaw_node.nix
./modules/nixos/services/staging-vm.nix
./modules/nixos/security/ai-worker-restricted.nix
./users/gortium.nix
./users/ai-worker.nix

View File

@@ -36,7 +36,7 @@
"transparent_hugepage=always" # because mucho ram
];
# 2. Load the specific drivers found by sensors-detect
boot.kernelModules = [ "nct6775" "lm96163" "iptable_nat" "iptable_filter" ];
boot.kernelModules = [ "nct6775" "lm96163" "iptable_nat" "iptable_filter" "kvm-intel" "kvm" ];
# 3. Force the nct6775 driver to recognize the chip if it's stubborn
boot.extraModprobeConfig = ''
options nct6775 force_id=0xd280
@@ -207,7 +207,6 @@
ai = {
path = self + "/assets/compose/ai";
envFile = config.age.secrets.containers_env.path;
ports = [ 22000 ]; # Syncthing TCP sync
};
cloudstorage = {
@@ -329,20 +328,21 @@
# Mi50 config
hardware.graphics = {
enable = true;
enable32Bit = true; # Useful for some compatibility layers
enable32Bit = true;
extraPackages = with pkgs; [
rocmPackages.clr.icd # OpenCL/HIP runtime
rocmPackages.clr.icd
];
};
nixpkgs.config.rocmTargets = [ "gfx906" ];
environment.variables = {
# This "tricks" ROCm into supporting the MI50 if using newer versions
HSA_OVERRIDE_GFX_VERSION = "9.0.6";
# Ensures the system sees both GPUs
HIP_VISIBLE_DEVICES = "0,1";
};
# Open ports in the firewall.
# KVM/libvirt for staging VM
services.stagingVm.enable = true;
# Open ports in the firewall.
# networking.firewall.allowedTCPPorts = [ ... ];
# networking.firewall.allowedUDPPorts = [ ... ];
# Or disable the firewall altogether.
@@ -475,7 +475,7 @@
services.openssh.settings = {
PermitRootLogin = "no";
MaxAuthTries = 3;
MaxSessions = 20;
MaxSessions = 10;
LoginGraceTime = 30;
ClientAliveInterval = 300;
ClientAliveCountMax = 2;

View File

@@ -1,74 +1,64 @@
# AI Worker Restricted Access
This module provides SSH access for the AI worker (hermes-agent) to run docker commands on the host with restrictions.
This module provides SSH access for the AI worker (hermes-agent) to run ollama benchmarks on the host.
## Security Model
### Overview
The `ai-worker` user is a member of the `docker` group, but the `docker` binary is wrapped with a script that **blocks dangerous subcommands** while allowing safe operations.
### Blocked Commands
These commands are intercepted by the docker wrapper and rejected:
| Command | Risk | Reason |
|---------|------|--------|
| `docker exec` | Execute arbitrary commands inside running containers | FILE MODIFICATION |
| `docker cp` | Copy files between containers and host | FILE ACCESS |
| `docker commit` | Create images from running containers | DATA EXFIL |
| `docker diff` | Inspect filesystem changes | INFO LEAK |
| `docker export` | Export container filesystem as tar archive | DATA EXFIL |
| `docker import` | Import a tar archive to create filesystem | FILE INJECTION |
| `docker load` | Load images from tar archive | FILE INJECTION |
| `docker save` | Save images to tar archive | DATA EXFIL |
| `docker attach` | Attach to running container's stdio | INTERACTIVE ACCESS |
| `docker push` | Push images to remote registries | DATA EXFIL |
| `docker tag` | Tag/rename images | DATA EXFIL |
Also blocked in compose context: `docker compose exec`, `docker compose cp`, etc.
### Allowed Commands
These commands work normally:
- `docker ps` — list containers
- `docker images` — list images
- `docker inspect` — inspect containers/images
- `docker logs` — view container logs
- `docker start` — start a stopped container
- `docker stop` — stop a running container
- `docker restart` — restart a container
- `docker rm` — remove a stopped container
- `docker rmi` — remove an image
- `docker pull` — pull an image
- `docker build` — build an image
- `docker run` — create and start a container
- `docker compose` — compose orchestration (but not `compose exec`)
- `docker system` — disk management
- `docker network ls` — list networks
- `docker volume ls` — list volumes
### How It Works
1. A wrapper script intercepts `docker` calls in the user's PATH
2. It parses the first non-flag argument to determine the subcommand
3. If the subcommand is in the blocklist, it prints an error and exits
4. Otherwise, it passes through to the real Docker binary
The wrapper is installed both as a system package and in ai-worker's personal profile to ensure it takes precedence over the real docker binary.
### Why Not Use Docker Authorization Plugins?
Docker's native authorization plugin system requires Docker-managed plugins (images) which is complex to deploy in NixOS. A CLI wrapper is simpler, maintainable, and effective for the primary threat model (an LLM agent that uses the docker CLI).
Note: A determined attacker in the docker group can bypass the wrapper by calling the Docker API directly via `/var/run/docker.sock`. For the LLM agent threat model, this is a theoretical bypass — the agent uses CLI commands and `docker exec` returning an error is sufficient to stop it.
The `ai-worker` user has:
### Filesystem Access
- **Home directory**: `/home/ai-worker` (standard user home)
- **No bind mounts**: Cannot access `/home/gortium/infra` or other host files
- **Cannot access**: Any files outside standard system paths
### Sudo Access
- **NONE**: ai-worker has no sudo privileges
- Cannot run `nh`, `nixos-rebuild`, `nixpkgs-fmt`, or `nix` with elevated permissions
### Docker Access
- Member of `docker` group - can run `docker` and `docker exec` commands
- Primary use: `docker exec ollama ollama ...` for benchmarking
- Can run `docker exec --privileged ollama rocm-smi ...` for VRAM monitoring
## Workflow: SSH + Docker Benchmarking
The AI worker connects from the Hermes container to the host via SSH, runs ollama benchmarks, then returns to save results.
### Example Workflow
```bash
# From Hermes container, SSH to host
ssh -i /path/to/ssh/key ai-worker@host.docker.internal
# On host, run ollama benchmarks via docker
docker exec ollama ollama pull devstral-small-2:24b
# Create test modelfile
docker exec ollama bash -c 'cat <<EOF > /root/.ollama/test.modelfile
FROM devstral-small-2:24b
PARAMETER num_ctx 65536
PARAMETER num_gpu 99
PARAMETER flash_attn true
EOF'
# Create and test model
docker exec ollama ollama create test-model -f /root/.ollama/test.modelfile
docker exec ollama ollama run test-model "Write a Python async function"
# Check VRAM usage
docker exec --privileged ollama rocm-smi --showmeminfo vram
# Cleanup
docker exec ollama ollama rm test-model
# Exit SSH, return to Hermes container
exit
# Save results in Hermes container
# /opt/data/ai-optimizer/state.json
# /opt/data/ai-optimizer/results.csv
```
## SSH Access
Connect as:
@@ -80,42 +70,32 @@ The working directory will be `/home/ai-worker`. No infra repo access.
## Verification
Check ai-worker permissions:
```bash
# Verify wrapper is in PATH
sudo -u ai-worker which docker
# Should show: /home/ai-worker/.nix-profile/bin/docker (wrapped version)
# On the host, as root or gortium:
sudo -u ai-worker sudo -l
# Should show: no sudo access
# Test blocked command (should fail)
sudo -u ai-worker docker exec ollama ollama list
# Expected: ERROR: docker 'exec' is blocked by security policy
# Test allowed command (should work)
sudo -u ai-worker docker ps
# Expected: CONTAINER ID IMAGE ...
# Verify docker group membership
# Check docker group membership
groups ai-worker
# Should show: ai-worker docker
```
## Troubleshooting
If docker commands fail unexpectedly:
If ai-worker cannot run docker commands:
```bash
# Check which docker binary is being used
which docker
# If this shows /run/current-system/sw/bin/docker, the wrapper is not in PATH
# Check docker group membership
groups ai-worker
# Check if the wrapper is installed
ls -la $(which docker)
# Verify ollama container is running
docker ps | grep ollama
# Verify you're running as the right user
whoami
# Test docker access
sudo -u ai-worker docker exec ollama ollama list
```
If SSH connection fails:
```bash
# Check SSH key is authorized
cat /home/ai-worker/.ssh/authorized_keys

View File

@@ -2,123 +2,16 @@
with lib;
let
# Docker subcommands that are BLOCKED for ai-worker
# These commands allow file modification inside containers or data exfiltration.
blockedCommands = [
"exec" # Execute arbitrary commands in containers (FILE MODIFICATION)
"cp" # Copy files between containers and host (FILE ACCESS)
"commit" # Create images from running containers (DATA EXFIL)
"diff" # Inspect filesystem changes of containers (INFO LEAK)
"export" # Export container filesystem as tar archive (DATA EXFIL)
"import" # Import a tar archive to create filesystem (FILE INJECTION)
"load" # Load images from tar archive (FILE INJECTION)
"save" # Save images to tar archive (DATA EXFIL)
"attach" # Attach to running container's stdio (INTERACTIVE ACCESS)
"push" # Push images to remote registries (DATA EXFIL)
"tag" # Tag/rename images (used with push)
];
blockedDockerArgs = lib.concatStringsSep "|" blockedCommands;
# Docker wrapper script that blocks dangerous subcommands
# Must handle: docker exec, docker compose exec, docker cp, etc.
restrictedDockerScript = pkgs.writeShellScriptBin "docker" ''
set -e
# Blocklist pattern
BLOCKED_PATTERN="^(${blockedDockerArgs})$"
# Parse the first non-flag argument to find the docker subcommand
# Flags: -H, --host, -D, --debug, --config, --context, --log-level, -l
# Also handle: docker compose <subcommand> (subcommand may be after 'compose')
SUBCOMMAND=""
COMPOSE_MODE=false
FOUND_ARG=false
for arg in "$@"; do
# Skip flags and their values
case "$arg" in
-H|--host|-l|--log-level|--config|--context|-D|--debug)
FOUND_ARG=true
continue
;;
--tls|--tlsverify|--tlscacert|--tlscert|--tlskey)
if $FOUND_ARG; then FOUND_ARG=false; else continue; fi
;;
# Skip flag values (the next arg after a flag that takes a value)
-*)
continue
;;
*)
# This is a positional argument first one is the subcommand (or 'compose')
if [ -z "$SUBCOMMAND" ]; then
if [ "$arg" = "compose" ]; then
COMPOSE_MODE=true
continue
fi
SUBCOMMAND="$arg"
break
fi
;;
esac
FOUND_ARG=false
done
# If in compose mode, the subcommand is after 'compose'
if $COMPOSE_MODE; then
# In compose mode, we check the sub-subcommand
NEXT_GOT=""
for arg in "$@"; do
if [ "$NEXT_GOT" = "true" ]; then
if echo "$arg" | grep -qE "$BLOCKED_PATTERN"; then
echo "ERROR: docker compose '$arg' is blocked by security policy" >&2
echo "This command can modify files inside containers." >&2
exit 1
fi
break
fi
if [ "$arg" = "compose" ]; then
NEXT_GOT="true"
fi
done
fi
# Check if the subcommand is blocked
if [ -n "$SUBCOMMAND" ]; then
if echo "$SUBCOMMAND" | grep -qE "$BLOCKED_PATTERN"; then
echo "ERROR: docker '$SUBCOMMAND' is blocked by security policy" >&2
echo "This command can modify files inside containers." >&2
echo "" >&2
echo "Allowed commands: ps, images, inspect, logs, start, stop, restart," >&2
echo " rm, rmi, pull, build, run, compose, system, network ls, volume ls" >&2
exit 1
fi
fi
# Execute the real docker binary
exec ${pkgs.docker}/bin/docker "$@"
'';
in
{
options.services.aiWorkerAccess = mkOption {
type = types.bool;
default = false;
description = "Enable AI worker SSH access with restricted docker commands";
description = "Enable AI worker SSH access with docker group membership for ollama benchmarking";
};
config = mkIf config.services.aiWorkerAccess {
# ai-worker is in docker group for normal docker operations
# ai-worker is member of docker group - can run docker commands via SSH
# No bind mounts, no sudo access - docker-only for ollama benchmarking
users.groups.docker.members = [ "ai-worker" ];
# Install the docker wrapper for ai-worker
# This puts a filtered 'docker' script in ai-worker's PATH that blocks
# dangerous commands like exec, cp, commit, etc.
# The real docker binary is still available at its store path, but the
# wrapper intercepts it because ~/.nix-profile/bin/ comes before /run/.../sw/bin/ in PATH.
users.users.ai-worker.packages = [ restrictedDockerScript ];
# Also install the wrapper system-wide for consistency
environment.systemPackages = [ restrictedDockerScript ];
};
}

View File

@@ -0,0 +1,363 @@
{ config, pkgs, lib, ... }:
with lib;
let
cfg = config.services.stagingVm;
# ── pr-test-vm helper script ──────────────────────────────────────────
pr-test-vm = pkgs.writeShellScriptBin "pr-test-vm" ''
set -euo pipefail
LIBVIRT_URI="qemu:///system"
VM_DIR="${cfg.dataPath}"
NETWORK="default"
SCRIPT_NAME="$(basename "$0")"
usage() {
cat <<EOF
Usage: $SCRIPT_NAME <command> [options]
Commands:
build <nixos-config> [--name <name>] Build VM image from a NixOS config
start <vm-name> Start a VM
stop <vm-name> Gracefully shut down a VM
destroy <vm-name> Force-power-off and undefine a VM
ssh [user@]<vm-name> SSH into a running VM
console <vm-name> Connect to VM serial console
list List all staging VMs
status <vm-name> Show VM status
Examples:
$SCRIPT_NAME build ./vm-config.nix --name my-test
$SCRIPT_NAME start my-test
$SCRIPT_NAME ssh root@my-test
EOF
exit 1
}
# Find the VM's IP address from the DHCP lease
vm_ip() {
local name="$1"
local mac
mac=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domiflist "$name" 2>/dev/null \
| ${pkgs.gawk}/bin/awk 'NR>2 && $1 ~ /^vnet/ {print $NF; exit}')
[ -z "$mac" ] && { echo "error: cannot find MAC for VM '$name'"; exit 1; }
local ip
ip=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" net-dhcp-leases "$NETWORK" 2>/dev/null \
| ${pkgs.gawk}/bin/awk -v mac="$mac" '$0 ~ mac {gsub(/-.*/, "", $3); print $3; exit}')
[ -z "$ip" ] && { echo "error: no DHCP lease found for VM '$name' (MAC: $mac)"; exit 1; }
echo "$ip"
}
case "''${1:-help}" in
build)
shift
CONFIG="''${1:?Missing NixOS config path}"
VM_NAME="''${2:-}"
[ -f "$CONFIG" ] || { echo "error: config file not found: $CONFIG"; exit 1; }
# Extract name from --name flag or config basename
if [ "''${2:-}" = "--name" ] && [ -n "''${3:-}" ]; then
VM_NAME="$3"
elif [ -z "$VM_NAME" ] || [ "''${VM_NAME#--}" != "$VM_NAME" ]; then
VM_NAME="$(basename "$CONFIG" .nix)"
fi
BUILD_DIR="$VM_DIR/$VM_NAME"
echo "==> Building VM '$VM_NAME' from config: $CONFIG"
mkdir -p "$BUILD_DIR"
# Build the NixOS VM derivation
nix build --no-link -f "$CONFIG" vm 2>&1 || {
echo "Trying flake build..."
nix build "''${CONFIG%/.nix}#nixosConfigurations.$VM_NAME.config.system.build.vm" --no-link 2>&1 || {
echo "error: failed to build VM (tried both import and flake)"
exit 1
}
}
echo "==> Build complete. Run 'pr-test-vm start $VM_NAME' to launch."
;;
start)
VM_NAME="''${1:?Missing VM name}"
IMAGE="$VM_DIR/$VM_NAME/disk-image.qcow2"
[ -f "$IMAGE" ] || { echo "error: no disk image found at $IMAGE. Build first."; exit 1; }
# Check if already running
STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "undefined")
if [ "$STATE" = "running" ]; then
echo "VM '$VM_NAME' is already running."
exit 0
fi
echo "==> Starting VM '$VM_NAME'..."
# Undefine if defined but not running
if [ "$STATE" != "undefined" ]; then
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" undefine "$VM_NAME" 2>/dev/null || true
fi
# Define and start with virt-install
${pkgs.virt-manager}/bin/virt-install \
--connect "$LIBVIRT_URI" \
--name "$VM_NAME" \
--memory "${toString cfg.memory}" \
--vcpus "${toString cfg.vcpus}" \
--disk "$IMAGE",bus=virtio \
--import \
--network network="$NETWORK",model=virtio \
--graphics none \
--console pty,target_type=virtio \
--serial pty \
--memballoon virtio \
--rng /dev/urandom \
--noautoconsole \
--os-variant detect=on,name=generic
echo "==> VM '$VM_NAME' started. Get IP with: pr-test-vm status $VM_NAME"
;;
stop)
VM_NAME="''${1:?Missing VM name}"
echo "==> Stoping VM '$VM_NAME'..."
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" shutdown "$VM_NAME" 2>/dev/null && {
echo "Waiting for VM to shut down..."
for i in $(seq 1 30); do
STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "undefined")
[ "$STATE" != "running" ] && { echo "VM stopped."; exit 0; }
sleep 2
done
echo "warning: VM did not shut down gracefully, use 'destroy' for force"
} || {
echo "VM '$VM_NAME' not running or does not exist."
}
;;
destroy)
VM_NAME="''${1:?Missing VM name}"
echo "==> Destroying VM '$VM_NAME'..."
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" destroy "$VM_NAME" 2>/dev/null || true
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" undefine "$VM_NAME" 2>/dev/null || true
echo "==> VM '$VM_NAME' destroyed and undefined."
;;
ssh)
TARGET="''${1:?Usage: $SCRIPT_NAME ssh [user@]<vm-name>}"
# Split user@hostname if present
if echo "$TARGET" | ${pkgs.gnugrep}/bin/grep -q '@'; then
USER="''${TARGET%@*}"
VM_NAME="''${TARGET#*@}"
else
VM_NAME="$TARGET"
USER=""
fi
IP=$(vm_ip "$VM_NAME") || exit 1
if [ -n "$USER" ]; then
exec ${pkgs.openssh}/bin/ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "''${USER}@''${IP}"
else
exec ${pkgs.openssh}/bin/ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$IP"
fi
;;
console)
VM_NAME="''${1:?Missing VM name}"
exec ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" console "$VM_NAME"
;;
list)
echo "Staging VMs:"
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" list --all
echo ""
echo "Active networks:"
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" net-list
echo ""
echo "Storage pools:"
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" pool-list
;;
status)
VM_NAME="''${1:?Missing VM name}"
echo "VM: $VM_NAME"
STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "not found")
echo "State: $STATE"
if [ "$STATE" = "running" ]; then
IP=$(vm_ip "$VM_NAME" 2>/dev/null || echo "N/A")
echo "IP: $IP"
${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" dommemstat "$VM_NAME" 2>/dev/null | head -3 || true
fi
;;
help|--help|-h)
usage
;;
*)
usage
;;
esac
'';
in
{
options.services.stagingVm = {
enable = mkOption {
type = types.bool;
default = false;
description = "Enable KVM/libvirt staging VM for compose PR testing";
};
vmName = mkOption {
type = types.str;
default = "compose-test-vm";
description = "Name of the staging VM";
};
memory = mkOption {
type = types.str;
default = "4096";
description = "RAM allocated to the staging VM (MB)";
};
vcpus = mkOption {
type = types.int;
default = 2;
description = "Number of vCPUs for the staging VM";
};
storagePath = mkOption {
type = types.str;
default = "/var/lib/libvirt/images";
description = "Path for libvirt storage pool";
};
dataPath = mkOption {
type = types.str;
default = "/var/lib/staging-vm";
description = "Path for compose test data (PR checkouts, test results)";
};
};
config = mkIf cfg.enable {
# ── libvirtd with QEMU/KVM ──────────────────────────────────────────
virtualisation.libvirtd = {
enable = true;
qemu = {
package = pkgs.qemu_kvm;
runAsRoot = true;
swtpm.enable = true;
ovmf = {
enable = true;
packages = [ pkgs.OVMF ];
};
};
};
# ── System packages ─────────────────────────────────────────────────
environment.systemPackages = with pkgs; [
libvirt # virsh, virt-admin
qemu_kvm # QEMU/KVM
swtpm # Software TPM
OVMF # UEFI firmware for VMs
virt-manager # GUI + virt-install
virt-viewer # SPICE/VNC viewer
libguestfs # virt-customize, guestfish
cdrtools # genisoimage for cloud-init ISOs
jq # JSON parsing
gawk # awk for DHCP lease parsing
gnugrep # grep
];
# ── User permissions ────────────────────────────────────────────────
users.users.gortium.extraGroups = [ "libvirtd" ];
# ── Directories ─────────────────────────────────────────────────────
systemd.tmpfiles.rules = [
"d ${cfg.storagePath} 0755 root root -"
"d ${cfg.dataPath} 0755 root root -"
];
# ── Default NAT network (192.168.122.0/24) ──────────────────────────
# Define the default libvirt NAT network using virsh postStart hook
systemd.services.libvirtd = {
postStart = ''
set -e
# Define the NAT network if it doesn't exist
${pkgs.libvirt}/bin/virsh -c qemu:///system net-info default 2>/dev/null && {
echo "Network 'default' already exists"
} || {
echo "Defining default NAT network (192.168.122.0/24)..."
${pkgs.libvirt}/bin/virsh -c qemu:///system net-define /etc/libvirt/qemu/networks/default.xml
}
${pkgs.libvirt}/bin/virsh -c qemu:///system net-autostart default 2>/dev/null || true
# Start the network if not active
STATE=$(${pkgs.libvirt}/bin/virsh -c qemu:///system net-state default 2>/dev/null || echo "inactive")
if [ "$STATE" != "active" ]; then
${pkgs.libvirt}/bin/virsh -c qemu:///system net-start default 2>/dev/null || true
fi
echo "Default network ready."
'';
};
# Define the default network as an XML config file
environment.etc."libvirt/qemu/networks/default.xml" = {
text = ''
<network>
<name>default</name>
<forward mode='nat'/>
<bridge name='virbr0' stp='on' delay='0'/>
<ip address='192.168.122.1' netmask='255.255.255.0'>
<dhcp>
<range start='192.168.122.2' end='192.168.122.254'/>
</dhcp>
</ip>
</network>
'';
mode = "0644";
};
# ── Storage pool ────────────────────────────────────────────────────
systemd.services.libvirtd.postStart = mkAfter ''
set -e
${pkgs.libvirt}/bin/virsh -c qemu:///system pool-info default 2>/dev/null && {
echo "Storage pool 'default' already exists"
} || {
echo "Defining storage pool at ${cfg.storagePath}..."
${pkgs.libvirt}/bin/virsh -c qemu:///system pool-define-as \
--name default --type dir --target "${cfg.storagePath}"
}
${pkgs.libvirt}/bin/virsh -c qemu:///system pool-autostart default 2>/dev/null || true
STATE=$(${pkgs.libvirt}/bin/virsh -c qemu:///system pool-state default 2>/dev/null || echo "inactive")
if [ "$STATE" != "running" ]; then
${pkgs.libvirt}/bin/virsh -c qemu:///system pool-build default 2>/dev/null || true
${pkgs.libvirt}/bin/virsh -c qemu:///system pool-start default 2>/dev/null || true
fi
echo "Storage pool ready."
'';
# ── Firewall rules for libvirt guests ───────────────────────────────
networking.firewall = {
trustedInterfaces = [ "virbr0" ];
extraCommands = mkAfter ''
# Allow DHCP (port 67/68) and DNS (port 53) to libvirt guests
iptables -I INPUT -i virbr0 -p udp --dport 67:68 -j ACCEPT 2>/dev/null || true
iptables -I INPUT -i virbr0 -p tcp --dport 53 -j ACCEPT 2>/dev/null || true
iptables -I INPUT -i virbr0 -p udp --dport 53 -j ACCEPT 2>/dev/null || true
# Allow forwarding between the bridge and the outside world
iptables -I FORWARD -i virbr0 -o virbr0 -j ACCEPT 2>/dev/null || true
iptables -I FORWARD -o virbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT 2>/dev/null || true
iptables -I FORWARD -i virbr0 -j ACCEPT 2>/dev/null || true
# NAT for guest outbound traffic
iptables -t nat -I POSTROUTING -s 192.168.122.0/24 -j MASQUERADE 2>/dev/null || true
'';
};
# ── pr-test-vm helper script ────────────────────────────────────────
environment.systemPackages = [ pr-test-vm ];
};
}

347
tests/run-integration.sh Executable file
View File

@@ -0,0 +1,347 @@
#!/usr/bin/env bash
# =============================================================================
# run-integration.sh — Staging VM Integration Test Suite
#
# Verifies Docker daemon, compose stack, and service endpoint health.
# Designed to run inside the staging VM as part of CI/CD pipeline.
#
# Usage:
# ./tests/run-integration.sh # all defaults
# ./tests/run-integration.sh --verbose # detailed output
# ./tests/run-integration.sh --list-services # print detected services and exit
#
# Environment variables (all optional):
# COMPOSE_DIR Path to compose service directories (default: /opt/infra/compose)
# COMPOSE_PROJECT Docker Compose project name (default: staging)
# STAGING_DOMAIN Base domain for health checks (default: staging.lazyworkhorse.net)
# SERVICE_LIST Space-separated service dirs to check (default: auto-detect)
# HEALTH_URLS Space-separated URLs for health checks (default: auto-detect from SERVICE_LIST)
# HEALTH_TIMEOUT Curl timeout per check (seconds) (default: 5)
# HEALTH_RETRIES Number of retries per endpoint (default: 1)
# HEALTH_INTERVAL Seconds between retries (default: 2)
# =============================================================================
set -euo pipefail
# ---- Colors for readable output ----
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color
# ---- Configuration (all env-overridable) ----
COMPOSE_DIR="${COMPOSE_DIR:-/opt/infra/compose}"
COMPOSE_PROJECT="${COMPOSE_PROJECT:-staging}"
STAGING_DOMAIN="${STAGING_DOMAIN:-staging.lazyworkhorse.net}"
HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-5}"
HEALTH_RETRIES="${HEALTH_RETRIES:-1}"
HEALTH_INTERVAL="${HEALTH_INTERVAL:-2}"
# Known compose service directories in order — override via SERVICE_LIST env var
DEFAULT_SERVICES=(
network
authentification
homepage
ai
cloudstorage
versioncontrol
backup
coms
finance
homeautomation
passwordmanager
)
# Map service directory -> default health check URL (relative to STAGING_DOMAIN)
# Override entirely via HEALTH_URLS env var.
declare -A DEFAULT_HEALTH_URLS
DEFAULT_HEALTH_URLS[network]="https://traefik.${STAGING_DOMAIN}/ping"
DEFAULT_HEALTH_URLS[authentification]="https://auth.${STAGING_DOMAIN}/api/verify"
DEFAULT_HEALTH_URLS[homepage]="https://${STAGING_DOMAIN}/"
DEFAULT_HEALTH_URLS[ai]="https://hermes.${STAGING_DOMAIN}/health"
DEFAULT_HEALTH_URLS[cloudstorage]="https://cloud.${STAGING_DOMAIN}/status.php"
DEFAULT_HEALTH_URLS[versioncontrol]="https://code.${STAGING_DOMAIN}/api/healthz"
# ---- Trackers ----
PASS_COUNT=0
FAIL_COUNT=0
WARN_COUNT=0
FAILURES=()
# ---- Helpers ----
log_info() { echo -e "${CYAN}[INFO]${NC} $*"; }
log_pass() { echo -e "${GREEN}[PASS]${NC} $*"; ((PASS_COUNT++)); }
log_fail() { echo -e "${RED}[FAIL]${NC} $*"; ((FAIL_COUNT++)); FAILURES+=("$*"); }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; ((WARN_COUNT++)); }
log_step() { echo -e "\n${BOLD}── $* ──${NC}"; }
log_raw() { echo -e " $*"; }
# Check if a command exists
require_cmd() {
if ! command -v "$1" &>/dev/null; then
log_fail "Required command not found: $1"
return 1
fi
}
# Retry a command with exponential-like backoff
retry() {
local cmd="$*"
local attempt=0
local max_attempts=$((HEALTH_RETRIES + 1))
local result
while [[ $attempt -lt $max_attempts ]]; do
if eval "$cmd" 2>/dev/null; then
return 0
fi
attempt=$((attempt + 1))
if [[ $attempt -lt $max_attempts ]]; then
sleep "$HEALTH_INTERVAL"
fi
done
return 1
}
# ---- Parse arguments ----
VERBOSE=false
LIST_SERVICES=false
POSITIONAL=()
while [[ $# -gt 0 ]]; do
case "$1" in
--verbose|-v) VERBOSE=true; shift ;;
--list-services) LIST_SERVICES=true; shift ;;
--) shift; POSITIONAL+=("$@"); break ;;
*) POSITIONAL+=("$1"); shift ;;
esac
done
set -- "${POSITIONAL[@]}"
# Resolve service list
if [[ -n "${SERVICE_LIST:-}" ]]; then
IFS=' ' read -ra SERVICES <<< "$SERVICE_LIST"
else
SERVICES=("${DEFAULT_SERVICES[@]}")
fi
# Resolve health URLs — default map with overrides from env
declare -A HEALTH_URLS
if [[ -n "${HEALTH_URLS:-}" ]]; then
# User-supplied mapping: "network=https://... authentification=https://..."
for pair in $HEALTH_URLS; do
key="${pair%%=*}"
val="${pair#*=}"
HEALTH_URLS["$key"]="$val"
done
else
for svc in "${SERVICES[@]}"; do
if [[ -n "${DEFAULT_HEALTH_URLS[$svc]:-}" ]]; then
HEALTH_URLS["$svc"]="${DEFAULT_HEALTH_URLS[$svc]}"
fi
done
fi
# --list-services mode (for CI integration)
if $LIST_SERVICES; then
echo "Configured services:"
for svc in "${SERVICES[@]}"; do
url="${HEALTH_URLS[$svc]:-no-health-check}"
echo " $svc -> $url"
done
exit 0
fi
# ---- Pre-flight ----
echo -e "${BOLD}============================================${NC}"
echo -e "${BOLD} Staging VM Integration Test Suite${NC}"
echo -e "${BOLD} $(date -u '+%Y-%m-%dT%H:%M:%SZ')${NC}"
echo -e "${BOLD}============================================${NC}"
# ---- Phase 1: Prerequisites ----
log_step "Phase 1: Prerequisites"
PREREQ_OK=true
for cmd in docker curl jq; do
if ! require_cmd "$cmd"; then
PREREQ_OK=false
fi
done
$PREREQ_OK && log_pass "All required commands available" || log_fail "Missing prerequisites"
# ---- Phase 2: Docker daemon ----
log_step "Phase 2: Docker Daemon"
if docker info --format '{{.ServerVersion}}' &>/dev/null; then
DOCKER_VERSION=$(docker info --format '{{.ServerVersion}}' 2>/dev/null)
log_pass "Docker daemon is running (version: $DOCKER_VERSION)"
if docker info --format '{{.Driver}}' 2>/dev/null | grep -qi "overlay"; then
log_pass "Storage driver: overlay"
else
log_warn "Non-overlay storage driver detected"
fi
else
log_fail "Docker daemon is NOT running or not accessible"
fi
# ---- Phase 3: Docker Compose stack ----
log_step "Phase 3: Compose Stack Status"
# Check if any compose files exist
COMPOSE_FILES=()
for svc in "${SERVICES[@]}"; do
cf="${COMPOSE_DIR}/${svc}/compose.yml"
if [[ -f "$cf" ]]; then
COMPOSE_FILES+=("$cf")
else
cf2="${COMPOSE_DIR}/${svc}/docker-compose.yml"
if [[ -f "$cf2" ]]; then
COMPOSE_FILES+=("$cf2")
else
log_warn "No compose file found for service '$svc' (expected: ${cf})"
fi
fi
done
if [[ ${#COMPOSE_FILES[@]} -eq 0 ]]; then
log_fail "No compose files found under COMPOSE_DIR=${COMPOSE_DIR}"
log_info "Skipping stack checks"
else
log_info "Found ${#COMPOSE_FILES[@]} compose file(s) in ${COMPOSE_DIR}"
# Build the compose file args
COMPOSE_CMD="docker compose -p ${COMPOSE_PROJECT}"
for cf in "${COMPOSE_FILES[@]}"; do
COMPOSE_CMD+=" -f ${cf}"
done
log_info "Project name: ${COMPOSE_PROJECT}"
# Check stack ps
if $VERBOSE; then
log_raw "--- docker compose ps output ---"
eval "$COMPOSE_CMD ps" 2>&1 | while IFS= read -r line; do log_raw "$line"; done
log_raw "--- end ---"
fi
# Get all services and their status
if STACK_STATUS=$(eval "$COMPOSE_CMD ps --format '{{.Name}}\t{{.Status}}'" 2>/dev/null); then
if [[ -z "$STACK_STATUS" ]]; then
log_warn "Stack exists but no running services — VM may be freshly provisioned"
else
ALL_RUNNING=true
RUNNING_COUNT=0
TOTAL_COUNT=0
while IFS=$'\t' read -r name status; do
TOTAL_COUNT=$((TOTAL_COUNT + 1))
status_lower=$(echo "$status" | tr '[:upper:]' '[:lower:]')
if echo "$status_lower" | grep -qE '^(up|running|healthy)'; then
RUNNING_COUNT=$((RUNNING_COUNT + 1))
$VERBOSE && log_pass " $name$status"
else
ALL_RUNNING=false
log_warn " $name$status (not healthy)"
fi
done <<< "$STACK_STATUS"
if [[ "$TOTAL_COUNT" -eq 0 ]]; then
log_fail "No services found in compose project"
elif $ALL_RUNNING && [[ "$TOTAL_COUNT" -eq "$RUNNING_COUNT" ]]; then
log_pass "All ${TOTAL_COUNT} service(s) running (${RUNNING_COUNT}/${TOTAL_COUNT})"
else
log_fail "${RUNNING_COUNT}/${TOTAL_COUNT} service(s) running — some services are down"
fi
fi
else
log_fail "Failed to query compose stack status"
fi
fi
# ---- Phase 4: Service health checks ----
log_step "Phase 4: Service Endpoint Health Checks"
ENDPOINT_CHECKS=0
ENDPOINT_PASS=0
for svc in "${SERVICES[@]}"; do
url="${HEALTH_URLS[$svc]:-}"
if [[ -z "$url" ]]; then
$VERBOSE && log_info "No health check URL for service '$svc' — skipping"
continue
fi
ENDPOINT_CHECKS=$((ENDPOINT_CHECKS + 1))
echo -ne " Checking ${svc} ... "
# Perform the HTTP health check with retries
if retry "curl -sf -o /dev/null -w '%{http_code}' --max-time ${HEALTH_TIMEOUT} '${url}' 2>/dev/null"; then
HTTP_CODE=$(curl -sf -o /dev/null -w '%{http_code}' --max-time "${HEALTH_TIMEOUT}" "${url}" 2>/dev/null || true)
ENDPOINT_PASS=$((ENDPOINT_PASS + 1))
echo -e "${GREEN}OK${NC} (HTTP ${HTTP_CODE})"
else
LAST_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time "${HEALTH_TIMEOUT}" "${url}" 2>/dev/null || echo "000")
echo -e "${RED}FAIL${NC} (HTTP ${LAST_CODE})"
log_fail "Health check failed for ${svc} @ ${url}"
fi
done
if [[ $ENDPOINT_CHECKS -eq 0 ]]; then
log_warn "No health check URLs configured — skipping endpoint phase"
elif [[ $ENDPOINT_PASS -eq $ENDPOINT_CHECKS ]]; then
log_pass "All ${ENDPOINT_CHECKS} endpoint(s) healthy"
else
log_fail "${ENDPOINT_PASS}/${ENDPOINT_CHECKS} endpoint(s) healthy"
fi
# ---- Phase 5: Docker system sanity ----
log_step "Phase 5: Docker System Sanity"
# Check disk space for Docker
DOCKER_ROOT=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker")
log_info "Docker root: ${DOCKER_ROOT}"
if command -v df &>/dev/null && [[ -d "$DOCKER_ROOT" ]]; then
AVAIL_PCT=$(df -h "$DOCKER_ROOT" | awk 'NR==2 {print $5}' | tr -d '%')
if [[ -n "$AVAIL_PCT" ]]; then
if [[ "$AVAIL_PCT" -ge 90 ]]; then
log_warn "Docker storage is ${AVAIL_PCT}% full — consider cleanup"
else
log_pass "Docker storage at ${AVAIL_PCT}% — within limits"
fi
fi
fi
# Check for dangling images
DANGLING=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l)
if [[ "$DANGLING" -gt 10 ]]; then
log_warn "${DANGLING} dangling images found — consider docker image prune"
fi
# ---- Summary ----
echo ""
echo -e "${BOLD}============================================${NC}"
echo -e "${BOLD} Test Summary${NC}"
echo -e "${BOLD} $(date -u '+%Y-%m-%dT%H:%M:%SZ')${NC}"
echo -e "${BOLD}============================================${NC}"
echo -e " ${GREEN}Passed:${NC} ${PASS_COUNT}"
echo -e " ${RED}Failed:${NC} ${FAIL_COUNT}"
echo -e " ${YELLOW}Warnings:${NC} ${WARN_COUNT}"
if [[ ${#FAILURES[@]} -gt 0 ]]; then
echo -e "\n${BOLD}Failed checks:${NC}"
for f in "${FAILURES[@]}"; do
echo -e " ${RED}${NC} $f"
done
fi
echo ""
if [[ $FAIL_COUNT -eq 0 ]]; then
echo -e "${GREEN}${BOLD}✓ All integration checks passed${NC}"
exit 0
else
echo -e "${RED}${BOLD}${FAIL_COUNT} integration check(s) failed${NC}"
exit 1
fi

View File

@@ -4,9 +4,7 @@
group = "ai-worker";
home = "/home/ai-worker";
createHome = true;
# ai-worker stays in docker group for normal docker operations (ps, start, stop, compose, ...)
# Dangerous commands (exec, cp, commit) are blocked by a wrapper script.
extraGroups = [ "docker" ];
extraGroups = [ "docker" "libvirtd" ];
shell = pkgs.bashInteractive;
openssh.authorizedKeys.keys = [
keys.users.ai-worker.main
@@ -16,14 +14,17 @@
};
users.groups.ai-worker = {};
# Enable restricted AI worker SSH access
# SECURITY: ai-worker is in docker group but docker commands are filtered:
# ALLOWED: ps, images, logs, start, stop, restart, rm, rmi, pull, build, run, compose
# BLOCKED: exec, cp, commit, diff, export, import, load, save, attach, push
# The filtering is done by a docker wrapper in ai-worker's PATH.
# Enable restricted AI worker SSH access for ollama benchmarking
# SECURITY: ai-worker can only:
# - SSH into host from Hermes container
# - Run docker commands (docker exec ollama ...) via docker group
# - Run specific security audit commands
# - NO access to infra repo (no bind mount)
# - NO sudo access (no nh, nixos-rebuild, nixpkgs-fmt, nix)
# WORKFLOW: SSH from Hermes container, run docker benchmarks, return and save results to /opt/data/ai-optimizer/
services.aiWorkerAccess = true;
# Restricted sudo for ai-worker - security checks only (not for docker)
# Restricted sudo for ai-worker - security checks only
security.sudo.extraRules = [
{
users = [ "ai-worker" ];
@@ -68,6 +69,15 @@
command = "/run/current-system/sw/bin/sshd -T";
options = [ "NOPASSWD" ];
}
# Docker service checks
{
command = "/run/current-system/sw/bin/docker ps";
options = [ "NOPASSWD" ];
}
{
command = "/run/current-system/sw/bin/docker inspect *";
options = [ "NOPASSWD" ];
}
# Network diagnostics
{
command = "/run/current-system/sw/bin/ss -tlnp";