diff --git a/.gitea/workflows/build-nixos.yml b/.gitea/workflows/build-nixos.yml
new file mode 100644
index 0000000..bf0658f
--- /dev/null
+++ b/.gitea/workflows/build-nixos.yml
@@ -0,0 +1,60 @@
+name: Build and test NixOS config
+on:
+  pull_request:
+    branches: [ master ]
+    paths:
+      - '**.nix'
+      - 'flake.lock'
+      - 'secrets/**'
+      - 'hosts/**'
+      - 'modules/**'
+  push:
+    branches: [ master ]
+    paths:
+      - '**.nix'
+      - 'flake.lock'
+      - 'secrets/**'
+      - 'hosts/**'
+      - 'modules/**'
+
+jobs:
+  build:
+    runs-on: nixos-builder
+    steps:
+      - name: Checkout
+        # The branch name and token reach the script through the environment
+        # instead of being expanded into the script text: a pull-request
+        # branch name is attacker-controlled and must not be parsed as shell.
+        env:
+          BRANCH: ${{ github.head_ref || github.ref_name }}
+          TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          git clone -b "$BRANCH" \
+            "https://gitea:${TOKEN}@code.lazyworkhorse.net/gortium/infra.git" .
+          # Drop the credential from .git/config so later steps cannot read it.
+          git remote set-url origin https://code.lazyworkhorse.net/gortium/infra.git
+          git log --oneline -3
+
+      - name: Build NixOS config
+        run: |
+          nix --version
+          nh os build .#lazyworkhorse 2>&1
+
+      - name: Run integration tests (staging VM)
+        run: |
+          echo "==> Running integration tests on staging VM..."
+          echo ""
+          echo " To execute inside the VM:"
+          echo " pr-test-vm build # Build the NixOS VM image"
+          echo " pr-test-vm start # Boot the VM (SSH on localhost:2223)"
+          echo " pr-test-vm ssh bash -s < tests/run-integration.sh"
+          echo " pr-test-vm destroy # Clean up"
+          echo ""
+          echo " Or with environment overrides:"
+          echo " COMPOSE_DIR=/opt/staging/compose \\"
+          echo " pr-test-vm ssh bash -s < tests/run-integration.sh"
+          echo ""
+          echo " List configured services and URLs:"
+          echo " pr-test-vm ssh bash -s < tests/run-integration.sh -- --list-services"
+          echo ""
+          echo "==> VM integration step ready when libvirt runner is available."
diff --git a/assets/compose b/assets/compose index 6b82a26..f9fb28d 160000 --- a/assets/compose +++ b/assets/compose @@ -1 +1 @@ -Subproject commit 6b82a26c25f1592a2d1c9bea4f941864362fe001 +Subproject commit f9fb28d56078e7503516ac69307e862f3929c92b diff --git a/flake.nix b/flake.nix index 8f8b51a..6276626 100644 --- a/flake.nix +++ b/flake.nix @@ -61,6 +61,7 @@ ./modules/nixos/services/open_code_server.nix ./modules/nixos/services/ollama_init_custom_models.nix ./modules/nixos/services/openclaw_node.nix + ./modules/nixos/services/staging-vm.nix ./modules/nixos/security/ai-worker-restricted.nix ./users/gortium.nix ./users/ai-worker.nix diff --git a/hosts/lazyworkhorse/configuration.nix b/hosts/lazyworkhorse/configuration.nix index 83b8db1..8419b2e 100644 --- a/hosts/lazyworkhorse/configuration.nix +++ b/hosts/lazyworkhorse/configuration.nix @@ -36,7 +36,7 @@ "transparent_hugepage=always" # because mucho ram ]; # 2. Load the specific drivers found by sensors-detect - boot.kernelModules = [ "nct6775" "lm96163" "iptable_nat" "iptable_filter" ]; + boot.kernelModules = [ "nct6775" "lm96163" "iptable_nat" "iptable_filter" "kvm-intel" "kvm" ]; # 3. Force the nct6775 driver to recognize the chip if it's stubborn boot.extraModprobeConfig = '' options nct6775 force_id=0xd280 @@ -328,20 +328,21 @@ # Mi50 config hardware.graphics = { enable = true; - enable32Bit = true; # Useful for some compatibility layers + enable32Bit = true; extraPackages = with pkgs; [ - rocmPackages.clr.icd # OpenCL/HIP runtime + rocmPackages.clr.icd ]; }; nixpkgs.config.rocmTargets = [ "gfx906" ]; environment.variables = { - # This "tricks" ROCm into supporting the MI50 if using newer versions HSA_OVERRIDE_GFX_VERSION = "9.0.6"; - # Ensures the system sees both GPUs HIP_VISIBLE_DEVICES = "0,1"; }; - # Open ports in the firewall. + # KVM/libvirt for staging VM + services.stagingVm.enable = true; + + # Open ports in the firewall. # networking.firewall.allowedTCPPorts = [ ... 
];
   # networking.firewall.allowedUDPPorts = [ ... ];
   # Or disable the firewall altogether.
diff --git a/modules/nixos/services/staging-vm.nix b/modules/nixos/services/staging-vm.nix
new file mode 100644
index 0000000..e1c1b1d
--- /dev/null
+++ b/modules/nixos/services/staging-vm.nix
@@ -0,0 +1,363 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.services.stagingVm;
+
+  # ── pr-test-vm helper script ──────────────────────────────────────────
+  # Thin wrapper around virsh/virt-install for the staging VMs kept under
+  # cfg.dataPath. The first argument selects the sub-command; it is shifted
+  # away once, up front, so every branch sees its own arguments from $1.
+  pr-test-vm = pkgs.writeShellScriptBin "pr-test-vm" ''
+    set -euo pipefail
+
+    LIBVIRT_URI="qemu:///system"
+    VM_DIR="${cfg.dataPath}"
+    NETWORK="default"
+    SCRIPT_NAME="$(basename "$0")"
+
+    # Print the command overview and exit with status 1.
+    usage() {
+      cat <<EOF
+    Usage: $SCRIPT_NAME <command> [options]
+
+    Commands:
+      build <config> [--name <name>]  Build VM image from a NixOS config
+      start <name>                    Start a VM
+      stop <name>                     Gracefully shut down a VM
+      destroy <name>                  Force-power-off and undefine a VM
+      ssh [user@]<name>               SSH into a running VM
+      console <name>                  Connect to VM serial console
+      list                            List all staging VMs
+      status <name>                   Show VM status
+
+    Examples:
+      $SCRIPT_NAME build ./vm-config.nix --name my-test
+      $SCRIPT_NAME start my-test
+      $SCRIPT_NAME ssh root@my-test
+    EOF
+      exit 1
+    }
+
+    # Find the VM's IP address from the DHCP lease.
+    # Arguments: $1 - libvirt domain name
+    # Outputs:   the address on stdout; diagnostics on stderr, so callers that
+    #            capture the output never mistake an error text for an address
+    # Exits 1 when the MAC or the lease cannot be found.
+    vm_ip() {
+      local name="$1"
+      local mac
+      mac=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domiflist "$name" 2>/dev/null \
+        | ${pkgs.gawk}/bin/awk 'NR>2 && $1 ~ /^vnet/ {print $NF; exit}')
+      [ -z "$mac" ] && { echo "error: cannot find MAC for VM '$name'" >&2; exit 1; }
+
+      local ip
+      # Lease rows read: date, time, MAC, protocol, IP/prefix, hostname, id.
+      # The address is therefore field 5, with the "/prefix" part cut off.
+      ip=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" net-dhcp-leases "$NETWORK" 2>/dev/null \
+        | ${pkgs.gawk}/bin/awk -v mac="$mac" 'tolower($3) == tolower(mac) {sub(/\/.*/, "", $5); print $5; exit}')
+      [ -z "$ip" ] && { echo "error: no DHCP lease found for VM '$name' (MAC: $mac)" >&2; exit 1; }
+      echo "$ip"
+    }
+
+    # Consume the sub-command here so that "$1" is the first real argument in
+    # every branch below, not only in "build".
+    COMMAND="''${1:-help}"
+    if [ "$#" -gt 0 ]; then shift; fi
+
+    case "$COMMAND" in
+      build)
+        CONFIG="''${1:?Missing NixOS config path}"
+        VM_NAME="''${2:-}"
+        [ -f "$CONFIG" ] || { echo "error: config file not found: $CONFIG"; exit 1; }
+
+        # Extract name from --name flag or config basename
+        if [
"''${2:-}" = "--name" ] && [ -n "''${3:-}" ]; then + VM_NAME="$3" + elif [ -z "$VM_NAME" ] || [ "''${VM_NAME#--}" != "$VM_NAME" ]; then + VM_NAME="$(basename "$CONFIG" .nix)" + fi + + BUILD_DIR="$VM_DIR/$VM_NAME" + echo "==> Building VM '$VM_NAME' from config: $CONFIG" + mkdir -p "$BUILD_DIR" + + # Build the NixOS VM derivation + nix build --no-link -f "$CONFIG" vm 2>&1 || { + echo "Trying flake build..." + nix build "''${CONFIG%/.nix}#nixosConfigurations.$VM_NAME.config.system.build.vm" --no-link 2>&1 || { + echo "error: failed to build VM (tried both import and flake)" + exit 1 + } + } + + echo "==> Build complete. Run 'pr-test-vm start $VM_NAME' to launch." + ;; + + start) + VM_NAME="''${1:?Missing VM name}" + IMAGE="$VM_DIR/$VM_NAME/disk-image.qcow2" + [ -f "$IMAGE" ] || { echo "error: no disk image found at $IMAGE. Build first."; exit 1; } + + # Check if already running + STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "undefined") + if [ "$STATE" = "running" ]; then + echo "VM '$VM_NAME' is already running." + exit 0 + fi + + echo "==> Starting VM '$VM_NAME'..." + + # Undefine if defined but not running + if [ "$STATE" != "undefined" ]; then + ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" undefine "$VM_NAME" 2>/dev/null || true + fi + + # Define and start with virt-install + ${pkgs.virt-manager}/bin/virt-install \ + --connect "$LIBVIRT_URI" \ + --name "$VM_NAME" \ + --memory "${toString cfg.memory}" \ + --vcpus "${toString cfg.vcpus}" \ + --disk "$IMAGE",bus=virtio \ + --import \ + --network network="$NETWORK",model=virtio \ + --graphics none \ + --console pty,target_type=virtio \ + --serial pty \ + --memballoon virtio \ + --rng /dev/urandom \ + --noautoconsole \ + --os-variant detect=on,name=generic + + echo "==> VM '$VM_NAME' started. Get IP with: pr-test-vm status $VM_NAME" + ;; + + stop) + VM_NAME="''${1:?Missing VM name}" + echo "==> Stoping VM '$VM_NAME'..." 
+ ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" shutdown "$VM_NAME" 2>/dev/null && { + echo "Waiting for VM to shut down..." + for i in $(seq 1 30); do + STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "undefined") + [ "$STATE" != "running" ] && { echo "VM stopped."; exit 0; } + sleep 2 + done + echo "warning: VM did not shut down gracefully, use 'destroy' for force" + } || { + echo "VM '$VM_NAME' not running or does not exist." + } + ;; + + destroy) + VM_NAME="''${1:?Missing VM name}" + echo "==> Destroying VM '$VM_NAME'..." + ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" destroy "$VM_NAME" 2>/dev/null || true + ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" undefine "$VM_NAME" 2>/dev/null || true + echo "==> VM '$VM_NAME' destroyed and undefined." + ;; + + ssh) + TARGET="''${1:?Usage: $SCRIPT_NAME ssh [user@]}" + # Split user@hostname if present + if echo "$TARGET" | ${pkgs.gnugrep}/bin/grep -q '@'; then + USER="''${TARGET%@*}" + VM_NAME="''${TARGET#*@}" + else + VM_NAME="$TARGET" + USER="" + fi + + IP=$(vm_ip "$VM_NAME") || exit 1 + if [ -n "$USER" ]; then + exec ${pkgs.openssh}/bin/ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "''${USER}@''${IP}" + else + exec ${pkgs.openssh}/bin/ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$IP" + fi + ;; + + console) + VM_NAME="''${1:?Missing VM name}" + exec ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" console "$VM_NAME" + ;; + + list) + echo "Staging VMs:" + ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" list --all + echo "" + echo "Active networks:" + ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" net-list + echo "" + echo "Storage pools:" + ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" pool-list + ;; + + status) + VM_NAME="''${1:?Missing VM name}" + echo "VM: $VM_NAME" + STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "not found") + echo "State: $STATE" + if [ "$STATE" = "running" ]; then + IP=$(vm_ip "$VM_NAME" 
2>/dev/null || echo "N/A")
+          echo "IP: $IP"
+          ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" dommemstat "$VM_NAME" 2>/dev/null | head -3 || true
+        fi
+        ;;
+
+      help|--help|-h)
+        usage
+        ;;
+
+      *)
+        usage
+        ;;
+    esac
+  '';
+in
+{
+  # Options of the staging-VM host module. Everything under `config` below is
+  # applied only when `enable` is true.
+  options.services.stagingVm = {
+    enable = mkOption {
+      type = types.bool;
+      default = false;
+      description = "Enable KVM/libvirt staging VM for compose PR testing";
+    };
+
+    vmName = mkOption {
+      type = types.str;
+      default = "compose-test-vm";
+      description = "Name of the staging VM";
+    };
+
+    memory = mkOption {
+      type = types.str;
+      default = "4096";
+      description = "RAM allocated to the staging VM (MB)";
+    };
+
+    vcpus = mkOption {
+      type = types.int;
+      default = 2;
+      description = "Number of vCPUs for the staging VM";
+    };
+
+    storagePath = mkOption {
+      type = types.str;
+      default = "/var/lib/libvirt/images";
+      description = "Path for libvirt storage pool";
+    };
+
+    dataPath = mkOption {
+      type = types.str;
+      default = "/var/lib/staging-vm";
+      description = "Path for compose test data (PR checkouts, test results)";
+    };
+  };
+
+  config = mkIf cfg.enable {
+    # ── libvirtd with QEMU/KVM ──────────────────────────────────────────
+    virtualisation.libvirtd = {
+      enable = true;
+      qemu = {
+        package = pkgs.qemu_kvm;
+        runAsRoot = true;
+        swtpm.enable = true;
+        ovmf = {
+          enable = true;
+          packages = [ pkgs.OVMF ];
+        };
+      };
+    };
+
+    # ── System packages ─────────────────────────────────────────────────
+    # An attribute may be defined only once per attribute set, so the
+    # pr-test-vm helper is appended here rather than assigned separately.
+    environment.systemPackages = (with pkgs; [
+      libvirt # virsh, virt-admin
+      qemu_kvm # QEMU/KVM
+      swtpm # Software TPM
+      OVMF # UEFI firmware for VMs
+      virt-manager # GUI + virt-install
+      virt-viewer # SPICE/VNC viewer
+      libguestfs # virt-customize, guestfish
+      cdrtools # genisoimage for cloud-init ISOs
+      jq # JSON parsing
+      gawk # awk for DHCP lease parsing
+      gnugrep # grep
+    ]) ++ [ pr-test-vm ];
+
+    # ── User permissions ────────────────────────────────────────────────
+    users.users.gortium.extraGroups = [ "libvirtd" ];
+
+    # ── Directories ─────────────────────────────────────────────────────
+    systemd.tmpfiles.rules = [
+      "d ${cfg.storagePath} 0755 root root -"
+      "d ${cfg.dataPath} 0755 root root -"
+    ];
+
+    # ── Default NAT network (192.168.122.0/24) ──────────────────────────
+    # Define the default network as an XML config file. It is only the input
+    # for `virsh net-define` in the postStart hook below.
+    # NOTE(review): the element list was reconstructed from the subnet and
+    # bridge used elsewhere in this module; confirm the DHCP range.
+    environment.etc."libvirt/qemu/networks/default.xml" = {
+      text = ''
+        <network>
+          <name>default</name>
+          <forward mode="nat"/>
+          <bridge name="virbr0" stp="on" delay="0"/>
+          <ip address="192.168.122.1" netmask="255.255.255.0">
+            <dhcp>
+              <range start="192.168.122.2" end="192.168.122.254"/>
+            </dhcp>
+          </ip>
+        </network>
+      '';
+      mode = "0644";
+    };
+
+    # ── libvirtd postStart: network + storage pool ──────────────────────
+    # One hook, defined once, brings up both the NAT network and the storage
+    # pool. Definition failures abort the hook (set -e); autostart/start are
+    # best-effort because they also fail harmlessly when already in effect.
+    systemd.services.libvirtd.postStart = mkAfter ''
+      set -e
+      virsh() { ${pkgs.libvirt}/bin/virsh -c qemu:///system "$@"; }
+
+      # Define the NAT network if it doesn't exist
+      if virsh net-info default 2>/dev/null; then
+        echo "Network 'default' already exists"
+      else
+        echo "Defining default NAT network (192.168.122.0/24)..."
+        virsh net-define /etc/libvirt/qemu/networks/default.xml
+      fi
+      virsh net-autostart default 2>/dev/null || true
+      # Start the network if not active. virsh has no "net-state" command;
+      # the state is the "Active:" field of net-info.
+      if [ "$(virsh net-info default 2>/dev/null | ${pkgs.gawk}/bin/awk '/^Active:/ {print $2}')" != "yes" ]; then
+        virsh net-start default 2>/dev/null || true
+      fi
+      echo "Default network ready."
+
+      # Define the storage pool if it doesn't exist
+      if virsh pool-info default 2>/dev/null; then
+        echo "Storage pool 'default' already exists"
+      else
+        echo "Defining storage pool at ${cfg.storagePath}..."
+        virsh pool-define-as \
+          --name default --type dir --target "${cfg.storagePath}"
+      fi
+      virsh pool-autostart default 2>/dev/null || true
+      # Likewise there is no "pool-state"; read the "State:" field of pool-info.
+      if [ "$(virsh pool-info default 2>/dev/null | ${pkgs.gawk}/bin/awk '/^State:/ {print $2}')" != "running" ]; then
+        virsh pool-build default 2>/dev/null || true
+        virsh pool-start default 2>/dev/null || true
+      fi
+      echo "Storage pool ready."
+    '';
+
+    # ── Firewall rules for libvirt guests ───────────────────────────────
+    networking.firewall = {
+      trustedInterfaces = [ "virbr0" ];
+
+      extraCommands = mkAfter ''
+        # Allow DHCP (port 67/68) and DNS (port 53) to libvirt guests
+        iptables -I INPUT -i virbr0 -p udp --dport 67:68 -j ACCEPT 2>/dev/null || true
+        iptables -I INPUT -i virbr0 -p tcp --dport 53 -j ACCEPT 2>/dev/null || true
+        iptables -I INPUT -i virbr0 -p udp --dport 53 -j ACCEPT 2>/dev/null || true
+
+        # Allow forwarding between the bridge and the outside world
+        iptables -I FORWARD -i virbr0 -o virbr0 -j ACCEPT 2>/dev/null || true
+        iptables -I FORWARD -o virbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT 2>/dev/null || true
+        iptables -I FORWARD -i virbr0 -j ACCEPT 2>/dev/null || true
+
+        # NAT for guest outbound traffic
+        iptables -t nat -I POSTROUTING -s 192.168.122.0/24 -j MASQUERADE 2>/dev/null || true
+      '';
+    };
+  };
+}
diff --git a/tests/run-integration.sh b/tests/run-integration.sh
new file mode 100755
index 0000000..523f1c0
--- /dev/null
+++ b/tests/run-integration.sh
@@ -0,0 +1,347 @@
+#!/usr/bin/env bash
+# =============================================================================
+# run-integration.sh — Staging VM Integration Test Suite
+#
+# Verifies Docker daemon, compose stack, and service
endpoint health. +# Designed to run inside the staging VM as part of CI/CD pipeline. +# +# Usage: +# ./tests/run-integration.sh # all defaults +# ./tests/run-integration.sh --verbose # detailed output +# ./tests/run-integration.sh --list-services # print detected services and exit +# +# Environment variables (all optional): +# COMPOSE_DIR Path to compose service directories (default: /opt/infra/compose) +# COMPOSE_PROJECT Docker Compose project name (default: staging) +# STAGING_DOMAIN Base domain for health checks (default: staging.lazyworkhorse.net) +# SERVICE_LIST Space-separated service dirs to check (default: auto-detect) +# HEALTH_URLS Space-separated URLs for health checks (default: auto-detect from SERVICE_LIST) +# HEALTH_TIMEOUT Curl timeout per check (seconds) (default: 5) +# HEALTH_RETRIES Number of retries per endpoint (default: 1) +# HEALTH_INTERVAL Seconds between retries (default: 2) +# ============================================================================= + +set -euo pipefail + +# ---- Colors for readable output ---- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +# ---- Configuration (all env-overridable) ---- +COMPOSE_DIR="${COMPOSE_DIR:-/opt/infra/compose}" +COMPOSE_PROJECT="${COMPOSE_PROJECT:-staging}" +STAGING_DOMAIN="${STAGING_DOMAIN:-staging.lazyworkhorse.net}" +HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-5}" +HEALTH_RETRIES="${HEALTH_RETRIES:-1}" +HEALTH_INTERVAL="${HEALTH_INTERVAL:-2}" + +# Known compose service directories in order — override via SERVICE_LIST env var +DEFAULT_SERVICES=( + network + authentification + homepage + ai + cloudstorage + versioncontrol + backup + coms + finance + homeautomation + passwordmanager +) + +# Map service directory -> default health check URL (relative to STAGING_DOMAIN) +# Override entirely via HEALTH_URLS env var. 
+declare -A DEFAULT_HEALTH_URLS
+DEFAULT_HEALTH_URLS[network]="https://traefik.${STAGING_DOMAIN}/ping"
+DEFAULT_HEALTH_URLS[authentification]="https://auth.${STAGING_DOMAIN}/api/verify"
+DEFAULT_HEALTH_URLS[homepage]="https://${STAGING_DOMAIN}/"
+DEFAULT_HEALTH_URLS[ai]="https://hermes.${STAGING_DOMAIN}/health"
+DEFAULT_HEALTH_URLS[cloudstorage]="https://cloud.${STAGING_DOMAIN}/status.php"
+DEFAULT_HEALTH_URLS[versioncontrol]="https://code.${STAGING_DOMAIN}/api/healthz"
+
+# ---- Trackers ----
+PASS_COUNT=0
+FAIL_COUNT=0
+WARN_COUNT=0
+FAILURES=()
+
+# ---- Helpers ----
+# Counters use plain assignment: ((n++)) returns status 1 when n is 0, which aborts under `set -e` and flips `&& ||` chains.
+log_info() { echo -e "${CYAN}[INFO]${NC} $*"; }
+log_pass() { echo -e "${GREEN}[PASS]${NC} $*"; PASS_COUNT=$((PASS_COUNT + 1)); }
+log_fail() { echo -e "${RED}[FAIL]${NC} $*"; FAIL_COUNT=$((FAIL_COUNT + 1)); FAILURES+=("$*"); }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; WARN_COUNT=$((WARN_COUNT + 1)); }
+log_step() { echo -e "\n${BOLD}── $* ──${NC}"; }
+log_raw() { echo -e " $*"; }
+
+# Record a failure and return 1 when command "$1" is not on PATH; return 0 otherwise.
+require_cmd() {
+  if ! command -v "$1" &>/dev/null; then
+    log_fail "Required command not found: $1"
+    return 1
+  fi
+}
+
+# Run the command string "$*" up to HEALTH_RETRIES+1 times, sleeping HEALTH_INTERVAL seconds (fixed) between attempts; 0 on first success, 1 otherwise.
+retry() {
+  local cmd="$*"
+  local attempt=0
+  local max_attempts=$((HEALTH_RETRIES + 1))
+  # eval is deliberate: callers hand over one pre-quoted command string.
+
+  while [[ $attempt -lt $max_attempts ]]; do
+    if eval "$cmd" 2>/dev/null; then
+      return 0
+    fi
+    attempt=$((attempt + 1))
+    if [[ $attempt -lt $max_attempts ]]; then
+      sleep "$HEALTH_INTERVAL"
+    fi
+  done
+  return 1
+}
+
+# ---- Parse arguments ----
+VERBOSE=false
+LIST_SERVICES=false
+POSITIONAL=()
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --verbose|-v) VERBOSE=true; shift ;;
+    --list-services) LIST_SERVICES=true; shift ;;
+    --) shift; POSITIONAL+=("$@"); break ;;
+    *) POSITIONAL+=("$1"); shift ;;
+  esac
+done
+set -- ${POSITIONAL[@]+"${POSITIONAL[@]}"} # the +-guard keeps an empty array from tripping `set -u` on bash < 4.4
+
+# Resolve service list
+if [[ -n "${SERVICE_LIST:-}" ]]; then
+  IFS=' ' read -ra SERVICES <<< "$SERVICE_LIST"
+else
+  SERVICES=("${DEFAULT_SERVICES[@]}")
+fi
+
+# Resolve health URLs — default map with
overrides from env +declare -A HEALTH_URLS +if [[ -n "${HEALTH_URLS:-}" ]]; then + # User-supplied mapping: "network=https://... authentification=https://..." + for pair in $HEALTH_URLS; do + key="${pair%%=*}" + val="${pair#*=}" + HEALTH_URLS["$key"]="$val" + done +else + for svc in "${SERVICES[@]}"; do + if [[ -n "${DEFAULT_HEALTH_URLS[$svc]:-}" ]]; then + HEALTH_URLS["$svc"]="${DEFAULT_HEALTH_URLS[$svc]}" + fi + done +fi + +# --list-services mode (for CI integration) +if $LIST_SERVICES; then + echo "Configured services:" + for svc in "${SERVICES[@]}"; do + url="${HEALTH_URLS[$svc]:-no-health-check}" + echo " $svc -> $url" + done + exit 0 +fi + +# ---- Pre-flight ---- +echo -e "${BOLD}============================================${NC}" +echo -e "${BOLD} Staging VM Integration Test Suite${NC}" +echo -e "${BOLD} $(date -u '+%Y-%m-%dT%H:%M:%SZ')${NC}" +echo -e "${BOLD}============================================${NC}" + +# ---- Phase 1: Prerequisites ---- +log_step "Phase 1: Prerequisites" + +PREREQ_OK=true +for cmd in docker curl jq; do + if ! 
require_cmd "$cmd"; then
+    PREREQ_OK=false
+  fi
+done
+if $PREREQ_OK; then log_pass "All required commands available" || true; else log_fail "Missing prerequisites" || true; fi # a logger's exit status must neither select the other branch nor abort the run
+
+# ---- Phase 2: Docker daemon ----
+log_step "Phase 2: Docker Daemon"
+
+if docker info --format '{{.ServerVersion}}' &>/dev/null; then
+  DOCKER_VERSION=$(docker info --format '{{.ServerVersion}}' 2>/dev/null)
+  log_pass "Docker daemon is running (version: $DOCKER_VERSION)"
+
+  if docker info --format '{{.Driver}}' 2>/dev/null | grep -qi "overlay"; then
+    log_pass "Storage driver: overlay"
+  else
+    log_warn "Non-overlay storage driver detected"
+  fi
+else
+  log_fail "Docker daemon is NOT running or not accessible"
+fi
+
+# ---- Phase 3: Docker Compose stack ----
+log_step "Phase 3: Compose Stack Status"
+
+# Collect one compose file per service: compose.yml first, docker-compose.yml as fallback
+COMPOSE_FILES=()
+for svc in "${SERVICES[@]}"; do
+  cf="${COMPOSE_DIR}/${svc}/compose.yml"
+  if [[ -f "$cf" ]]; then
+    COMPOSE_FILES+=("$cf")
+  else
+    cf2="${COMPOSE_DIR}/${svc}/docker-compose.yml"
+    if [[ -f "$cf2" ]]; then
+      COMPOSE_FILES+=("$cf2")
+    else
+      log_warn "No compose file found for service '$svc' (expected: ${cf})"
+    fi
+  fi
+done
+
+if [[ ${#COMPOSE_FILES[@]} -eq 0 ]]; then
+  log_fail "No compose files found under COMPOSE_DIR=${COMPOSE_DIR}"
+  log_info "Skipping stack checks"
+else
+  log_info "Found ${#COMPOSE_FILES[@]} compose file(s) in ${COMPOSE_DIR}"
+
+  # Build the compose command as an array: paths with spaces stay intact and no eval is needed
+  COMPOSE_CMD=(docker compose -p "${COMPOSE_PROJECT}")
+  for cf in "${COMPOSE_FILES[@]}"; do
+    COMPOSE_CMD+=(-f "${cf}")
+  done
+
+  log_info "Project name: ${COMPOSE_PROJECT}"
+
+  # Verbose only: dump `ps` for the reader; a failure here is reported by the status query below
+  if $VERBOSE; then
+    log_raw "--- docker compose ps output ---"
+    "${COMPOSE_CMD[@]}" ps 2>&1 | while IFS= read -r line; do log_raw "$line"; done || true
+    log_raw "--- end ---"
+  fi
+
+  # Get all services and their status
+  if STACK_STATUS=$("${COMPOSE_CMD[@]}" ps --format '{{.Name}}\t{{.Status}}' 2>/dev/null); then
+    if [[ -z "$STACK_STATUS" ]]; then
+      log_warn "Stack exists but no running
services — VM may be freshly provisioned"
+    else
+      ALL_RUNNING=true
+      RUNNING_COUNT=0
+      TOTAL_COUNT=0
+      while IFS=$'\t' read -r name status; do
+        TOTAL_COUNT=$((TOTAL_COUNT + 1))
+        status_lower=${status,,}
+        if [[ "$status_lower" =~ ^(up|running|healthy) && "$status_lower" != *unhealthy* ]]; then # "Up 5 minutes (unhealthy)" starts with "up" but is not healthy
+          RUNNING_COUNT=$((RUNNING_COUNT + 1))
+          $VERBOSE && log_pass " $name — $status"
+        else
+          ALL_RUNNING=false
+          log_warn " $name — $status (not healthy)"
+        fi
+      done <<< "$STACK_STATUS"
+
+      if [[ "$TOTAL_COUNT" -eq 0 ]]; then
+        log_fail "No services found in compose project"
+      elif $ALL_RUNNING && [[ "$TOTAL_COUNT" -eq "$RUNNING_COUNT" ]]; then
+        log_pass "All ${TOTAL_COUNT} service(s) running (${RUNNING_COUNT}/${TOTAL_COUNT})"
+      else
+        log_fail "${RUNNING_COUNT}/${TOTAL_COUNT} service(s) running — some services are down"
+      fi
+    fi
+  else
+    log_fail "Failed to query compose stack status"
+  fi
+fi
+
+# ---- Phase 4: Service health checks ----
+log_step "Phase 4: Service Endpoint Health Checks"
+
+ENDPOINT_CHECKS=0
+ENDPOINT_PASS=0
+
+for svc in "${SERVICES[@]}"; do
+  url="${HEALTH_URLS[$svc]:-}"
+  if [[ -z "$url" ]]; then
+    $VERBOSE && log_info "No health check URL for service '$svc' — skipping"
+    continue
+  fi
+
+  ENDPOINT_CHECKS=$((ENDPOINT_CHECKS + 1))
+  echo -ne " Checking ${svc} ... "
+
+  # Probe with retries. No -w here, so no status code leaks onto the "Checking" line; %q keeps timeout and URL single literal words through retry's eval.
+  if retry "curl -sf -o /dev/null --max-time $(printf '%q' "${HEALTH_TIMEOUT}") $(printf '%q' "${url}")"; then
+    HTTP_CODE=$(curl -sf -o /dev/null -w '%{http_code}' --max-time "${HEALTH_TIMEOUT}" "${url}" 2>/dev/null || true)
+    ENDPOINT_PASS=$((ENDPOINT_PASS + 1))
+    echo -e "${GREEN}OK${NC} (HTTP ${HTTP_CODE})"
+  else
+    LAST_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time "${HEALTH_TIMEOUT}" "${url}" 2>/dev/null || echo "000")
+    echo -e "${RED}FAIL${NC} (HTTP ${LAST_CODE})"
+    log_fail "Health check failed for ${svc} @ ${url}"
+  fi
+done
+
+if [[ $ENDPOINT_CHECKS -eq 0 ]]; then
+  log_warn "No health check URLs configured — skipping endpoint phase"
+elif [[ $ENDPOINT_PASS -eq $ENDPOINT_CHECKS ]]; then
+  log_pass "All ${ENDPOINT_CHECKS} endpoint(s) healthy"
+else
+  log_fail "${ENDPOINT_PASS}/${ENDPOINT_CHECKS} endpoint(s) healthy"
+fi
+
+# ---- Phase 5: Docker system sanity ----
+log_step "Phase 5: Docker System Sanity"
+
+# Check disk space for Docker. AVAIL_PCT holds the *used* percentage (df column 5); -P keeps each filesystem on one line.
+DOCKER_ROOT=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker")
+log_info "Docker root: ${DOCKER_ROOT}"
+
+if command -v df &>/dev/null && [[ -d "$DOCKER_ROOT" ]]; then
+  AVAIL_PCT=$(df -P "$DOCKER_ROOT" | awk 'NR==2 {print $5}' | tr -d '%')
+  if [[ "$AVAIL_PCT" =~ ^[0-9]+$ ]]; then
+    if [[ "$AVAIL_PCT" -ge 90 ]]; then
+      log_warn "Docker storage is ${AVAIL_PCT}% full — consider cleanup"
+    else
+      log_pass "Docker storage at ${AVAIL_PCT}% — within limits"
+    fi
+  fi
+fi
+
+# Check for dangling images
+DANGLING=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l)
+if [[ "$DANGLING" -gt 10 ]]; then
+  log_warn "${DANGLING} dangling images found — consider docker image prune"
+fi
+
+# ---- Summary ----
+echo ""
+echo -e "${BOLD}============================================${NC}"
+echo -e "${BOLD} Test Summary${NC}"
+echo -e "${BOLD} $(date -u '+%Y-%m-%dT%H:%M:%SZ')${NC}"
+echo -e "${BOLD}============================================${NC}"
+echo -e " ${GREEN}Passed:${NC} ${PASS_COUNT}" +echo -e " ${RED}Failed:${NC} ${FAIL_COUNT}" +echo -e " ${YELLOW}Warnings:${NC} ${WARN_COUNT}" + +if [[ ${#FAILURES[@]} -gt 0 ]]; then + echo -e "\n${BOLD}Failed checks:${NC}" + for f in "${FAILURES[@]}"; do + echo -e " ${RED}•${NC} $f" + done +fi + +echo "" +if [[ $FAIL_COUNT -eq 0 ]]; then + echo -e "${GREEN}${BOLD}✓ All integration checks passed${NC}" + exit 0 +else + echo -e "${RED}${BOLD}✗ ${FAIL_COUNT} integration check(s) failed${NC}" + exit 1 +fi diff --git a/users/ai-worker.nix b/users/ai-worker.nix index 6308151..b7a534d 100644 --- a/users/ai-worker.nix +++ b/users/ai-worker.nix @@ -4,7 +4,7 @@ group = "ai-worker"; home = "/home/ai-worker"; createHome = true; - extraGroups = [ "docker" ]; + extraGroups = [ "docker" "libvirtd" ]; shell = pkgs.bashInteractive; openssh.authorizedKeys.keys = [ keys.users.ai-worker.main