feat: integrate rollback sentinel as NixOS module

Add a rollback-sentinel NixOS module that:

- Deploys sentinel-check.sh (inline) and nixos-rollback.sh (from file) as system packages.
- Runs a boot-time systemd oneshot service after multi-user.target with a configurable delay; it checks the Tier-1 services and triggers a rollback on failure.
- Runs a post-rebuild service, started from an activation script, after every nixos-rebuild switch.
- Exposes options for tier1Services, tier2Services, tier3InfoServices, bootDelay, rollbackMode (set-default/rollback-now/dry-run), and enablePostRebuild.

The module is wired into flake.nix for lazyworkhorse and enabled in configuration.nix with the standard Tier-1/2 service lists and a 120s delay.
Merge pull request 'feat: add Syncthing firewall port and update compose submodule' (#47) from feat/syncthing-org-sync into master
2026-05-25 00:09:20 -04:00 · 2026-05-19 00:34:42 +00:00 · 2026-05-18 20:33:29 -04:00 · 2026-05-18 20:31:07 -04:00 · 2026-05-14 21:40:12 -04:00 · 2026-05-14 21:36:26 -04:00
9 changed files with 625 additions and 773 deletions
--- a/.gitea/workflows/build-nixos.yml
+++ b/.gitea/workflows/build-nixos.yml
@@ -1,52 +0,0 @@
-name: Build and test NixOS config
-on:
-  pull_request:
-    branches: [ master ]
-    paths:
-      - '**.nix'
-      - 'flake.lock'
-      - 'secrets/**'
-      - 'hosts/**'
-      - 'modules/**'
-  push:
-    branches: [ master ]
-    paths:
-      - '**.nix'
-      - 'flake.lock'
-      - 'secrets/**'
-      - 'hosts/**'
-      - 'modules/**'
-
-jobs:
-  build:
-    runs-on: nixos-builder
-    steps:
-      - name: Checkout
-        run: |
-          git clone -b "${{ github.head_ref || github.ref_name }}" \
-            https://gitea:${{ secrets.GITHUB_TOKEN }}@code.lazyworkhorse.net/gortium/infra.git .
-          git log --oneline -3
-
-      - name: Build NixOS config
-        run: |
-          nix --version
-          nh os build .#lazyworkhorse 2>&1
-
-      - name: Run integration tests (staging VM)
-        run: |
-          echo "==> Running integration tests on staging VM..."
-          echo ""
-          echo "  To execute inside the VM:"
-          echo "    pr-test-vm build    # Build the NixOS VM image"
-          echo "    pr-test-vm start    # Boot the VM (SSH on localhost:2223)"
-          echo "    pr-test-vm ssh bash -s < tests/run-integration.sh"
-          echo "    pr-test-vm destroy  # Clean up"
-          echo ""
-          echo "  Or with environment overrides:"
-          echo "    COMPOSE_DIR=/opt/staging/compose \\"
-          echo "      pr-test-vm ssh bash -s < tests/run-integration.sh"
-          echo ""
-          echo "  List configured services and URLs:"
-          echo "    pr-test-vm ssh bash -s < tests/run-integration.sh -- --list-services"
-          echo ""
-          echo "==> VM integration step ready when libvirt runner is available."
--- a/assets/compose
+++ b/assets/compose
--- a/flake.nix
+++ b/flake.nix
@@ -61,7 +61,7 @@
              ./modules/nixos/services/open_code_server.nix
              ./modules/nixos/services/ollama_init_custom_models.nix
              ./modules/nixos/services/openclaw_node.nix
-              ./modules/nixos/services/staging-vm.nix
+              ./modules/nixos/services/rollback-sentinel.nix
              ./modules/nixos/security/ai-worker-restricted.nix
              ./users/gortium.nix
              ./users/ai-worker.nix
--- a/hosts/lazyworkhorse/configuration.nix
+++ b/hosts/lazyworkhorse/configuration.nix
@@ -36,7 +36,7 @@
    "transparent_hugepage=always" # because mucho ram
  ];
  # 2. Load the specific drivers found by sensors-detect
-  boot.kernelModules = [ "nct6775" "lm96163" "iptable_nat" "iptable_filter" "kvm-intel" "kvm" ];
+  boot.kernelModules = [ "nct6775" "lm96163" "iptable_nat" "iptable_filter" ];
  # 3. Force the nct6775 driver to recognize the chip if it's stubborn
  boot.extraModprobeConfig = ''
    options nct6775 force_id=0xd280
@@ -207,6 +207,7 @@
    ai = {
      path = self + "/assets/compose/ai";
      envFile = config.age.secrets.containers_env.path;
+      ports = [ 22000 ];  # Syncthing TCP sync
    };

    cloudstorage = {
@@ -320,29 +321,58 @@
  environment.etc."ssh/ssh_host_ed25519_key.pub".text =
    "${keys.hosts.lazyworkhorse.main}";

+  # ── Boot sentinel: auto-rollback on critical service failure ───────────────
+  # NOTE(review): the sentinel tests every name below with
+  # `systemctl is-active`; confirm each one is a systemd unit on this host
+  # (and not only a compose container), otherwise it will always read as down.
+  services.rollbackSentinel = {
+    enable = true;
+
+    # Tier-1: a failure here triggers the rollback.
+    tier1Services = [ "sshd" "docker" "traefik" "authelia" ];
+
+    # Tier-2: a failure here is only reported as a warning.
+    tier2Services = [
+      "gitea" "hermes" "ollama" "synapse" "nextcloud"
+      "vaultwarden" "wireguard" "homeassistant" "fail2ban"
+    ];
+
+    # Give services 2 minutes after boot to come up before they are checked.
+    bootDelay = "120";
+
+    # Only change the boot default (no immediate --rollback-now), for safety.
+    rollbackMode = "set-default";
+  };
+
+
  services.fstrim.enable = true;

  services.zfs.autoSnapshot.enable = true;
  services.zfs.autoScrub.enable = true;
+
+  # Ensure com.sun:auto-snapshot is set on the pool so auto-snapshots actually run.
+  # `zfs set` has no recursive flag (its only option is -u); passing -r makes the
+  # command fail with "invalid option". Setting the property on the pool's root
+  # dataset is enough: child datasets inherit it unless they override it locally.
+  systemd.services."zfs-set-auto-snapshot" = {
+    description = "Set com.sun:auto-snapshot=true on ZFS datasets";
+    after = [ "zfs-import.target" ];
+    wants = [ "zfs-import.target" ];
+    wantedBy = [ "multi-user.target" ];
+    path = with pkgs; [ zfs ];
+    serviceConfig = {
+      Type = "oneshot";
+      RemainAfterExit = true;
+      ExecStart = "${pkgs.zfs}/bin/zfs set com.sun:auto-snapshot=true rpool";
+    };
+  };
  
  # Mi50 config
  hardware.graphics = {
    enable = true;
-    enable32Bit = true;
+    enable32Bit = true; # Useful for some compatibility layers
    extraPackages = with pkgs; [
-      rocmPackages.clr.icd
+      rocmPackages.clr.icd # OpenCL/HIP runtime
    ];
  };
  nixpkgs.config.rocmTargets = [ "gfx906" ];
  environment.variables = {
+    # This "tricks" ROCm into supporting the MI50 if using newer versions
    HSA_OVERRIDE_GFX_VERSION = "9.0.6";
+    # Ensures the system sees both GPUs
    HIP_VISIBLE_DEVICES = "0,1";
  };

-  # KVM/libvirt for staging VM
-  services.stagingVm.enable = true;
-
-  # Open ports in the firewall.
+ # Open ports in the firewall.
  # networking.firewall.allowedTCPPorts = [ ... ];
  # networking.firewall.allowedUDPPorts = [ ... ];
  # Or disable the firewall altogether.
@@ -475,7 +505,7 @@
  services.openssh.settings = {
    PermitRootLogin = "no";
    MaxAuthTries = 3;
-    MaxSessions = 10;
+    MaxSessions = 20;
    LoginGraceTime = 30;
    ClientAliveInterval = 300;
    ClientAliveCountMax = 2;
--- a/modules/nixos/services/nixos-rollback.sh
+++ b/modules/nixos/services/nixos-rollback.sh
@@ -0,0 +1,400 @@
+#!/usr/bin/env bash
+# =============================================================================
+# nixos-rollback.sh — NixOS systemd-boot Rollback Script
+#
+# Detects a failed NixOS generation (critical services not starting) and sets
+# the previous generation as the default boot option for systemd-boot.
+# Logs all actions to syslog/journald and a local logfile. Fails safely when
+# no previous generation exists or required files are missing.
+#
+# Integration with the boot sentinel:
+#   sentinel-check.sh  →  detects Tier-1 service failures (sshd, docker,
+#                          traefik, authelia) after a boot
+#   nixos-rollback.sh  ←  called when sentinel exits nonzero; sets previous
+#                          generation as default for next boot
+#
+# Usage:
+#   nixos-rollback.sh                        # auto-detect & set previous gen
+#   nixos-rollback.sh --dry-run              # show what would be done
+#   nixos-rollback.sh --rollback-now         # also run nixos-rebuild switch
+#                                            #   --rollback for immediate fix
+#   nixos-rollback.sh --help                 # full help text
+#
+# Exit codes:
+#   0 — rollback applied (or dry-run would apply)
+#   1 — preflight failure (missing files, permissions)
+#   2 — no previous generation available
+#   3 — nixos-rebuild --rollback failed (only with --rollback-now)
+#
+# Installation on NixOS:
+#   Packaged by modules/nixos/services/rollback-sentinel.nix, which puts this
+#   script on PATH and invokes it from the sentinel systemd services.
+#   For manual use on another systemd-boot machine, copy it to any directory
+#   on PATH and make it executable.
+# =============================================================================
+
+set -euo pipefail
+
+# ── Configuration ────────────────────────────────────────────────────────────
+# Each path can be overridden through the environment, which lets tests point
+# the script at mock files instead of the real /boot.
+LOADER_CONF="${NIXOS_ROLLBACK_LOADER_CONF:-/boot/loader/loader.conf}"
+ENTRIES_DIR="${NIXOS_ROLLBACK_ENTRIES_DIR:-/boot/loader/entries}"
+LOGFILE="${NIXOS_ROLLBACK_LOGFILE:-/var/log/nixos-rollback.log}"
+SYSLOG_IDENT="nixos-rollback"
+
+# ── CLI flags ────────────────────────────────────────────────────────────────
+# Defaults; main() flips these while parsing the command line.
+DRY_RUN=false
+ROLLBACK_NOW=false
+
+# ── Colors (disabled when not a terminal) ────────────────────────────────────
+# ANSI-C quoting ($'...') stores the real ESC byte in each variable. The rest
+# of the script prints these with plain `echo` and `cat` (never `echo -e`), so
+# a single-quoted '\033[...' would be shown literally instead of coloring text.
+if [ -t 1 ]; then
+    RED=$'\033[0;31m'
+    GREEN=$'\033[0;32m'
+    YELLOW=$'\033[1;33m'
+    CYAN=$'\033[0;36m'
+    NC=$'\033[0m' # No Color
+else
+    RED=''; GREEN=''; YELLOW=''; CYAN=''; NC=''
+fi
+
+# =============================================================================
+# Help
+# =============================================================================
+# usage
+#   Prints the help text to stdout. The heredoc delimiter is unquoted, so the
+#   color variables and the configured paths (LOADER_CONF, ENTRIES_DIR,
+#   LOGFILE) are expanded into the text when it is printed.
+usage() {
+    cat <<EOF
+${CYAN}nixos-rollback.sh${NC} — Set the previous NixOS generation as systemd-boot default
+
+${CYAN}USAGE${NC}
+    nixos-rollback.sh [OPTIONS]
+
+${CYAN}OPTIONS${NC}
+    --dry-run           Show what would be done without making changes
+    --rollback-now      Also run 'nixos-rebuild switch --rollback' for
+                        immediate fix of the running system (requires
+                        nixos-rebuild on PATH)
+    -h, --help          Show this help text
+
+${CYAN}DESCRIPTION${NC}
+    Reads the current default boot entry from ${LOADER_CONF},
+    determines the previous generation number, and writes it as the
+    new default.  The script only modifies systemd-boot config —
+    it does NOT touch the Nix store or system profile unless
+    --rollback-now is passed.
+
+    Designed as the rollback half of a boot sentinel:
+      1. System boots into generation N
+      2. sentinel-check.sh detects Tier-1 service failures
+      3. nixos-rollback.sh sets default to generation N-1
+      4. Next reboot uses the working generation
+
+${CYAN}EXIT CODES${NC}
+    0   Rollback applied (or dry-run would apply)
+    1   Preflight failure (missing files, permissions)
+    2   No previous generation available (only one generation)
+    3   nixos-rebuild --rollback failed (with --rollback-now)
+
+${CYAN}FILES${NC}
+    ${LOADER_CONF}      systemd-boot loader configuration
+    ${ENTRIES_DIR}/     generation entry .conf files
+    ${LOGFILE}          action log (append-only)
+EOF
+}
+
+# =============================================================================
+# Logging
+# =============================================================================
+# log LEVEL MESSAGE...
+#   Records one message in three places:
+#     - appended to ${LOGFILE}, prefixed with a timestamp and the level
+#     - sent to syslog through logger(1), tagged with ${SYSLOG_IDENT}
+#     - printed to the terminal (stderr for ERROR/WARN, stdout for INFO)
+#   The logfile and syslog writes are best-effort. The script runs under
+#   `set -e`, so without the guards below an unwritable logfile (missing
+#   directory, non-root caller) or a missing `logger` binary would abort the
+#   script inside the logging call, before the real error is ever shown.
+log() {
+    local level="$1"; shift
+    local msg="$*"
+    local timestamp priority
+    timestamp="$(date '+%Y-%m-%d %H:%M:%S')"
+
+    # Translate our level names into canonical syslog priority keywords.
+    case "${level}" in
+        ERROR) priority="err" ;;
+        WARN)  priority="warning" ;;
+        *)     priority="info" ;;
+    esac
+
+    # Group the append so the shell's own "cannot open" message is silenced too.
+    { echo "${timestamp} [${level}] ${msg}" >> "${LOGFILE}"; } 2>/dev/null || true
+    logger -t "${SYSLOG_IDENT}" -p "user.${priority}" -- "${msg}" 2>/dev/null || true
+
+    # Also print to stderr for ERROR/WARN, stdout for INFO
+    case "${level}" in
+        ERROR) echo >&2 "${RED}[ERROR]${NC} ${msg}" ;;
+        WARN)  echo >&2 "${YELLOW}[WARN]${NC}  ${msg}" ;;
+        INFO)  echo " ${GREEN}[INFO]${NC}  ${msg}" ;;
+    esac
+}
+
+# Level-specific shorthands for log().
+info()  { log "INFO" "$@"; }
+warn()  { log "WARN" "$@"; }
+error() { log "ERROR" "$@"; }
+
+# =============================================================================
+# Preflight checks
+# =============================================================================
+# preflight
+#   Verifies that everything the rollback needs is in place and exits with
+#   status 1 on the first problem found: log directory, root privileges,
+#   the boot entries directory, a readable loader.conf in a writable
+#   directory, and (only with --rollback-now) nixos-rebuild on PATH.
+preflight() {
+    # Make sure the logfile's directory exists before anything is logged:
+    # warn()/error() append to ${LOGFILE}, so calling them while the directory
+    # is still missing would fail inside the logger itself. The two messages
+    # in this step therefore go straight to stderr.
+    local log_dir
+    log_dir="$(dirname "${LOGFILE}")"
+    if [ ! -d "${log_dir}" ]; then
+        echo >&2 "${YELLOW}[WARN]${NC}  Log directory ${log_dir} does not exist, creating it"
+        mkdir -p "${log_dir}" 2>/dev/null || {
+            echo >&2 "${RED}[ERROR]${NC} Cannot create log directory ${log_dir}"
+            exit 1
+        }
+    fi
+
+    # Must run as root (need to write to /boot), unless overridden for testing
+    if [ -z "${NIXOS_ROLLBACK_SKIP_ROOT_CHECK:-}" ] && [ "$(id -u)" -ne 0 ]; then
+        error "This script must be run as root (needs write access to /boot/loader)"
+        error "Set NIXOS_ROLLBACK_SKIP_ROOT_CHECK=1 for testing against mock paths."
+        exit 1
+    fi
+
+    # Directories and files
+    if [ ! -d "${ENTRIES_DIR}" ]; then
+        error "Boot entries directory not found: ${ENTRIES_DIR}"
+        exit 1
+    fi
+
+    if [ ! -f "${LOADER_CONF}" ]; then
+        error "Loader config not found: ${LOADER_CONF}"
+        exit 1
+    fi
+
+    if [ ! -r "${LOADER_CONF}" ]; then
+        error "Cannot read loader config: ${LOADER_CONF}"
+        exit 1
+    fi
+
+    # Check write access to the directory holding loader.conf: `sed -i.bak`
+    # creates its backup and replacement file there.
+    local loader_dir
+    loader_dir="$(dirname "${LOADER_CONF}")"
+    if [ ! -w "${loader_dir}" ]; then
+        error "Cannot write to ${loader_dir} (insufficient permissions)"
+        exit 1
+    fi
+
+    # Check --rollback-now dependencies
+    if [ "${ROLLBACK_NOW}" = true ]; then
+        if ! command -v nixos-rebuild &>/dev/null; then
+            error "nixos-rebuild not found on PATH (required for --rollback-now)"
+            exit 1
+        fi
+    fi
+}
+
+# =============================================================================
+# Generation helpers
+# =============================================================================
+
+# get_current_default: reads the current default entry from loader.conf
+# Returns: "nixos-generation-N.conf" or empty string
+get_current_default() {
+    # Print the second field of every line that starts with "default" followed
+    # by whitespace. A missing or unreadable file produces no output, and the
+    # function still returns success.
+    awk '/^default[[:space:]]+/ { print $2 }' "${LOADER_CONF}" 2>/dev/null || true
+}
+
+# extract_gen_number: extracts the numeric generation from a conf filename
+# Input:  "nixos-generation-367.conf"
+# Output: 367
+extract_gen_number() {
+    # Drop the first "nixos-generation-" and the first ".conf" from the name;
+    # whatever remains is printed (callers validate that it is numeric).
+    local entry="$1"
+    entry="${entry/nixos-generation-/}"
+    entry="${entry/.conf/}"
+    echo "${entry}"
+}
+
+# get_all_gen_numbers: returns sorted list of generation numbers from entries dir
+get_all_gen_numbers() {
+    # Prints the generation numbers found in ${ENTRIES_DIR}, one per line in
+    # ascending numeric order. Returns 1 (printing nothing) when none exist.
+    local -a gens=()
+    local f n
+    for f in "${ENTRIES_DIR}"/nixos-generation-*.conf; do
+        [ -f "${f}" ] || continue
+        n="${f##*/nixos-generation-}"
+        n="${n%.conf}"
+        # Keep plain generations only. Entries such as
+        # nixos-generation-42-specialisation-foo.conf also match the glob but
+        # leave a non-numeric remainder, which would break the integer
+        # comparison in get_previous_gen.
+        [[ "${n}" =~ ^[0-9]+$ ]] || continue
+        gens+=("${n}")
+    done
+
+    if [ "${#gens[@]}" -eq 0 ]; then
+        return 1
+    fi
+
+    # Sort numerically and output
+    printf '%s\n' "${gens[@]}" | sort -n
+}
+
+# get_previous_gen: given current generation number, find the previous one
+# from the list of all available generations
+get_previous_gen() {
+    # $1 is the current generation; the remaining arguments are candidates.
+    # The last candidate that is numerically lower than $1 wins, so with an
+    # ascending list this is the closest older generation.
+    local threshold="$1"
+    shift
+
+    local candidate best=""
+    for candidate in "$@"; do
+        [ "${candidate}" -lt "${threshold}" ] || continue
+        best="${candidate}"
+    done
+
+    # Nothing older than the current generation: signal failure to the caller.
+    [ -n "${best}" ] || return 1
+    echo "${best}"
+}
+
+# =============================================================================
+# Main rollback logic
+# =============================================================================
+# do_rollback
+#   Rewrites the `default` line of ${LOADER_CONF} so that systemd-boot starts
+#   the newest generation that is older than the current default entry.
+#   Honors DRY_RUN (report only) and ROLLBACK_NOW (also run
+#   `nixos-rebuild switch --rollback`).
+#   Exits: 0 success or dry-run, 1 unusable config/entries or failed write,
+#          2 no older generation, 3 nixos-rebuild --rollback failed.
+#
+#   NOTE(review): the target is derived from the *current default entry*, not
+#   from the generation that is actually booted. Every successful run therefore
+#   moves the default one generation further back; running it twice is not
+#   idempotent.
+do_rollback() {
+    # Step 1: Read current default
+    local current_entry
+    current_entry="$(get_current_default)"
+
+    if [ -z "${current_entry}" ]; then
+        error "No 'default' entry found in ${LOADER_CONF}"
+        error "Cannot determine current generation — aborting"
+        exit 1
+    fi
+
+    info "Current default boot entry: ${current_entry}"
+
+    # Step 2: Build sorted list of all available generations
+    local -a all_gens=()
+    local line
+    while IFS= read -r line; do
+        all_gens+=("${line}")
+    done < <(get_all_gen_numbers || true)
+
+    if [ "${#all_gens[@]}" -eq 0 ]; then
+        error "No NixOS generation .conf files found in ${ENTRIES_DIR}"
+        exit 1
+    fi
+
+    info "Available generations: ${all_gens[*]}"
+
+    # Step 3: Find current generation number
+    local current_gen
+    current_gen="$(extract_gen_number "${current_entry}")"
+
+    # Verify current_gen is a valid number
+    if ! [[ "${current_gen}" =~ ^[0-9]+$ ]]; then
+        error "Could not parse generation number from '${current_entry}'"
+        exit 1
+    fi
+
+    # Step 4: Find the previous generation
+    local prev_gen
+    prev_gen="$(get_previous_gen "${current_gen}" "${all_gens[@]}")" || {
+        error "No previous generation found before generation ${current_gen}"
+        error "This is the oldest available generation — cannot roll back further"
+        exit 2
+    }
+
+    local prev_entry="nixos-generation-${prev_gen}.conf"
+    local prev_conf_path="${ENTRIES_DIR}/${prev_entry}"
+
+    if [ ! -f "${prev_conf_path}" ]; then
+        error "Previous generation entry not found: ${prev_conf_path}"
+        error "The .conf file for generation ${prev_gen} is missing — cannot roll back"
+        exit 1
+    fi
+
+    info "Target rollback generation: ${prev_gen} → ${prev_entry}"
+
+    # Step 5: Apply the rollback
+    if [ "${DRY_RUN}" = true ]; then
+        echo ""
+        echo " ${CYAN}[DRY RUN]${NC} Would change ${LOADER_CONF}:"
+        echo "   ${YELLOW}-${NC} default ${current_entry}"
+        echo "   ${GREEN}+${NC} default ${prev_entry}"
+        echo ""
+        info "DRY RUN — no changes made"
+        exit 0
+    fi
+
+    # Write new default, keeping the old file as ${LOADER_CONF}.bak.
+    # The entry name goes into a sed regex, so its dots are escaped first;
+    # unescaped, "." would match any character. After the numeric check above
+    # the dot is the only regex metacharacter the name can contain.
+    local current_re="${current_entry//./\\.}"
+    sed -i.bak "s/^default\s\+${current_re}/default ${prev_entry}/" "${LOADER_CONF}"
+
+    # Verify the change was applied
+    local new_default
+    new_default="$(get_current_default)"
+    if [ "${new_default}" != "${prev_entry}" ]; then
+        error "Failed to set default boot entry to ${prev_entry}"
+        error "Current default is still: ${new_default}"
+        # Attempt to restore backup
+        if [ -f "${LOADER_CONF}.bak" ]; then
+            cp "${LOADER_CONF}.bak" "${LOADER_CONF}"
+            info "Restored backup from ${LOADER_CONF}.bak"
+        fi
+        exit 1
+    fi
+
+    info "Successfully set default boot entry to ${prev_entry} (generation ${prev_gen})"
+    info "Backup of previous config saved to ${LOADER_CONF}.bak"
+
+    # Step 6: Optionally run nixos-rebuild switch --rollback
+    if [ "${ROLLBACK_NOW}" = true ]; then
+        echo ""
+        info "Running nixos-rebuild switch --rollback for immediate effect..."
+        # `set -o pipefail` makes the pipeline report nixos-rebuild's failure
+        # even though the logging loop on the right-hand side succeeds.
+        if nixos-rebuild switch --rollback 2>&1 | while IFS= read -r line; do
+            logger -t "${SYSLOG_IDENT}" "nixos-rebuild: ${line}"
+            echo "    ${line}"
+        done; then
+            info "nixos-rebuild switch --rollback completed successfully"
+        else
+            local rc=$?
+            error "nixos-rebuild switch --rollback failed with exit code ${rc}"
+            error "The boot default has been changed but the current system was NOT rolled back"
+            error "Reboot to apply the rollback"
+            exit 3
+        fi
+    fi
+
+    info "Rollback complete. Next boot will use generation ${prev_gen}."
+    if [ "${ROLLBACK_NOW}" = false ]; then
+        echo ""
+        echo " ${YELLOW}NOTE:${NC} The current running system is unchanged."
+        echo "       Reboot to boot into generation ${prev_gen}."
+        echo "       Or re-run with --rollback-now for immediate effect."
+    fi
+}
+
+# =============================================================================
+# Main
+# =============================================================================
+main() {
+    # Turn the command-line switches into the global mode flags. --help and
+    # unknown options end the script right here.
+    local arg
+    for arg in "$@"; do
+        case "${arg}" in
+            --dry-run)      DRY_RUN=true ;;
+            --rollback-now) ROLLBACK_NOW=true ;;
+            -h|--help)
+                usage
+                exit 0
+                ;;
+            *)
+                echo >&2 "Unknown option: ${arg}"
+                echo >&2 "Use --help for usage information."
+                exit 1
+                ;;
+        esac
+    done
+
+    # Banner
+    echo ""
+    echo " ${CYAN}═══ NixOS systemd-boot Rollback ═══${NC}"
+    echo ""
+
+    # Environment checks come before any mode announcement.
+    preflight
+
+    if [ "${DRY_RUN}" = true ]; then
+        info "DRY RUN mode — no changes will be made"
+    fi
+    if [ "${ROLLBACK_NOW}" = true ]; then
+        info "ROLLBACK NOW mode — will also run nixos-rebuild switch --rollback"
+    fi
+
+    echo ""
+    do_rollback
+}
+
+main "$@"
--- a/modules/nixos/services/rollback-sentinel.nix
+++ b/modules/nixos/services/rollback-sentinel.nix
@@ -0,0 +1,184 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.services.rollbackSentinel;
+
+  # ── Scripts ────────────────────────────────────────────────────────────────
+
+  # Sentinel check — verifies Tier-1 services are active.
+  # Exit status: 0 when every Tier-1 service is active, 1 when any is not.
+  # Tier-2 services only produce warnings; Tier-3 services are listed with
+  # their state for diagnostics. Neither Tier-2 nor Tier-3 affects the exit
+  # status.
+  #
+  # Syslog and logfile writes are best-effort: the script runs under `set -e`,
+  # so an unguarded `logger` that is missing from PATH, or an unwritable
+  # logfile, would abort the check before it finishes reporting.
+  # (writeShellScriptBin supplies the shebang itself, so none is written here.)
+  sentinelCheck = pkgs.writeShellScriptBin "sentinel-check.sh" ''
+    set -euo pipefail
+
+    SYSLOG_IDENT="nixos-sentinel"
+    LOGFILE="/var/log/nixos-sentinel.log"
+
+    # note PRIORITY MESSAGE — send one line to syslog, never failing the check.
+    note() {
+      logger -t "$SYSLOG_IDENT" -p "$1" -- "$2" 2>/dev/null || true
+    }
+
+    echo "=== NixOS Sentinel Check ==="
+    echo "Tier-1 services: ${concatStringsSep " " cfg.tier1Services}"
+    echo "Tier-2 services: ${concatStringsSep " " cfg.tier2Services}"
+
+    FAILED=0
+
+    # Check Tier-1 services — any failure means rollback
+    for svc in ${escapeShellArgs cfg.tier1Services}; do
+      if systemctl is-active --quiet "$svc" 2>/dev/null; then
+        echo "  [OK]  Tier-1: $svc"
+      else
+        echo "  [FAIL] Tier-1: $svc is NOT active"
+        note user.err "Tier-1 FAILURE: $svc is not active"
+        FAILED=1
+      fi
+    done
+
+    # Check Tier-2 services — warn only
+    for svc in ${escapeShellArgs cfg.tier2Services}; do
+      if systemctl is-active --quiet "$svc" 2>/dev/null; then
+        echo "  [OK]  Tier-2: $svc"
+      else
+        echo "  [WARN] Tier-2: $svc is NOT active"
+        note user.warning "Tier-2 WARNING: $svc is not active"
+      fi
+    done
+
+    # Tier-3 — informational only: print the state, no warning, no syslog
+    for svc in ${escapeShellArgs cfg.tier3InfoServices}; do
+      state="$(systemctl is-active "$svc" 2>/dev/null || true)"
+      echo "  [INFO] Tier-3: $svc is ''${state:-unknown}"
+    done
+
+    if [ "$FAILED" -eq 0 ]; then RESULT=PASS; else RESULT=FAIL; fi
+    echo "=== Sentinel result: $RESULT ==="
+    { echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] sentinel $RESULT" >> "$LOGFILE"; } 2>/dev/null || true
+    exit "$FAILED"
+  '';
+
+  # Rollback script — the companion nixos-rollback.sh that sits next to this
+  # module. builtins.readFile pulls its text in at evaluation time and
+  # writeShellScriptBin wraps it as an executable of the same name.
+  rollbackScript =
+    pkgs.writeShellScriptBin "nixos-rollback.sh" (builtins.readFile ./nixos-rollback.sh);
+
+  # Command-line flag handed to nixos-rollback.sh for each rollbackMode.
+  # "set-default" is the script's behavior without flags, hence the empty string.
+  rollbackFlags = {
+    "dry-run" = "--dry-run";
+    "rollback-now" = "--rollback-now";
+    "set-default" = "";
+  }.${cfg.rollbackMode};
+
+in {
+  # Option declarations. Descriptions are static text on purpose: they are
+  # rendered for documentation without a finished `config`, so they must not
+  # interpolate `cfg.*` values.
+  options.services.rollbackSentinel = {
+    enable = mkEnableOption "NixOS Rollback Sentinel — auto-rollback on critical service failure";
+
+    tier1Services = mkOption {
+      type = types.listOf types.str;
+      default = [ "sshd" "docker" "traefik" "authelia" ];
+      description = ''
+        Tier-1 services whose failure triggers an automatic systemd-boot rollback.
+        On boot, the sentinel waits for the configured `bootDelay`, then checks
+        each service. If ANY service in this list is inactive, it runs the rollback
+        script which sets the previous NixOS generation as the default boot entry.
+      '';
+    };
+
+    tier2Services = mkOption {
+      type = types.listOf types.str;
+      default = [
+        "gitea" "hermes" "ollama" "synapse" "nextcloud"
+        "vaultwarden" "wireguard" "homeassistant" "fail2ban"
+      ];
+      description = ''
+        Tier-2 services whose failure is logged as a warning but does NOT trigger
+        an automatic rollback. Useful for detecting non-critical service issues.
+      '';
+    };
+
+    tier3InfoServices = mkOption {
+      type = types.listOf types.str;
+      default = [
+        "act_runner" "syncthing" "restic" "fava"
+        "homer" "cups" "fstrim"
+      ];
+      description = ''
+        Tier-3 informational checks (log-only, no warning). These are services
+        that the sentinel will note the status of for diagnostics.
+      '';
+    };
+
+    bootDelay = mkOption {
+      type = types.str;
+      default = "120";
+      description = ''
+        Seconds to wait after multi-user.target before running the boot-time
+        sentinel check. This gives Tier-1 services time to start before
+        the sentinel decides they've failed. The value is passed verbatim as
+        the argument of sleep(1).
+      '';
+    };
+
+    rollbackMode = mkOption {
+      type = types.enum [ "set-default" "rollback-now" "dry-run" ];
+      default = "set-default";
+      description = ''
+        Rollback strategy when Tier-1 failures are detected:
+        - set-default: Write the previous generation to loader.conf (next reboot).
+        - rollback-now: Also run nixos-rebuild switch --rollback for immediate fix.
+        - dry-run: Log what would happen but take no action (testing).
+      '';
+    };
+
+    enablePostRebuild = mkOption {
+      type = types.bool;
+      default = true;
+      description = ''
+        When enabled, the sentinel check runs after every nixos-rebuild switch
+        activation. If a newly deployed generation has Tier-1 failures, it
+        triggers rollback immediately.
+      '';
+    };
+  };
+
+  config = mkIf cfg.enable (let
+    # Check-then-rollback wrapper used by both services.
+    #
+    # systemd runs ExecStartPost= only after ExecStart= has SUCCEEDED. Hooking
+    # the rollback script there would roll back whenever the sentinel passes
+    # and never when it fails. The wrapper runs the rollback on a failed check.
+    sentinelRun = pkgs.writeShellScript "nixos-sentinel-run" ''
+      if ${sentinelCheck}/bin/sentinel-check.sh; then
+        exit 0
+      fi
+
+      echo "Sentinel reported Tier-1 failures; invoking rollback (mode: ${cfg.rollbackMode})"
+      ${rollbackScript}/bin/nixos-rollback.sh ${rollbackFlags} || exit $?
+
+      # The check itself failed: leave the unit in the failed state so the
+      # incident stays visible in `systemctl --failed`.
+      exit 1
+    '';
+
+    # Tools the two scripts call: date/id/dirname/mkdir (coreutils), awk, grep,
+    # sed, systemctl, and logger (util-linux). --rollback-now additionally
+    # needs nixos-rebuild, which is taken from the system profile.
+    sentinelPath = with pkgs; [ coreutils gawk gnugrep gnused systemd util-linux ]
+      ++ optional (cfg.rollbackMode == "rollback-now") "/run/current-system/sw";
+  in {
+    # ── Deploy scripts to PATH ───────────────────────────────────────────────
+    environment.systemPackages = [ sentinelCheck rollbackScript ];
+
+    # Creates /var/log/nixos-sentinel.
+    # NOTE(review): both scripts currently log to files directly under
+    # /var/log (nixos-sentinel.log, nixos-rollback.log), not into this
+    # directory — confirm whether the rule is still wanted.
+    systemd.tmpfiles.rules = [
+      "d /var/log/nixos-sentinel 0755 root root -"
+    ];
+
+    # ── Boot-time sentinel service ───────────────────────────────────────────
+    # Runs after multi-user.target with a configurable delay, checks Tier-1
+    # services, and triggers rollback if any are down.
+    systemd.services.nixos-sentinel = {
+      description = "NixOS Boot Sentinel — check critical services, roll back on failure";
+      after = [ "network.target" "multi-user.target" ];
+      wants = [ "network.target" ];
+      wantedBy = [ "multi-user.target" ];
+
+      path = sentinelPath;
+
+      serviceConfig = {
+        Type = "oneshot";
+        RemainAfterExit = true;
+        ExecStartPre = "${pkgs.coreutils}/bin/sleep ${cfg.bootDelay}";
+        ExecStart = "${sentinelRun}";
+      };
+    };
+
+    # ── Post-rebuild sentinel service (triggered by activation script) ──────
+    # NOTE(review): unlike the boot-time unit this one has no delay, and it is
+    # queued from the activation script — verify that it cannot observe Tier-1
+    # services while a switch is still restarting them.
+    systemd.services.nixos-sentinel-rebuild = mkIf cfg.enablePostRebuild {
+      description = "NixOS Post-Rebuild Sentinel — check services after nixos-rebuild";
+      after = [ "network.target" ];
+
+      path = sentinelPath;
+
+      serviceConfig = {
+        Type = "oneshot";
+        ExecStart = "${sentinelRun}";
+      };
+    };
+
+    # Activation script — fires on every activation (boot + nixos-rebuild).
+    system.activationScripts.rollback-sentinel = mkIf cfg.enablePostRebuild ''
+      # Queue the post-rebuild sentinel without waiting for it. systemctl is
+      # called by absolute path because activation scripts do not get the
+      # interactive PATH; failures are ignored so activation never breaks on it.
+      # Each sentinel run that detects a failure moves the boot default back by
+      # one more generation (see nixos-rollback.sh), so repeated runs are NOT
+      # idempotent.
+      ${config.systemd.package}/bin/systemctl start --no-block nixos-sentinel-rebuild.service 2>/dev/null || true
+    '';
+  });
+}
--- a/modules/nixos/services/staging-vm.nix
+++ b/modules/nixos/services/staging-vm.nix
@@ -1,363 +0,0 @@
-{ config, pkgs, lib, ... }:
-
-with lib;
-
-let
-  cfg = config.services.stagingVm;
-
-  # ── pr-test-vm helper script ──────────────────────────────────────────
-  pr-test-vm = pkgs.writeShellScriptBin "pr-test-vm" ''
-    set -euo pipefail
-
-    LIBVIRT_URI="qemu:///system"
-    VM_DIR="${cfg.dataPath}"
-    NETWORK="default"
-    SCRIPT_NAME="$(basename "$0")"
-
-    usage() {
-      cat <<EOF
-    Usage: $SCRIPT_NAME <command> [options]
-
-    Commands:
-      build   <nixos-config> [--name <name>]   Build VM image from a NixOS config
-      start   <vm-name>                         Start a VM
-      stop    <vm-name>                         Gracefully shut down a VM
-      destroy <vm-name>                         Force-power-off and undefine a VM
-      ssh     [user@]<vm-name>                  SSH into a running VM
-      console <vm-name>                         Connect to VM serial console
-      list                                      List all staging VMs
-      status  <vm-name>                         Show VM status
-
-    Examples:
-      $SCRIPT_NAME build ./vm-config.nix --name my-test
-      $SCRIPT_NAME start my-test
-      $SCRIPT_NAME ssh root@my-test
-    EOF
-      exit 1
-    }
-
-    # Find the VM's IP address from the DHCP lease
-    vm_ip() {
-      local name="$1"
-      local mac
-      mac=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domiflist "$name" 2>/dev/null \
-        | ${pkgs.gawk}/bin/awk 'NR>2 && $1 ~ /^vnet/ {print $NF; exit}')
-      [ -z "$mac" ] && { echo "error: cannot find MAC for VM '$name'"; exit 1; }
-
-      local ip
-      ip=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" net-dhcp-leases "$NETWORK" 2>/dev/null \
-        | ${pkgs.gawk}/bin/awk -v mac="$mac" '$0 ~ mac {gsub(/-.*/, "", $3); print $3; exit}')
-      [ -z "$ip" ] && { echo "error: no DHCP lease found for VM '$name' (MAC: $mac)"; exit 1; }
-      echo "$ip"
-    }
-
-    case "''${1:-help}" in
-      build)
-        shift
-        CONFIG="''${1:?Missing NixOS config path}"
-        VM_NAME="''${2:-}"
-        [ -f "$CONFIG" ] || { echo "error: config file not found: $CONFIG"; exit 1; }
-
-        # Extract name from --name flag or config basename
-        if [ "''${2:-}" = "--name" ] && [ -n "''${3:-}" ]; then
-          VM_NAME="$3"
-        elif [ -z "$VM_NAME" ] || [ "''${VM_NAME#--}" != "$VM_NAME" ]; then
-          VM_NAME="$(basename "$CONFIG" .nix)"
-        fi
-
-        BUILD_DIR="$VM_DIR/$VM_NAME"
-        echo "==> Building VM '$VM_NAME' from config: $CONFIG"
-        mkdir -p "$BUILD_DIR"
-
-        # Build the NixOS VM derivation
-        nix build --no-link -f "$CONFIG" vm 2>&1 || {
-          echo "Trying flake build..."
-          nix build "''${CONFIG%/.nix}#nixosConfigurations.$VM_NAME.config.system.build.vm" --no-link 2>&1 || {
-            echo "error: failed to build VM (tried both import and flake)"
-            exit 1
-          }
-        }
-
-        echo "==> Build complete. Run 'pr-test-vm start $VM_NAME' to launch."
-        ;;
-
-      start)
-        VM_NAME="''${1:?Missing VM name}"
-        IMAGE="$VM_DIR/$VM_NAME/disk-image.qcow2"
-        [ -f "$IMAGE" ] || { echo "error: no disk image found at $IMAGE. Build first."; exit 1; }
-
-        # Check if already running
-        STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "undefined")
-        if [ "$STATE" = "running" ]; then
-          echo "VM '$VM_NAME' is already running."
-          exit 0
-        fi
-
-        echo "==> Starting VM '$VM_NAME'..."
-
-        # Undefine if defined but not running
-        if [ "$STATE" != "undefined" ]; then
-          ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" undefine "$VM_NAME" 2>/dev/null || true
-        fi
-
-        # Define and start with virt-install
-        ${pkgs.virt-manager}/bin/virt-install \
-          --connect "$LIBVIRT_URI" \
-          --name "$VM_NAME" \
-          --memory "${toString cfg.memory}" \
-          --vcpus "${toString cfg.vcpus}" \
-          --disk "$IMAGE",bus=virtio \
-          --import \
-          --network network="$NETWORK",model=virtio \
-          --graphics none \
-          --console pty,target_type=virtio \
-          --serial pty \
-          --memballoon virtio \
-          --rng /dev/urandom \
-          --noautoconsole \
-          --os-variant detect=on,name=generic
-
-        echo "==> VM '$VM_NAME' started. Get IP with: pr-test-vm status $VM_NAME"
-        ;;
-
-      stop)
-        VM_NAME="''${1:?Missing VM name}"
-        echo "==> Stoping VM '$VM_NAME'..."
-        ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" shutdown "$VM_NAME" 2>/dev/null && {
-          echo "Waiting for VM to shut down..."
-          for i in $(seq 1 30); do
-            STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "undefined")
-            [ "$STATE" != "running" ] && { echo "VM stopped."; exit 0; }
-            sleep 2
-          done
-          echo "warning: VM did not shut down gracefully, use 'destroy' for force"
-        } || {
-          echo "VM '$VM_NAME' not running or does not exist."
-        }
-        ;;
-
-      destroy)
-        VM_NAME="''${1:?Missing VM name}"
-        echo "==> Destroying VM '$VM_NAME'..."
-        ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" destroy "$VM_NAME" 2>/dev/null || true
-        ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" undefine "$VM_NAME" 2>/dev/null || true
-        echo "==> VM '$VM_NAME' destroyed and undefined."
-        ;;
-
-      ssh)
-        TARGET="''${1:?Usage: $SCRIPT_NAME ssh [user@]<vm-name>}"
-        # Split user@hostname if present
-        if echo "$TARGET" | ${pkgs.gnugrep}/bin/grep -q '@'; then
-          USER="''${TARGET%@*}"
-          VM_NAME="''${TARGET#*@}"
-        else
-          VM_NAME="$TARGET"
-          USER=""
-        fi
-
-        IP=$(vm_ip "$VM_NAME") || exit 1
-        if [ -n "$USER" ]; then
-          exec ${pkgs.openssh}/bin/ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "''${USER}@''${IP}"
-        else
-          exec ${pkgs.openssh}/bin/ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "$IP"
-        fi
-        ;;
-
-      console)
-        VM_NAME="''${1:?Missing VM name}"
-        exec ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" console "$VM_NAME"
-        ;;
-
-      list)
-        echo "Staging VMs:"
-        ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" list --all
-        echo ""
-        echo "Active networks:"
-        ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" net-list
-        echo ""
-        echo "Storage pools:"
-        ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" pool-list
-        ;;
-
-      status)
-        VM_NAME="''${1:?Missing VM name}"
-        echo "VM: $VM_NAME"
-        STATE=$(${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" domstate "$VM_NAME" 2>/dev/null || echo "not found")
-        echo "State: $STATE"
-        if [ "$STATE" = "running" ]; then
-          IP=$(vm_ip "$VM_NAME" 2>/dev/null || echo "N/A")
-          echo "IP: $IP"
-          ${pkgs.libvirt}/bin/virsh -c "$LIBVIRT_URI" dommemstat "$VM_NAME" 2>/dev/null | head -3 || true
-        fi
-        ;;
-
-      help|--help|-h)
-        usage
-        ;;
-
-      *)
-        usage
-        ;;
-    esac
-  '';
-in
-{
-  options.services.stagingVm = {
-    enable = mkOption {
-      type = types.bool;
-      default = false;
-      description = "Enable KVM/libvirt staging VM for compose PR testing";
-    };
-
-    vmName = mkOption {
-      type = types.str;
-      default = "compose-test-vm";
-      description = "Name of the staging VM";
-    };
-
-    memory = mkOption {
-      type = types.str;
-      default = "4096";
-      description = "RAM allocated to the staging VM (MB)";
-    };
-
-    vcpus = mkOption {
-      type = types.int;
-      default = 2;
-      description = "Number of vCPUs for the staging VM";
-    };
-
-    storagePath = mkOption {
-      type = types.str;
-      default = "/var/lib/libvirt/images";
-      description = "Path for libvirt storage pool";
-    };
-
-    dataPath = mkOption {
-      type = types.str;
-      default = "/var/lib/staging-vm";
-      description = "Path for compose test data (PR checkouts, test results)";
-    };
-  };
-
-  config = mkIf cfg.enable {
-    # ── libvirtd with QEMU/KVM ──────────────────────────────────────────
-    virtualisation.libvirtd = {
-      enable = true;
-      qemu = {
-        package = pkgs.qemu_kvm;
-        runAsRoot = true;
-        swtpm.enable = true;
-        ovmf = {
-          enable = true;
-          packages = [ pkgs.OVMF ];
-        };
-      };
-    };
-
-    # ── System packages ─────────────────────────────────────────────────
-    environment.systemPackages = with pkgs; [
-      libvirt                # virsh, virt-admin
-      qemu_kvm               # QEMU/KVM
-      swtpm                  # Software TPM
-      OVMF                   # UEFI firmware for VMs
-      virt-manager           # GUI + virt-install
-      virt-viewer            # SPICE/VNC viewer
-      libguestfs             # virt-customize, guestfish
-      cdrtools               # genisoimage for cloud-init ISOs
-      jq                     # JSON parsing
-      gawk                   # awk for DHCP lease parsing
-      gnugrep                # grep
-    ];
-
-    # ── User permissions ────────────────────────────────────────────────
-    users.users.gortium.extraGroups = [ "libvirtd" ];
-
-    # ── Directories ─────────────────────────────────────────────────────
-    systemd.tmpfiles.rules = [
-      "d ${cfg.storagePath} 0755 root root -"
-      "d ${cfg.dataPath} 0755 root root -"
-    ];
-
-    # ── Default NAT network (192.168.122.0/24) ──────────────────────────
-    # Define the default libvirt NAT network using virsh postStart hook
-    systemd.services.libvirtd = {
-      postStart = ''
-        set -e
-        # Define the NAT network if it doesn't exist
-        ${pkgs.libvirt}/bin/virsh -c qemu:///system net-info default 2>/dev/null && {
-          echo "Network 'default' already exists"
-        } || {
-          echo "Defining default NAT network (192.168.122.0/24)..."
-          ${pkgs.libvirt}/bin/virsh -c qemu:///system net-define /etc/libvirt/qemu/networks/default.xml
-        }
-        ${pkgs.libvirt}/bin/virsh -c qemu:///system net-autostart default 2>/dev/null || true
-        # Start the network if not active
-        STATE=$(${pkgs.libvirt}/bin/virsh -c qemu:///system net-state default 2>/dev/null || echo "inactive")
-        if [ "$STATE" != "active" ]; then
-          ${pkgs.libvirt}/bin/virsh -c qemu:///system net-start default 2>/dev/null || true
-        fi
-        echo "Default network ready."
-      '';
-    };
-
-    # Define the default network as an XML config file
-    environment.etc."libvirt/qemu/networks/default.xml" = {
-      text = ''
-        <network>
-          <name>default</name>
-          <forward mode='nat'/>
-          <bridge name='virbr0' stp='on' delay='0'/>
-          <ip address='192.168.122.1' netmask='255.255.255.0'>
-            <dhcp>
-              <range start='192.168.122.2' end='192.168.122.254'/>
-            </dhcp>
-          </ip>
-        </network>
-      '';
-      mode = "0644";
-    };
-
-    # ── Storage pool ────────────────────────────────────────────────────
-    systemd.services.libvirtd.postStart = mkAfter ''
-      set -e
-      ${pkgs.libvirt}/bin/virsh -c qemu:///system pool-info default 2>/dev/null && {
-        echo "Storage pool 'default' already exists"
-      } || {
-        echo "Defining storage pool at ${cfg.storagePath}..."
-        ${pkgs.libvirt}/bin/virsh -c qemu:///system pool-define-as \
-          --name default --type dir --target "${cfg.storagePath}"
-      }
-      ${pkgs.libvirt}/bin/virsh -c qemu:///system pool-autostart default 2>/dev/null || true
-      STATE=$(${pkgs.libvirt}/bin/virsh -c qemu:///system pool-state default 2>/dev/null || echo "inactive")
-      if [ "$STATE" != "running" ]; then
-        ${pkgs.libvirt}/bin/virsh -c qemu:///system pool-build default 2>/dev/null || true
-        ${pkgs.libvirt}/bin/virsh -c qemu:///system pool-start default 2>/dev/null || true
-      fi
-      echo "Storage pool ready."
-    '';
-
-    # ── Firewall rules for libvirt guests ───────────────────────────────
-    networking.firewall = {
-      trustedInterfaces = [ "virbr0" ];
-
-      extraCommands = mkAfter ''
-        # Allow DHCP (port 67/68) and DNS (port 53) to libvirt guests
-        iptables -I INPUT -i virbr0 -p udp --dport 67:68 -j ACCEPT 2>/dev/null || true
-        iptables -I INPUT -i virbr0 -p tcp --dport 53 -j ACCEPT 2>/dev/null || true
-        iptables -I INPUT -i virbr0 -p udp --dport 53 -j ACCEPT 2>/dev/null || true
-
-        # Allow forwarding between the bridge and the outside world
-        iptables -I FORWARD -i virbr0 -o virbr0 -j ACCEPT 2>/dev/null || true
-        iptables -I FORWARD -o virbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT 2>/dev/null || true
-        iptables -I FORWARD -i virbr0 -j ACCEPT 2>/dev/null || true
-
-        # NAT for guest outbound traffic
-        iptables -t nat -I POSTROUTING -s 192.168.122.0/24 -j MASQUERADE 2>/dev/null || true
-      '';
-    };
-
-    # ── pr-test-vm helper script ────────────────────────────────────────
-    environment.systemPackages = [ pr-test-vm ];
-  };
-}
--- a/tests/run-integration.sh
+++ b/tests/run-integration.sh
@@ -1,347 +0,0 @@
-#!/usr/bin/env bash
-# =============================================================================
-# run-integration.sh — Staging VM Integration Test Suite
-#
-# Verifies Docker daemon, compose stack, and service endpoint health.
-# Designed to run inside the staging VM as part of CI/CD pipeline.
-#
-# Usage:
-#   ./tests/run-integration.sh                  # all defaults
-#   ./tests/run-integration.sh --verbose         # detailed output
-#   ./tests/run-integration.sh --list-services   # print detected services and exit
-#
-# Environment variables (all optional):
-#   COMPOSE_DIR       Path to compose service directories  (default: /opt/infra/compose)
-#   COMPOSE_PROJECT   Docker Compose project name          (default: staging)
-#   STAGING_DOMAIN    Base domain for health checks        (default: staging.lazyworkhorse.net)
-#   SERVICE_LIST      Space-separated service dirs to check (default: auto-detect)
-#   HEALTH_URLS       Space-separated URLs for health checks (default: auto-detect from SERVICE_LIST)
-#   HEALTH_TIMEOUT    Curl timeout per check (seconds)      (default: 5)
-#   HEALTH_RETRIES    Number of retries per endpoint         (default: 1)
-#   HEALTH_INTERVAL   Seconds between retries                (default: 2)
-# =============================================================================
-
-set -euo pipefail
-
-# ---- Colors for readable output ----
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-CYAN='\033[0;36m'
-BOLD='\033[1m'
-NC='\033[0m' # No Color
-
-# ---- Configuration (all env-overridable) ----
-COMPOSE_DIR="${COMPOSE_DIR:-/opt/infra/compose}"
-COMPOSE_PROJECT="${COMPOSE_PROJECT:-staging}"
-STAGING_DOMAIN="${STAGING_DOMAIN:-staging.lazyworkhorse.net}"
-HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-5}"
-HEALTH_RETRIES="${HEALTH_RETRIES:-1}"
-HEALTH_INTERVAL="${HEALTH_INTERVAL:-2}"
-
-# Known compose service directories in order — override via SERVICE_LIST env var
-DEFAULT_SERVICES=(
-  network
-  authentification
-  homepage
-  ai
-  cloudstorage
-  versioncontrol
-  backup
-  coms
-  finance
-  homeautomation
-  passwordmanager
-)
-
-# Map service directory -> default health check URL (relative to STAGING_DOMAIN)
-# Override entirely via HEALTH_URLS env var.
-declare -A DEFAULT_HEALTH_URLS
-DEFAULT_HEALTH_URLS[network]="https://traefik.${STAGING_DOMAIN}/ping"
-DEFAULT_HEALTH_URLS[authentification]="https://auth.${STAGING_DOMAIN}/api/verify"
-DEFAULT_HEALTH_URLS[homepage]="https://${STAGING_DOMAIN}/"
-DEFAULT_HEALTH_URLS[ai]="https://hermes.${STAGING_DOMAIN}/health"
-DEFAULT_HEALTH_URLS[cloudstorage]="https://cloud.${STAGING_DOMAIN}/status.php"
-DEFAULT_HEALTH_URLS[versioncontrol]="https://code.${STAGING_DOMAIN}/api/healthz"
-
-# ---- Trackers ----
-PASS_COUNT=0
-FAIL_COUNT=0
-WARN_COUNT=0
-FAILURES=()
-
-# ---- Helpers ----
-
-log_info()  { echo -e "${CYAN}[INFO]${NC}  $*"; }
-log_pass()  { echo -e "${GREEN}[PASS]${NC}  $*"; ((PASS_COUNT++)); }
-log_fail()  { echo -e "${RED}[FAIL]${NC}  $*"; ((FAIL_COUNT++)); FAILURES+=("$*"); }
-log_warn()  { echo -e "${YELLOW}[WARN]${NC}  $*"; ((WARN_COUNT++)); }
-log_step()  { echo -e "\n${BOLD}── $* ──${NC}"; }
-log_raw()   { echo -e "         $*"; }
-
-# Check if a command exists
-require_cmd() {
-  if ! command -v "$1" &>/dev/null; then
-    log_fail "Required command not found: $1"
-    return 1
-  fi
-}
-
-# Retry a command with exponential-like backoff
-retry() {
-  local cmd="$*"
-  local attempt=0
-  local max_attempts=$((HEALTH_RETRIES + 1))
-  local result
-
-  while [[ $attempt -lt $max_attempts ]]; do
-    if eval "$cmd" 2>/dev/null; then
-      return 0
-    fi
-    attempt=$((attempt + 1))
-    if [[ $attempt -lt $max_attempts ]]; then
-      sleep "$HEALTH_INTERVAL"
-    fi
-  done
-  return 1
-}
-
-# ---- Parse arguments ----
-VERBOSE=false
-LIST_SERVICES=false
-POSITIONAL=()
-while [[ $# -gt 0 ]]; do
-  case "$1" in
-    --verbose|-v)  VERBOSE=true; shift ;;
-    --list-services) LIST_SERVICES=true; shift ;;
-    --) shift; POSITIONAL+=("$@"); break ;;
-    *) POSITIONAL+=("$1"); shift ;;
-  esac
-done
-set -- "${POSITIONAL[@]}"
-
-# Resolve service list
-if [[ -n "${SERVICE_LIST:-}" ]]; then
-  IFS=' ' read -ra SERVICES <<< "$SERVICE_LIST"
-else
-  SERVICES=("${DEFAULT_SERVICES[@]}")
-fi
-
-# Resolve health URLs — default map with overrides from env
-declare -A HEALTH_URLS
-if [[ -n "${HEALTH_URLS:-}" ]]; then
-  # User-supplied mapping: "network=https://... authentification=https://..."
-  for pair in $HEALTH_URLS; do
-    key="${pair%%=*}"
-    val="${pair#*=}"
-    HEALTH_URLS["$key"]="$val"
-  done
-else
-  for svc in "${SERVICES[@]}"; do
-    if [[ -n "${DEFAULT_HEALTH_URLS[$svc]:-}" ]]; then
-      HEALTH_URLS["$svc"]="${DEFAULT_HEALTH_URLS[$svc]}"
-    fi
-  done
-fi
-
-# --list-services mode (for CI integration)
-if $LIST_SERVICES; then
-  echo "Configured services:"
-  for svc in "${SERVICES[@]}"; do
-    url="${HEALTH_URLS[$svc]:-no-health-check}"
-    echo "  $svc -> $url"
-  done
-  exit 0
-fi
-
-# ---- Pre-flight ----
-echo -e "${BOLD}============================================${NC}"
-echo -e "${BOLD}  Staging VM Integration Test Suite${NC}"
-echo -e "${BOLD}  $(date -u '+%Y-%m-%dT%H:%M:%SZ')${NC}"
-echo -e "${BOLD}============================================${NC}"
-
-# ---- Phase 1: Prerequisites ----
-log_step "Phase 1: Prerequisites"
-
-PREREQ_OK=true
-for cmd in docker curl jq; do
-  if ! require_cmd "$cmd"; then
-    PREREQ_OK=false
-  fi
-done
-$PREREQ_OK && log_pass "All required commands available" || log_fail "Missing prerequisites"
-
-# ---- Phase 2: Docker daemon ----
-log_step "Phase 2: Docker Daemon"
-
-if docker info --format '{{.ServerVersion}}' &>/dev/null; then
-  DOCKER_VERSION=$(docker info --format '{{.ServerVersion}}' 2>/dev/null)
-  log_pass "Docker daemon is running (version: $DOCKER_VERSION)"
-
-  if docker info --format '{{.Driver}}' 2>/dev/null | grep -qi "overlay"; then
-    log_pass "Storage driver: overlay"
-  else
-    log_warn "Non-overlay storage driver detected"
-  fi
-else
-  log_fail "Docker daemon is NOT running or not accessible"
-fi
-
-# ---- Phase 3: Docker Compose stack ----
-log_step "Phase 3: Compose Stack Status"
-
-# Check if any compose files exist
-COMPOSE_FILES=()
-for svc in "${SERVICES[@]}"; do
-  cf="${COMPOSE_DIR}/${svc}/compose.yml"
-  if [[ -f "$cf" ]]; then
-    COMPOSE_FILES+=("$cf")
-  else
-    cf2="${COMPOSE_DIR}/${svc}/docker-compose.yml"
-    if [[ -f "$cf2" ]]; then
-      COMPOSE_FILES+=("$cf2")
-    else
-      log_warn "No compose file found for service '$svc' (expected: ${cf})"
-    fi
-  fi
-done
-
-if [[ ${#COMPOSE_FILES[@]} -eq 0 ]]; then
-  log_fail "No compose files found under COMPOSE_DIR=${COMPOSE_DIR}"
-  log_info "Skipping stack checks"
-else
-  log_info "Found ${#COMPOSE_FILES[@]} compose file(s) in ${COMPOSE_DIR}"
-
-  # Build the compose file args
-  COMPOSE_CMD="docker compose -p ${COMPOSE_PROJECT}"
-  for cf in "${COMPOSE_FILES[@]}"; do
-    COMPOSE_CMD+=" -f ${cf}"
-  done
-
-  log_info "Project name: ${COMPOSE_PROJECT}"
-
-  # Check stack ps
-  if $VERBOSE; then
-    log_raw "--- docker compose ps output ---"
-    eval "$COMPOSE_CMD ps" 2>&1 | while IFS= read -r line; do log_raw "$line"; done
-    log_raw "--- end ---"
-  fi
-
-  # Get all services and their status
-  if STACK_STATUS=$(eval "$COMPOSE_CMD ps --format '{{.Name}}\t{{.Status}}'" 2>/dev/null); then
-    if [[ -z "$STACK_STATUS" ]]; then
-      log_warn "Stack exists but no running services — VM may be freshly provisioned"
-    else
-      ALL_RUNNING=true
-      RUNNING_COUNT=0
-      TOTAL_COUNT=0
-      while IFS=$'\t' read -r name status; do
-        TOTAL_COUNT=$((TOTAL_COUNT + 1))
-        status_lower=$(echo "$status" | tr '[:upper:]' '[:lower:]')
-        if echo "$status_lower" | grep -qE '^(up|running|healthy)'; then
-          RUNNING_COUNT=$((RUNNING_COUNT + 1))
-          $VERBOSE && log_pass "  $name — $status"
-        else
-          ALL_RUNNING=false
-          log_warn "  $name — $status (not healthy)"
-        fi
-      done <<< "$STACK_STATUS"
-
-      if [[ "$TOTAL_COUNT" -eq 0 ]]; then
-        log_fail "No services found in compose project"
-      elif $ALL_RUNNING && [[ "$TOTAL_COUNT" -eq "$RUNNING_COUNT" ]]; then
-        log_pass "All ${TOTAL_COUNT} service(s) running (${RUNNING_COUNT}/${TOTAL_COUNT})"
-      else
-        log_fail "${RUNNING_COUNT}/${TOTAL_COUNT} service(s) running — some services are down"
-      fi
-    fi
-  else
-    log_fail "Failed to query compose stack status"
-  fi
-fi
-
-# ---- Phase 4: Service health checks ----
-log_step "Phase 4: Service Endpoint Health Checks"
-
-ENDPOINT_CHECKS=0
-ENDPOINT_PASS=0
-
-for svc in "${SERVICES[@]}"; do
-  url="${HEALTH_URLS[$svc]:-}"
-  if [[ -z "$url" ]]; then
-    $VERBOSE && log_info "No health check URL for service '$svc' — skipping"
-    continue
-  fi
-
-  ENDPOINT_CHECKS=$((ENDPOINT_CHECKS + 1))
-  echo -ne "  Checking ${svc} ... "
-
-  # Perform the HTTP health check with retries
-  if retry "curl -sf -o /dev/null -w '%{http_code}' --max-time ${HEALTH_TIMEOUT} '${url}' 2>/dev/null"; then
-    HTTP_CODE=$(curl -sf -o /dev/null -w '%{http_code}' --max-time "${HEALTH_TIMEOUT}" "${url}" 2>/dev/null || true)
-    ENDPOINT_PASS=$((ENDPOINT_PASS + 1))
-    echo -e "${GREEN}OK${NC} (HTTP ${HTTP_CODE})"
-  else
-    LAST_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time "${HEALTH_TIMEOUT}" "${url}" 2>/dev/null || echo "000")
-    echo -e "${RED}FAIL${NC} (HTTP ${LAST_CODE})"
-    log_fail "Health check failed for ${svc} @ ${url}"
-  fi
-done
-
-if [[ $ENDPOINT_CHECKS -eq 0 ]]; then
-  log_warn "No health check URLs configured — skipping endpoint phase"
-elif [[ $ENDPOINT_PASS -eq $ENDPOINT_CHECKS ]]; then
-  log_pass "All ${ENDPOINT_CHECKS} endpoint(s) healthy"
-else
-  log_fail "${ENDPOINT_PASS}/${ENDPOINT_CHECKS} endpoint(s) healthy"
-fi
-
-# ---- Phase 5: Docker system sanity ----
-log_step "Phase 5: Docker System Sanity"
-
-# Check disk space for Docker
-DOCKER_ROOT=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker")
-log_info "Docker root: ${DOCKER_ROOT}"
-
-if command -v df &>/dev/null && [[ -d "$DOCKER_ROOT" ]]; then
-  AVAIL_PCT=$(df -h "$DOCKER_ROOT" | awk 'NR==2 {print $5}' | tr -d '%')
-  if [[ -n "$AVAIL_PCT" ]]; then
-    if [[ "$AVAIL_PCT" -ge 90 ]]; then
-      log_warn "Docker storage is ${AVAIL_PCT}% full — consider cleanup"
-    else
-      log_pass "Docker storage at ${AVAIL_PCT}% — within limits"
-    fi
-  fi
-fi
-
-# Check for dangling images
-DANGLING=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l)
-if [[ "$DANGLING" -gt 10 ]]; then
-  log_warn "${DANGLING} dangling images found — consider docker image prune"
-fi
-
-# ---- Summary ----
-echo ""
-echo -e "${BOLD}============================================${NC}"
-echo -e "${BOLD}  Test Summary${NC}"
-echo -e "${BOLD}  $(date -u '+%Y-%m-%dT%H:%M:%SZ')${NC}"
-echo -e "${BOLD}============================================${NC}"
-echo -e "  ${GREEN}Passed:${NC}  ${PASS_COUNT}"
-echo -e "  ${RED}Failed:${NC}  ${FAIL_COUNT}"
-echo -e "  ${YELLOW}Warnings:${NC} ${WARN_COUNT}"
-
-if [[ ${#FAILURES[@]} -gt 0 ]]; then
-  echo -e "\n${BOLD}Failed checks:${NC}"
-  for f in "${FAILURES[@]}"; do
-    echo -e "  ${RED}•${NC} $f"
-  done
-fi
-
-echo ""
-if [[ $FAIL_COUNT -eq 0 ]]; then
-  echo -e "${GREEN}${BOLD}✓ All integration checks passed${NC}"
-  exit 0
-else
-  echo -e "${RED}${BOLD}✗ ${FAIL_COUNT} integration check(s) failed${NC}"
-  exit 1
-fi
--- a/users/ai-worker.nix
+++ b/users/ai-worker.nix
@@ -4,7 +4,7 @@
    group = "ai-worker";
    home = "/home/ai-worker";
    createHome = true;
-    extraGroups = [ "docker" "libvirtd" ];
+    extraGroups = [ "docker" ];
    shell = pkgs.bashInteractive;
    openssh.authorizedKeys.keys = [
      keys.users.ai-worker.main
Author	SHA1	Message	Date
Hermes	aa4a3f5b7c	feat: integrate rollback sentinel as NixOS module Add rollback-sentinel NixOS module that: - Deploys sentinel-check.sh (inline) and nixos-rollback.sh (from file) as system packages - Runs a boot-time systemd oneshot service after multi-user.target with configurable delay — checks Tier-1 services, triggers rollback on failure - Runs a post-rebuild service via activation script after every nixos-rebuild switch - Exposes options for tier1Services, tier2Services, tier3InfoServices, bootDelay, rollbackMode (set-default/rollback-now/dry-run), and enablePostRebuild Module wired into flake.nix for lazyworkhorse and enabled in configuration.nix with standard Tier-1/2 service lists and 120s delay.	2026-05-25 00:09:20 -04:00
Thierry Pouplier	36359de6aa	Merge pull request 'feat: add Syncthing firewall port and update compose submodule' (#47 ) from feat/syncthing-org-sync into master Reviewed-on: #47	2026-05-19 00:34:42 +00:00
Robert	10b8565fd6	Merge branch 'master' into feat/syncthing-org-sync	2026-05-18 20:33:29 -04:00
Robert	f672696b8e	Update submodule for syncthing	2026-05-18 20:31:07 -04:00
Hermes	0980dca455	fix: update compose submodule to Traefik-routed Syncthing	2026-05-14 21:40:12 -04:00
Hermes	96bc20ab70	feat: add Syncthing firewall port and update compose submodule	2026-05-14 21:36:26 -04:00
Thierry Pouplier	670ae4f002	Merge pull request 'fix: update compose submodule — use ln -sf for iptables-nft' (#46 ) from fix/vpn-iptables-nft-v3 into master Reviewed-on: #46	2026-05-13 17:00:16 +00:00
Hermes	f785abfd49	fix: update compose submodule — use ln -sf for iptables-nft	2026-05-13 12:59:04 -04:00
Thierry Pouplier	6f44aa7f76	Merge pull request 'fix: update compose submodule — remove apk add iptables-nft' (#45 ) from fix/vpn-iptables-nft-v2 into master Reviewed-on: #45	2026-05-13 16:49:39 +00:00
Hermes	8d40f1691f	fix: update compose submodule — remove apk add iptables-nft	2026-05-13 12:49:14 -04:00
Robert	2dd2e64986	Merge remote-tracking branch 'origin/master'	2026-05-13 12:42:54 -04:00
Robert	23fc5e0597	Give a little more ssh room for tramp	2026-05-13 12:41:09 -04:00
Thierry Pouplier	0c9c33d735	Merge pull request 'fix: update wg-easy to official ghcr image with iptables-nft' (#44 ) from fix/vpn-iptables-nft-upstream into master Reviewed-on: #44	2026-05-13 16:39:56 +00:00
Hermes	0bb6890f1c	chore: merge master into branch	2026-05-13 12:39:05 -04:00
Hermes	9d5434425f	fix: update compose submodule for wg-easy iptables-nft fix Updates the assets/compose submodule to point to the fix/vpn-iptables-nft-upstream branch which contains: - Switch FROM weejewel/wg-easy:latest (Alpine 3.11, stale 4yr) to ghcr.io/wg-easy/wg-easy:latest (actively maintained, Alpine krypton) - Use update-alternatives instead of raw ln -sf to flip iptables from legacy to nftables backend - Fix compose build context: ./vpn -> . (Dockerfile is at same level)	2026-05-13 12:30:47 -04:00
Thierry Pouplier	1fb4320dd1	Merge pull request 'feat: update compose submodule for custom tools startup' (#43 ) from feat/update-compose-submodule-custom-tools into master Reviewed-on: #43	2026-05-13 13:58:27 +00:00
Hermes	51e9f47fd4	feat: update compose submodule for custom tools startup	2026-05-13 09:56:24 -04:00
Hermes	06b3eb840f	fix: update compose submodule for wg-easy iptables-nft fix	2026-05-12 16:29:51 -04:00