diff --git a/flake.nix b/flake.nix
index 8f51b77..b7b18db 100644
--- a/flake.nix
+++ b/flake.nix
@@ -69,6 +69,7 @@
         ./modules/nixos/services/open_code_server.nix
         ./modules/nixos/services/ollama_init_custom_models.nix
         ./modules/nixos/services/openclaw_node.nix
+        ./modules/nixos/services/rollback-sentinel.nix
         ./modules/nixos/security/ai-worker-restricted.nix
         ./users/gortium.nix
         ./users/ai-worker.nix
diff --git a/hosts/lazyworkhorse/configuration.nix b/hosts/lazyworkhorse/configuration.nix
index f1afae4..157bc8e 100644
--- a/hosts/lazyworkhorse/configuration.nix
+++ b/hosts/lazyworkhorse/configuration.nix
@@ -321,10 +321,40 @@
 
   environment.etc."ssh/ssh_host_ed25519_key.pub".text = "${keys.hosts.lazyworkhorse.main}";
 
+  # ── Boot sentinel: auto-rollback on critical service failure ───────────────
+  services.rollbackSentinel.enable = true;
+  # Tier-1: failure triggers rollback
+  services.rollbackSentinel.tier1Services = [
+    "sshd" "docker" "traefik" "authelia"
+  ];
+  # Tier-2: warn only
+  services.rollbackSentinel.tier2Services = [
+    "gitea" "hermes" "ollama" "synapse" "nextcloud"
+    "vaultwarden" "wireguard" "homeassistant" "fail2ban"
+  ];
+  # Wait 2 minutes before checking (lets services initialize); value is in seconds
+  services.rollbackSentinel.bootDelay = "120";
+  # Only change the boot default (no immediate nixos-rebuild --rollback) for safety
+  services.rollbackSentinel.rollbackMode = "set-default";
+
   services.fstrim.enable = true;
 
   services.zfs.autoSnapshot.enable = true;
   services.zfs.autoScrub.enable = true;
+
+  # Set com.sun:auto-snapshot on the pool root; `zfs set` has no -r, child datasets inherit it
+  systemd.services."zfs-set-auto-snapshot" = {
+    description = "Set com.sun:auto-snapshot=true on ZFS datasets";
+    after = [ "zfs-import.target" ];
+    wants = [ "zfs-import.target" ];
+    wantedBy = [ "multi-user.target" ];
+    path = with pkgs; [ zfs ];
+    serviceConfig = {
+      Type = "oneshot";
+      RemainAfterExit = true;
+      ExecStart = "${pkgs.zfs}/bin/zfs set com.sun:auto-snapshot=true rpool";
+    };
+  };
 
   # Mi50 config
   hardware.graphics = {
diff --git a/modules/nixos/services/nixos-rollback.sh b/modules/nixos/services/nixos-rollback.sh
new file mode 100755
index 0000000..2482efe
--- /dev/null
+++ b/modules/nixos/services/nixos-rollback.sh
@@ -0,0 +1,400 @@
+#!/usr/bin/env bash
+# =============================================================================
+# nixos-rollback.sh — NixOS systemd-boot Rollback Script
+#
+# Detects a failed NixOS generation (critical services not starting) and sets
+# the previous generation as the default boot option for systemd-boot.
+# Logs all actions to syslog/journald and a local logfile. Fails safely when
+# no previous generation exists or required files are missing.
+#
+# Integration with the boot sentinel:
+#   sentinel-check.sh  → detects Tier-1 service failures (sshd, docker,
+#                        traefik, authelia) after a boot
+#   nixos-rollback.sh  ← called when sentinel exits nonzero; sets previous
+#                        generation as default for next boot
+#
+# Usage:
+#   nixos-rollback.sh                  # auto-detect & set previous gen
+#   nixos-rollback.sh --dry-run        # show what would be done
+#   nixos-rollback.sh --rollback-now   # also run nixos-rebuild switch
+#                                      # --rollback for immediate fix
+#   nixos-rollback.sh --help           # full help text
+#
+# Exit codes:
+#   0 — rollback applied (or dry-run would apply)
+#   1 — preflight failure (missing files, permissions)
+#   2 — no previous generation available
+#   3 — nixos-rebuild --rollback failed (only with --rollback-now)
+#
+# Installation on NixOS:
+#   Packaged by rollback-sentinel.nix (pkgs.writeShellScriptBin), which puts
+#   it on PATH and wires it into the nixos-sentinel systemd units. It can
+#   also be invoked manually as root.
+# =============================================================================
+
+set -euo pipefail
+
+# ── Configuration ────────────────────────────────────────────────────────────
+# These can be overridden via environment variables for testing.
+LOADER_CONF="${NIXOS_ROLLBACK_LOADER_CONF:-/boot/loader/loader.conf}" +ENTRIES_DIR="${NIXOS_ROLLBACK_ENTRIES_DIR:-/boot/loader/entries}" +LOGFILE="${NIXOS_ROLLBACK_LOGFILE:-/var/log/nixos-rollback.log}" +SYSLOG_IDENT="nixos-rollback" + +# ── CLI flags ──────────────────────────────────────────────────────────────── +DRY_RUN=false +ROLLBACK_NOW=false + +# ── Colors (disabled when not a terminal) ──────────────────────────────────── +if [ -t 1 ]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[1;33m' + CYAN='\033[0;36m' + NC='\033[0m' # No Color +else + RED=''; GREEN=''; YELLOW=''; CYAN=''; NC='' +fi + +# ============================================================================= +# Help +# ============================================================================= +usage() { + cat <> "${LOGFILE}" + logger -t "${SYSLOG_IDENT}" -p "user.${level}" "${msg}" + + # Also print to stderr for ERROR/WARN, stdout for INFO + case "${level}" in + ERROR) echo >&2 "${RED}[ERROR]${NC} ${msg}" ;; + WARN) echo >&2 "${YELLOW}[WARN]${NC} ${msg}" ;; + INFO) echo " ${GREEN}[INFO]${NC} ${msg}" ;; + esac +} + +info() { log "INFO" "$@"; } +warn() { log "WARN" "$@"; } +error() { log "ERROR" "$@"; } + +# ============================================================================= +# Preflight checks +# ============================================================================= +preflight() { + # Must run as root (need to write to /boot), unless overridden for testing + if [ -z "${NIXOS_ROLLBACK_SKIP_ROOT_CHECK:-}" ] && [ "$(id -u)" -ne 0 ]; then + error "This script must be run as root (needs write access to /boot/loader)" + error "Set NIXOS_ROLLBACK_SKIP_ROOT_CHECK=1 for testing against mock paths." + exit 1 + fi + + # Directories and files + if [ ! -d "${ENTRIES_DIR}" ]; then + error "Boot entries directory not found: ${ENTRIES_DIR}" + exit 1 + fi + + if [ ! -f "${LOADER_CONF}" ]; then + error "Loader config not found: ${LOADER_CONF}" + exit 1 + fi + + if [ ! 
-r "${LOADER_CONF}" ]; then + error "Cannot read loader config: ${LOADER_CONF}" + exit 1 + fi + + # Check write access to /boot/loader (parent of loader.conf) + local loader_dir + loader_dir="$(dirname "${LOADER_CONF}")" + if [ ! -w "${loader_dir}" ]; then + error "Cannot write to ${loader_dir} (insufficient permissions)" + exit 1 + fi + + # Logfile directory must exist + local log_dir + log_dir="$(dirname "${LOGFILE}")" + if [ ! -d "${log_dir}" ]; then + warn "Log directory ${log_dir} does not exist, creating it" + mkdir -p "${log_dir}" 2>/dev/null || { + error "Cannot create log directory ${log_dir}" + exit 1 + } + fi + + # Check --rollback-now dependencies + if [ "${ROLLBACK_NOW}" = true ]; then + if ! command -v nixos-rebuild &>/dev/null; then + error "nixos-rebuild not found on PATH (required for --rollback-now)" + exit 1 + fi + fi +} + +# ============================================================================= +# Generation helpers +# ============================================================================= + +# get_current_default: reads the current default entry from loader.conf +# Returns: "nixos-generation-N.conf" or empty string +get_current_default() { + grep -E '^default\s+' "${LOADER_CONF}" 2>/dev/null \ + | awk '{print $2}' \ + || true +} + +# extract_gen_number: extracts the numeric generation from a conf filename +# Input: "nixos-generation-367.conf" +# Output: 367 +extract_gen_number() { + echo "$1" | sed 's/nixos-generation-//;s/\.conf//' +} + +# get_all_gen_numbers: returns sorted list of generation numbers from entries dir +get_all_gen_numbers() { + local -a gens=() + local f n + for f in "${ENTRIES_DIR}"/nixos-generation-*.conf; do + [ -f "${f}" ] || continue + n="$(basename "${f}" | sed 's/nixos-generation-//;s/\.conf//')" + gens+=("${n}") + done + + if [ "${#gens[@]}" -eq 0 ]; then + return 1 + fi + + # Sort numerically and output + printf '%s\n' "${gens[@]}" | sort -n +} + +# get_previous_gen: given current generation number, find 
the previous one +# from the list of all available generations +get_previous_gen() { + local current="$1" + shift + local -a gens=("$@") + + local prev="" + local g + for g in "${gens[@]}"; do + if [ "${g}" -lt "${current}" ]; then + prev="${g}" + fi + done + + if [ -z "${prev}" ]; then + return 1 + fi + echo "${prev}" +} + +# ============================================================================= +# Main rollback logic +# ============================================================================= +do_rollback() { + # Step 1: Read current default + local current_entry + current_entry="$(get_current_default)" + + if [ -z "${current_entry}" ]; then + error "No 'default' entry found in ${LOADER_CONF}" + error "Cannot determine current generation — aborting" + exit 1 + fi + + info "Current default boot entry: ${current_entry}" + + # Step 2: Build sorted list of all available generations + local -a all_gens=() + local line + while IFS= read -r line; do + all_gens+=("${line}") + done < <(get_all_gen_numbers || true) + + if [ "${#all_gens[@]}" -eq 0 ]; then + error "No NixOS generation .conf files found in ${ENTRIES_DIR}" + exit 1 + fi + + info "Available generations: ${all_gens[*]}" + + # Step 3: Find current generation number + local current_gen + current_gen="$(extract_gen_number "${current_entry}")" + + # Verify current_gen is a valid number + if ! [[ "${current_gen}" =~ ^[0-9]+$ ]]; then + error "Could not parse generation number from '${current_entry}'" + exit 1 + fi + + # Step 4: Find the previous generation + local prev_gen + prev_gen="$(get_previous_gen "${current_gen}" "${all_gens[@]}")" || { + error "No previous generation found before generation ${current_gen}" + error "This is the oldest available generation — cannot roll back further" + exit 2 + } + + local prev_entry="nixos-generation-${prev_gen}.conf" + local prev_conf_path="${ENTRIES_DIR}/${prev_entry}" + + if [ ! 
-f "${prev_conf_path}" ]; then + error "Previous generation entry not found: ${prev_conf_path}" + error "The .conf file for generation ${prev_gen} is missing — cannot roll back" + exit 1 + fi + + info "Target rollback generation: ${prev_gen} → ${prev_entry}" + + # Step 5: Apply the rollback + if [ "${DRY_RUN}" = true ]; then + echo "" + echo " ${CYAN}[DRY RUN]${NC} Would change ${LOADER_CONF}:" + echo " ${YELLOW}-${NC} default ${current_entry}" + echo " ${GREEN}+${NC} default ${prev_entry}" + echo "" + info "DRY RUN — no changes made" + exit 0 + fi + + # Write new default + # Use sed with a backup (.bak) + sed -i.bak "s/^default\s\+${current_entry}/default ${prev_entry}/" "${LOADER_CONF}" + + # Verify the change was applied + local new_default + new_default="$(get_current_default)" + if [ "${new_default}" != "${prev_entry}" ]; then + error "Failed to set default boot entry to ${prev_entry}" + error "Current default is still: ${new_default}" + # Attempt to restore backup + if [ -f "${LOADER_CONF}.bak" ]; then + cp "${LOADER_CONF}.bak" "${LOADER_CONF}" + info "Restored backup from ${LOADER_CONF}.bak" + fi + exit 1 + fi + + info "Successfully set default boot entry to ${prev_entry} (generation ${prev_gen})" + info "Backup of previous config saved to ${LOADER_CONF}.bak" + + # Step 6: Optionally run nixos-rebuild switch --rollback + if [ "${ROLLBACK_NOW}" = true ]; then + echo "" + info "Running nixos-rebuild switch --rollback for immediate effect..." + if nixos-rebuild switch --rollback 2>&1 | while IFS= read -r line; do + logger -t "${SYSLOG_IDENT}" "nixos-rebuild: ${line}" + echo " ${line}" + done; then + info "nixos-rebuild switch --rollback completed successfully" + else + local rc=$? + error "nixos-rebuild switch --rollback failed with exit code ${rc}" + error "The boot default has been changed but the current system was NOT rolled back" + error "Reboot to apply the rollback" + exit 3 + fi + fi + + info "Rollback complete. 
Next boot will use generation ${prev_gen}." + if [ "${ROLLBACK_NOW}" = false ]; then + echo "" + echo " ${YELLOW}NOTE:${NC} The current running system is unchanged." + echo " Reboot to boot into generation ${prev_gen}." + echo " Or re-run with --rollback-now for immediate effect." + fi +} + +# ============================================================================= +# Main +# ============================================================================= +main() { + # Parse arguments + while [ $# -gt 0 ]; do + case "$1" in + --dry-run) + DRY_RUN=true + shift + ;; + --rollback-now) + ROLLBACK_NOW=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo >&2 "Unknown option: $1" + echo >&2 "Use --help for usage information." + exit 1 + ;; + esac + done + + echo "" + echo " ${CYAN}═══ NixOS systemd-boot Rollback ═══${NC}" + echo "" + + preflight + + if [ "${DRY_RUN}" = true ]; then + info "DRY RUN mode — no changes will be made" + fi + if [ "${ROLLBACK_NOW}" = true ]; then + info "ROLLBACK NOW mode — will also run nixos-rebuild switch --rollback" + fi + + echo "" + do_rollback +} + +main "$@" diff --git a/modules/nixos/services/rollback-sentinel.nix b/modules/nixos/services/rollback-sentinel.nix new file mode 100644 index 0000000..0164a2e --- /dev/null +++ b/modules/nixos/services/rollback-sentinel.nix @@ -0,0 +1,184 @@ +{ config, pkgs, lib, ... }: + +with lib; + +let + cfg = config.services.rollbackSentinel; + + # ── Scripts ──────────────────────────────────────────────────────────────── + + # Sentinel check — verifies Tier-1 services are active after boot. + # Exits nonzero when any Tier-1 service is down, which triggers the rollback. 
+  # sentinelCheck: shell script that reports the state of the configured
+  # services with `systemctl is-active`.
+  #   Tier-1: any inactive service sets FAILED=1 (script exits 1).
+  #   Tier-2: inactive services are reported as warnings only.
+  #   Tier-3: the current state is printed for diagnostics only.
+  # Exit status: 0 when every Tier-1 service is active, 1 otherwise.
+  # writeShellScriptBin supplies the shebang, so none is embedded here.
+  sentinelCheck = pkgs.writeShellScriptBin "sentinel-check.sh" ''
+    set -euo pipefail
+
+    SYSLOG_IDENT="nixos-sentinel"
+    LOGFILE="/var/log/nixos-sentinel.log"
+
+    echo "=== NixOS Sentinel Check ==="
+    echo ${lib.escapeShellArg "Tier-1 services: ${builtins.toString cfg.tier1Services}"}
+    echo ${lib.escapeShellArg "Tier-2 services: ${builtins.toString cfg.tier2Services}"}
+
+    FAILED=0
+
+    # Check Tier-1 services — any failure means rollback.
+    # Logging is best-effort (|| true): under `set -e` a missing or failing
+    # logger must not abort the remaining checks.
+    for svc in ${lib.escapeShellArgs cfg.tier1Services}; do
+      if systemctl is-active --quiet "$svc" 2>/dev/null; then
+        echo "  [OK]   Tier-1: $svc"
+      else
+        echo "  [FAIL] Tier-1: $svc is NOT active"
+        logger -t "$SYSLOG_IDENT" -p user.err "Tier-1 FAILURE: $svc is not active" || true
+        FAILED=1
+      fi
+    done
+
+    # Check Tier-2 services — warn only
+    for svc in ${lib.escapeShellArgs cfg.tier2Services}; do
+      if systemctl is-active --quiet "$svc" 2>/dev/null; then
+        echo "  [OK]   Tier-2: $svc"
+      else
+        echo "  [WARN] Tier-2: $svc is NOT active"
+        logger -t "$SYSLOG_IDENT" -p user.warn "Tier-2 WARNING: $svc is not active" || true
+      fi
+    done
+
+    # Tier-3 services — informational only, never affects the result
+    for svc in ${lib.escapeShellArgs cfg.tier3InfoServices}; do
+      state="$(systemctl is-active "$svc" 2>/dev/null || true)"
+      echo "  [INFO] Tier-3: $svc is $state"
+    done
+
+    if [ "$FAILED" -eq 0 ]; then
+      RESULT="PASS"
+    else
+      RESULT="FAIL"
+    fi
+
+    echo "=== Sentinel result: $RESULT ==="
+    # A logfile that cannot be written must not turn a PASS into a failure.
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] sentinel $RESULT" >> "$LOGFILE" \
+      || echo "warning: could not append to $LOGFILE" >&2
+    exit "$FAILED"
+  '';
+
+  # Rollback script — package the companion shell script from this directory.
+  # Uses builtins.readFile to embed the content at evaluation time.
+  rollbackScript = pkgs.writeShellScriptBin "nixos-rollback.sh" (builtins.readFile ./nixos-rollback.sh);
+
+  # Resolve rollback flags from config ("set-default" passes no flag)
+  rollbackFlags =
+    if cfg.rollbackMode == "dry-run" then "--dry-run"
+    else if cfg.rollbackMode == "rollback-now" then "--rollback-now"
+    else "";
+
+  # Runs the sentinel check and invokes the rollback script ONLY when the
+  # check fails. This cannot be expressed as ExecStart= + ExecStartPost=:
+  # systemd runs ExecStartPost= only after ExecStart= succeeded, which would
+  # roll back on every healthy boot and never on an unhealthy one.
+  # Exit status: 0 when the check passes, 1 when it fails (so the unit shows
+  # up as failed), regardless of how the rollback script itself exited.
+  sentinelRun = pkgs.writeShellScript "nixos-sentinel-run" ''
+    if ${sentinelCheck}/bin/sentinel-check.sh; then
+      exit 0
+    fi
+    echo "Tier-1 check failed; running nixos-rollback.sh ${rollbackFlags}" >&2
+    ${rollbackScript}/bin/nixos-rollback.sh ${rollbackFlags} \
+      || echo "nixos-rollback.sh exited with status $?" >&2
+    exit 1
+  '';
+
+  # Tools the two scripts call. util-linux provides `logger`; without it the
+  # scripts abort under `set -e` at their first log call.
+  # NOTE(review): rollbackMode = "rollback-now" additionally needs
+  # nixos-rebuild on the unit PATH; nothing here provides it — confirm.
+  sentinelPath = with pkgs; [ coreutils gawk gnugrep gnused systemd util-linux ];
+
+in {
+  options.services.rollbackSentinel = {
+    enable = mkEnableOption "NixOS Rollback Sentinel — auto-rollback on critical service failure";
+
+    tier1Services = mkOption {
+      type = types.listOf types.str;
+      default = [ "sshd" "docker" "traefik" "authelia" ];
+      description = ''
+        Tier-1 services whose failure triggers an automatic systemd-boot rollback.
+        On boot, the sentinel waits `bootDelay` seconds, then checks each
+        service. If ANY service in this list is inactive, it runs the rollback
+        script which sets the previous NixOS generation as the default boot entry.
+      '';
+    };
+
+    tier2Services = mkOption {
+      type = types.listOf types.str;
+      default = [
+        "gitea" "hermes" "ollama" "synapse" "nextcloud"
+        "vaultwarden" "wireguard" "homeassistant" "fail2ban"
+      ];
+      description = ''
+        Tier-2 services whose failure is logged as a warning but does NOT trigger
+        an automatic rollback. Useful for detecting non-critical service issues.
+      '';
+    };
+
+    tier3InfoServices = mkOption {
+      type = types.listOf types.str;
+      default = [
+        "act_runner" "syncthing" "restic" "fava"
+        "homer" "cups" "fstrim"
+      ];
+      description = ''
+        Tier-3 informational checks (log-only, no warning). These are services
+        that the sentinel will note the status of for diagnostics.
+      '';
+    };
+
+    bootDelay = mkOption {
+      type = types.str;
+      default = "120";
+      description = ''
+        Seconds the sentinel units sleep after being started before they run
+        the check. This gives Tier-1 services time to start (or restart)
+        before the sentinel decides they've failed.
+      '';
+    };
+
+    rollbackMode = mkOption {
+      type = types.enum [ "set-default" "rollback-now" "dry-run" ];
+      default = "set-default";
+      description = ''
+        Rollback strategy when Tier-1 failures are detected:
+          - set-default:  Write the previous generation to loader.conf (next reboot).
+          - rollback-now: Also run nixos-rebuild switch --rollback for immediate fix.
+          - dry-run:      Log what would happen but take no action (testing).
+      '';
+    };
+
+    enablePostRebuild = mkOption {
+      type = types.bool;
+      default = true;
+      description = ''
+        When enabled, the sentinel check runs after every nixos-rebuild switch
+        activation, once `bootDelay` seconds have passed. If a newly deployed
+        generation has Tier-1 failures, it triggers the rollback.
+      '';
+    };
+  };
+
+  config = mkIf cfg.enable {
+    # ── Deploy scripts to PATH ───────────────────────────────────────────────
+    environment.systemPackages = [ sentinelCheck rollbackScript ];
+
+    # Ensure log directory exists
+    # NOTE(review): sentinel-check.sh logs to /var/log/nixos-sentinel.log and
+    # nixos-rollback.sh defaults to /var/log/nixos-rollback.log; nothing in
+    # this change writes into this directory — confirm it is still wanted.
+    systemd.tmpfiles.rules = [
+      "d /var/log/nixos-sentinel 0755 root root -"
+    ];
+
+    # ── Boot-time sentinel service ───────────────────────────────────────────
+    # Started as part of multi-user.target, sleeps for the configured delay,
+    # checks Tier-1 services, and triggers rollback if any are down.
+    # It must NOT be ordered After=multi-user.target: the target is ordered
+    # after the units it wants, so that would form an ordering cycle which
+    # systemd resolves by dropping a job. The sleep provides the delay instead.
+    systemd.services.nixos-sentinel = {
+      description = "NixOS Boot Sentinel — check critical services, roll back on failure";
+      after = [ "network.target" ];
+      wants = [ "network.target" ];
+      wantedBy = [ "multi-user.target" ];
+
+      path = sentinelPath;
+
+      serviceConfig = {
+        Type = "oneshot";
+        RemainAfterExit = true;
+        ExecStartPre = "${pkgs.coreutils}/bin/sleep ${cfg.bootDelay}";
+        ExecStart = "${sentinelRun}";
+      };
+    };
+
+    # ── Post-rebuild sentinel service (triggered by activation script) ──────
+    # Sleeps for the same delay as the boot-time unit: the activation script
+    # starts it while the switch is still in progress, so checking at once
+    # could see Tier-1 services mid-restart and roll back a healthy system.
+    systemd.services.nixos-sentinel-rebuild = mkIf cfg.enablePostRebuild {
+      description = "NixOS Post-Rebuild Sentinel — check services after nixos-rebuild";
+      after = [ "network.target" ];
+
+      path = sentinelPath;
+
+      serviceConfig = {
+        Type = "oneshot";
+        ExecStartPre = "${pkgs.coreutils}/bin/sleep ${cfg.bootDelay}";
+        ExecStart = "${sentinelRun}";
+      };
+    };
+
+    # Activation script — fires on every activation (boot + nixos-rebuild)
+    system.activationScripts.rollback-sentinel = mkIf cfg.enablePostRebuild ''
+      # Queue the post-rebuild sentinel without waiting for it.
+      # systemctl is called by absolute path because it is not on the
+      # activation-script PATH; a bare `systemctl` fails and the error below
+      # would hide that. Failure to queue the unit is deliberately ignored so
+      # activation never breaks because of the sentinel.
+      # The rollback is NOT idempotent: every run of nixos-rollback.sh moves
+      # the boot default back one more generation, so the check must not be
+      # triggered more often than intended.
+      ${config.systemd.package}/bin/systemctl start nixos-sentinel-rebuild.service --no-block 2>/dev/null || true
+    '';
+  };
+}