# NixOS module: services.rollbackSentinel
#
# Checks a tiered list of systemd services after boot (and optionally after
# every activation). When any Tier-1 service is inactive, the companion
# nixos-rollback.sh script is invoked with a flag derived from `rollbackMode`.
{ config, pkgs, lib, ... }:

with lib;

let
  cfg = config.services.rollbackSentinel;

  # ── Scripts ────────────────────────────────────────────────────────────────

  # Sentinel check — verifies Tier-1 services are active.
  # Exits nonzero when any Tier-1 service is down; the wrapper below turns
  # that exit status into a rollback. Tier-2 failures only warn, Tier-3
  # services are merely reported.
  #
  # writeShellScriptBin already emits a shebang, so none is repeated here.
  sentinelCheck = pkgs.writeShellScriptBin "sentinel-check.sh" ''
    set -euo pipefail

    SYSLOG_IDENT="nixos-sentinel"
    LOGFILE="/var/log/nixos-sentinel.log"
    # Absolute path: util-linux is not part of the unit's `path`, so a bare
    # `logger` would not be found when run from the systemd services.
    LOGGER="${getBin pkgs.util-linux}/bin/logger"

    # Syslog delivery is best-effort. Under `set -e` a failing logger would
    # otherwise abort the script and be mistaken for a Tier-1 failure.
    log_syslog() {
      "$LOGGER" -t "$SYSLOG_IDENT" -p "$1" "$2" || true
    }

    echo "=== NixOS Sentinel Check ==="
    echo "Tier-1 services: ${builtins.toString cfg.tier1Services}"
    echo "Tier-2 services: ${builtins.toString cfg.tier2Services}"
    echo "Tier-3 services: ${builtins.toString cfg.tier3InfoServices}"

    FAILED=0

    # Check Tier-1 services — any failure means rollback
    for svc in ${builtins.toString cfg.tier1Services}; do
      if systemctl is-active --quiet "$svc" 2>/dev/null; then
        echo " [OK] Tier-1: $svc"
      else
        echo " [FAIL] Tier-1: $svc is NOT active"
        log_syslog user.err "Tier-1 FAILURE: $svc is not active"
        FAILED=1
      fi
    done

    # Check Tier-2 services — warn only
    for svc in ${builtins.toString cfg.tier2Services}; do
      if systemctl is-active --quiet "$svc" 2>/dev/null; then
        echo " [OK] Tier-2: $svc"
      else
        echo " [WARN] Tier-2: $svc is NOT active"
        log_syslog user.warn "Tier-2 WARNING: $svc is not active"
      fi
    done

    # Check Tier-3 services — informational only, no syslog warning
    for svc in ${builtins.toString cfg.tier3InfoServices}; do
      if systemctl is-active --quiet "$svc" 2>/dev/null; then
        echo " [OK] Tier-3: $svc"
      else
        echo " [INFO] Tier-3: $svc is not active"
      fi
    done

    echo "=== Sentinel result: $([ "$FAILED" -eq 0 ] && echo 'PASS' || echo 'FAIL') ==="
    echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] sentinel $([ "$FAILED" -eq 0 ] && echo 'PASS' || echo 'FAIL')" >> "$LOGFILE"

    exit $FAILED
  '';

  # Rollback script — package the companion shell script from this directory.
  # Uses builtins.readFile to embed the content at evaluation time.
  rollbackScript = pkgs.writeShellScriptBin "nixos-rollback.sh"
    (builtins.readFile ./nixos-rollback.sh);

  # Resolve rollback flags from config ("set-default" maps to no flag).
  rollbackFlags =
    if cfg.rollbackMode == "dry-run" then "--dry-run"
    else if cfg.rollbackMode == "rollback-now" then "--rollback-now"
    else "";

  # Check-then-rollback wrapper used as ExecStart by both units.
  #
  # The rollback must run when the check FAILS. systemd only runs
  # ExecStartPost= after ExecStart= succeeded, so wiring the rollback script
  # there would run it on a healthy system and skip it on a broken one.
  # The wrapper still exits nonzero after rolling back so the unit is
  # reported as failed.
  sentinelRun = pkgs.writeShellScript "nixos-sentinel-run" ''
    if ${sentinelCheck}/bin/sentinel-check.sh; then
      exit 0
    fi

    ${rollbackScript}/bin/nixos-rollback.sh ${rollbackFlags}
    exit 1
  '';
in
{
  options.services.rollbackSentinel = {
    enable = mkEnableOption "NixOS Rollback Sentinel — auto-rollback on critical service failure";

    tier1Services = mkOption {
      type = types.listOf types.str;
      default = [ "sshd" "docker" "traefik" "authelia" ];
      # The description deliberately does not interpolate cfg.*: option
      # documentation should not depend on the evaluated configuration.
      description = ''
        Tier-1 services whose failure triggers an automatic systemd-boot rollback.
        On boot, the sentinel waits `bootDelay` seconds, then checks each service.
        If ANY service in this list is inactive, it runs the rollback script
        which sets the previous NixOS generation as the default boot entry.
      '';
    };

    tier2Services = mkOption {
      type = types.listOf types.str;
      default = [
        "gitea" "hermes" "ollama" "synapse" "nextcloud"
        "vaultwarden" "wireguard" "homeassistant" "fail2ban"
      ];
      description = ''
        Tier-2 services whose failure is logged as a warning but does NOT
        trigger an automatic rollback. Useful for detecting non-critical
        service issues.
      '';
    };

    tier3InfoServices = mkOption {
      type = types.listOf types.str;
      default = [ "act_runner" "syncthing" "restic" "fava" "homer" "cups" "fstrim" ];
      description = ''
        Tier-3 informational checks (log-only, no warning). These are services
        that the sentinel will note the status of for diagnostics.
      '';
    };

    bootDelay = mkOption {
      type = types.str;
      default = "120";
      description = ''
        Seconds to wait after multi-user.target before running the boot-time
        sentinel check. This gives Tier-1 services time to start before the
        sentinel decides they've failed.
      '';
    };

    postRebuildDelay = mkOption {
      type = types.str;
      default = "30";
      description = ''
        Seconds to wait before the post-rebuild sentinel check runs. The check
        is queued from an activation script, i.e. while the switch is still in
        progress; the delay gives restarted services time to come back up
        before they are judged.
      '';
    };

    rollbackMode = mkOption {
      type = types.enum [ "set-default" "rollback-now" "dry-run" ];
      default = "set-default";
      description = ''
        Rollback strategy when Tier-1 failures are detected:
        - set-default: Write the previous generation to loader.conf (next reboot).
        - rollback-now: Also run nixos-rebuild switch --rollback for immediate fix.
        - dry-run: Log what would happen but take no action (testing).
      '';
    };

    enablePostRebuild = mkOption {
      type = types.bool;
      default = true;
      description = ''
        When enabled, the sentinel check runs after every nixos-rebuild switch
        activation. If a newly deployed generation has Tier-1 failures, it
        triggers rollback.
      '';
    };
  };

  config = mkIf cfg.enable {
    # ── Deploy scripts to PATH ───────────────────────────────────────────────
    environment.systemPackages = [ sentinelCheck rollbackScript ];

    # Ensure the sentinel directory under /var/log exists.
    # NOTE(review): sentinel-check.sh appends to /var/log/nixos-sentinel.log,
    # not into this directory; presumably nixos-rollback.sh uses it — confirm.
    systemd.tmpfiles.rules = [
      "d /var/log/nixos-sentinel 0755 root root -"
    ];

    # ── Boot-time sentinel service ───────────────────────────────────────────
    # Runs after multi-user.target with a configurable delay, checks Tier-1
    # services, and triggers rollback if any are down.
    systemd.services.nixos-sentinel = {
      description = "NixOS Boot Sentinel — check critical services, roll back on failure";
      after = [ "network.target" "multi-user.target" ];
      wants = [ "network.target" ];
      wantedBy = [ "multi-user.target" ];
      path = with pkgs; [ coreutils gawk gnused systemd ];
      serviceConfig = {
        Type = "oneshot";
        RemainAfterExit = true;
        ExecStartPre = "${pkgs.coreutils}/bin/sleep ${cfg.bootDelay}";
        # Check, and roll back only when the check fails.
        ExecStart = "${sentinelRun}";
      };
    };

    # ── Post-rebuild sentinel service (triggered by activation script) ──────
    systemd.services.nixos-sentinel-rebuild = mkIf cfg.enablePostRebuild {
      description = "NixOS Post-Rebuild Sentinel — check services after nixos-rebuild";
      after = [ "network.target" ];
      path = with pkgs; [ coreutils gawk gnused systemd ];
      serviceConfig = {
        Type = "oneshot";
        # Let units restarted by the switch settle before judging them.
        ExecStartPre = "${pkgs.coreutils}/bin/sleep ${cfg.postRebuildDelay}";
        # Check, and roll back only when the check fails.
        ExecStart = "${sentinelRun}";
      };
    };

    # Activation script — fires on every activation (boot + nixos-rebuild).
    system.activationScripts.rollback-sentinel = mkIf cfg.enablePostRebuild ''
      # Queue the post-rebuild sentinel without blocking activation.
      # systemctl is referenced by absolute path because the activation
      # script PATH does not include systemd. Failures are ignored on
      # purpose: presumably systemd is not reachable when this runs during
      # early boot, where the boot-time service covers the check instead.
      ${config.systemd.package}/bin/systemctl start nixos-sentinel-rebuild.service --no-block 2>/dev/null || true
    '';
  };
}