Add rollback-sentinel NixOS module that: - Deploys sentinel-check.sh (inline) and nixos-rollback.sh (from file) as system packages - Runs a boot-time systemd oneshot service after multi-user.target with configurable delay — checks Tier-1 services, triggers rollback on failure - Runs a post-rebuild service via activation script after every nixos-rebuild switch - Exposes options for tier1Services, tier2Services, tier3InfoServices, bootDelay, rollbackMode (set-default/rollback-now/dry-run), and enablePostRebuild Module wired into flake.nix for lazyworkhorse and enabled in configuration.nix with standard Tier-1/2 service lists and 120s delay.
185 lines
7.1 KiB
Nix
185 lines
7.1 KiB
Nix
{ config, pkgs, lib, ... }:
|
|
|
|
with lib;
|
|
|
|
let
|
|
cfg = config.services.rollbackSentinel;
|
|
|
|
# ── Scripts ────────────────────────────────────────────────────────────────
|
|
|
|
# Sentinel check — verifies the configured service tiers and reports the result.
#
#   * Tier-1: an inactive service is logged at user.err and makes the script
#             exit 1. Callers use that exit status to decide on a rollback.
#   * Tier-2: an inactive service is logged at user.warn only.
#   * Tier-3: the current state is printed for diagnostics; never logged as a
#             warning and never affects the exit status.
#
# Exit status: 0 when every Tier-1 service is active, 1 otherwise.
sentinelCheck = pkgs.writeShellScriptBin "sentinel-check.sh" ''
  # writeShellScriptBin already emits the shebang, so none is repeated here.
  set -euo pipefail

  SYSLOG_IDENT="nixos-sentinel"
  LOGFILE="/var/log/nixos-sentinel.log"
  # Absolute path: logger ships in util-linux, so the script does not depend
  # on the caller's PATH containing it.
  LOGGER="${pkgs.util-linux}/bin/logger"

  echo "=== NixOS Sentinel Check ==="
  echo "Tier-1 services: ${builtins.toString cfg.tier1Services}"
  echo "Tier-2 services: ${builtins.toString cfg.tier2Services}"
  echo "Tier-3 services: ${builtins.toString cfg.tier3InfoServices}"

  FAILED=0

  # Check Tier-1 services — any failure means rollback
  for svc in ${escapeShellArgs cfg.tier1Services}; do
    if systemctl is-active --quiet "$svc" 2>/dev/null; then
      echo " [OK] Tier-1: $svc"
    else
      echo " [FAIL] Tier-1: $svc is NOT active"
      # Logging is best-effort: a syslog problem must not change the verdict.
      "$LOGGER" -t "$SYSLOG_IDENT" -p user.err "Tier-1 FAILURE: $svc is not active" || true
      FAILED=1
    fi
  done

  # Check Tier-2 services — warn only
  for svc in ${escapeShellArgs cfg.tier2Services}; do
    if systemctl is-active --quiet "$svc" 2>/dev/null; then
      echo " [OK] Tier-2: $svc"
    else
      echo " [WARN] Tier-2: $svc is NOT active"
      # Without "|| true" a failing logger would abort the script under
      # "set -e" and turn a mere warning into a nonzero exit.
      "$LOGGER" -t "$SYSLOG_IDENT" -p user.warn "Tier-2 WARNING: $svc is not active" || true
    fi
  done

  # Report Tier-3 services — informational only
  for svc in ${escapeShellArgs cfg.tier3InfoServices}; do
    STATE="$(systemctl is-active "$svc" 2>/dev/null || true)"
    [ -n "$STATE" ] || STATE="unknown"
    echo " [INFO] Tier-3: $svc is $STATE"
  done

  if [ "$FAILED" -eq 0 ]; then
    RESULT="PASS"
  else
    RESULT="FAIL"
  fi

  echo "=== Sentinel result: $RESULT ==="
  # Best-effort: an unwritable log file must not override the Tier-1 verdict.
  echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] sentinel $RESULT" >> "$LOGFILE" || true
  exit "$FAILED"
'';
|
|
|
|
# Rollback script — wraps the companion ./nixos-rollback.sh that lives next to
# this module. builtins.readFile inlines the file's text at evaluation time and
# writeShellScriptBin turns it into an executable named nixos-rollback.sh.
rollbackScript =
  let
    rollbackSource = builtins.readFile ./nixos-rollback.sh;
  in
  pkgs.writeShellScriptBin "nixos-rollback.sh" rollbackSource;
|
|
|
|
# Translate cfg.rollbackMode into the command-line flag handed to
# nixos-rollback.sh. Modes without an entry in the table (i.e. "set-default")
# resolve to the empty string, so no flag is passed.
rollbackFlags =
  let
    flagByMode = {
      "dry-run" = "--dry-run";
      "rollback-now" = "--rollback-now";
    };
  in
  flagByMode.${cfg.rollbackMode} or "";
|
|
|
|
in {
|
|
# Option declarations for the rollback sentinel. Descriptions are kept free of
# references to `cfg`, so rendering the option documentation never depends on
# the evaluated configuration.
options.services.rollbackSentinel = {
  enable = mkEnableOption "NixOS Rollback Sentinel — auto-rollback on critical service failure";

  # Services whose failure makes sentinel-check.sh exit nonzero.
  tier1Services = mkOption {
    type = types.listOf types.str;
    default = [ "sshd" "docker" "traefik" "authelia" ];
    description = ''
      Tier-1 services whose failure triggers an automatic systemd-boot rollback.
      On boot, the sentinel waits `bootDelay` seconds, then checks each
      service. If ANY service in this list is inactive, it runs the rollback
      script which sets the previous NixOS generation as the default boot entry.
    '';
  };

  # Services that are only reported at warning level.
  tier2Services = mkOption {
    type = types.listOf types.str;
    default = [
      "gitea" "hermes" "ollama" "synapse" "nextcloud"
      "vaultwarden" "wireguard" "homeassistant" "fail2ban"
    ];
    description = ''
      Tier-2 services whose failure is logged as a warning but does NOT trigger
      an automatic rollback. Useful for detecting non-critical service issues.
    '';
  };

  # Services whose state is merely noted for diagnostics.
  tier3InfoServices = mkOption {
    type = types.listOf types.str;
    default = [
      "act_runner" "syncthing" "restic" "fava"
      "homer" "cups" "fstrim"
    ];
    description = ''
      Tier-3 informational checks (log-only, no warning). These are services
      that the sentinel will note the status of for diagnostics.
    '';
  };

  # Kept as a string because it is spliced verbatim into the `sleep` command
  # line of the boot-time unit.
  bootDelay = mkOption {
    type = types.str;
    default = "120";
    description = ''
      Seconds to wait after multi-user.target before running the boot-time
      sentinel check. This gives Tier-1 services time to start before
      the sentinel decides they've failed.
    '';
  };

  # Selects which flag (if any) is passed to nixos-rollback.sh.
  rollbackMode = mkOption {
    type = types.enum [ "set-default" "rollback-now" "dry-run" ];
    default = "set-default";
    description = ''
      Rollback strategy when Tier-1 failures are detected:
      - set-default: Write the previous generation to loader.conf (next reboot).
      - rollback-now: Also run nixos-rebuild switch --rollback for immediate fix.
      - dry-run: Log what would happen but take no action (testing).
    '';
  };

  # Gates both the post-rebuild unit and the activation script that starts it.
  enablePostRebuild = mkOption {
    type = types.bool;
    default = true;
    description = ''
      When enabled, the sentinel check runs after every nixos-rebuild switch
      activation. If a newly deployed generation has Tier-1 failures, it
      triggers rollback immediately.
    '';
  };
};
|
|
|
|
# System configuration applied when services.rollbackSentinel.enable is set:
# installs both scripts, declares the boot-time and post-rebuild units, and
# registers the activation hook that kicks off the post-rebuild unit.
config =
  let
    # Shared unit body: run the sentinel and invoke the rollback script ONLY
    # when the sentinel reports a Tier-1 failure.
    #
    # This cannot be expressed as ExecStart= + ExecStartPost=: systemd runs
    # ExecStartPost= only after ExecStart= succeeded, so that wiring would
    # roll back on PASS and skip the rollback on FAIL.
    #
    # The trailing `exit 1` keeps the unit in the failed state after a
    # rollback, so the sentinel failure stays visible in `systemctl status`.
    checkThenRollback = ''
      if ! ${sentinelCheck}/bin/sentinel-check.sh; then
        ${rollbackScript}/bin/nixos-rollback.sh ${rollbackFlags}
        exit 1
      fi
    '';

    # Tools made available to both units. util-linux provides `logger`, which
    # sentinel-check.sh calls.
    sentinelPath = with pkgs; [ coreutils gawk gnused systemd util-linux ];
  in
  mkIf cfg.enable {
    # ── Deploy scripts to PATH ───────────────────────────────────────────────
    environment.systemPackages = [ sentinelCheck rollbackScript ];

    # Ensure log directory exists
    # NOTE(review): sentinel-check.sh appends to /var/log/nixos-sentinel.log (a
    # file), not into this directory — presumably nixos-rollback.sh uses the
    # directory; confirm.
    systemd.tmpfiles.rules = [
      "d /var/log/nixos-sentinel 0755 root root -"
    ];

    # ── Boot-time sentinel service ───────────────────────────────────────────
    # Runs after multi-user.target with a configurable delay, checks Tier-1
    # services, and triggers rollback if any are down.
    systemd.services.nixos-sentinel = {
      description = "NixOS Boot Sentinel — check critical services, roll back on failure";
      after = [ "network.target" "multi-user.target" ];
      wants = [ "network.target" ];
      wantedBy = [ "multi-user.target" ];

      path = sentinelPath;

      serviceConfig = {
        Type = "oneshot";
        RemainAfterExit = true;
        # Grace period so Tier-1 services can finish starting before the check.
        ExecStartPre = "${pkgs.coreutils}/bin/sleep ${cfg.bootDelay}";
      };

      script = checkThenRollback;
    };

    # ── Post-rebuild sentinel service (triggered by activation script) ──────
    # No wantedBy: this unit only runs when something starts it explicitly.
    systemd.services.nixos-sentinel-rebuild = mkIf cfg.enablePostRebuild {
      description = "NixOS Post-Rebuild Sentinel — check services after nixos-rebuild";
      after = [ "network.target" ];

      path = sentinelPath;

      serviceConfig = {
        Type = "oneshot";
      };

      script = checkThenRollback;
    };

    # Activation script — fires after every nixos-rebuild switch
    system.activationScripts.rollback-sentinel = mkIf cfg.enablePostRebuild ''
      # Start the post-rebuild sentinel in the background.
      # This runs on every activation (boot + nixos-rebuild). On boot the
      # boot-time service handles it, so this is primarily for nixos-rebuild.
      #
      # systemctl is called by absolute path because it is not on the PATH of
      # activation scripts; with a bare `systemctl` the failure would be
      # hidden by `2>/dev/null || true` and the check would never run.
      #
      # NOTE(review): activation happens before the switch restarts changed
      # units, so the check may observe services mid-transition — confirm
      # whether a short delay is wanted here.
      ${config.systemd.package}/bin/systemctl start nixos-sentinel-rebuild.service --no-block 2>/dev/null || true
    '';
  };
|
|
}
|