feat: integrate rollback sentinel as NixOS module
Add rollback-sentinel NixOS module that: - Deploys sentinel-check.sh (inline) and nixos-rollback.sh (from file) as system packages - Runs a boot-time systemd oneshot service after multi-user.target with configurable delay — checks Tier-1 services, triggers rollback on failure - Runs a post-rebuild service via activation script after every nixos-rebuild switch - Exposes options for tier1Services, tier2Services, tier3InfoServices, bootDelay, rollbackMode (set-default/rollback-now/dry-run), and enablePostRebuild Module wired into flake.nix for lazyworkhorse and enabled in configuration.nix with standard Tier-1/2 service lists and 120s delay.
This commit is contained in:
184
modules/nixos/services/rollback-sentinel.nix
Normal file
184
modules/nixos/services/rollback-sentinel.nix
Normal file
@@ -0,0 +1,184 @@
|
||||
{ config, pkgs, lib, ... }:
|
||||
|
||||
with lib;
|
||||
|
||||
let
|
||||
cfg = config.services.rollbackSentinel;
|
||||
|
||||
# ── Scripts ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Sentinel check — verifies Tier-1 services are active after boot.
|
||||
# Exits nonzero when any Tier-1 service is down, which triggers the rollback.
|
||||
sentinelCheck = pkgs.writeShellScriptBin "sentinel-check.sh" ''
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SYSLOG_IDENT="nixos-sentinel"
|
||||
LOGFILE="/var/log/nixos-sentinel.log"
|
||||
|
||||
echo "=== NixOS Sentinel Check ==="
|
||||
echo "Tier-1 services: ${builtins.toString cfg.tier1Services}"
|
||||
echo "Tier-2 services: ${builtins.toString cfg.tier2Services}"
|
||||
|
||||
FAILED=0
|
||||
|
||||
# Check Tier-1 services — any failure means rollback
|
||||
for svc in ${builtins.toString cfg.tier1Services}; do
|
||||
if systemctl is-active --quiet "$svc" 2>/dev/null; then
|
||||
echo " [OK] Tier-1: $svc"
|
||||
else
|
||||
echo " [FAIL] Tier-1: $svc is NOT active"
|
||||
logger -t "$SYSLOG_IDENT" -p user.err "Tier-1 FAILURE: $svc is not active"
|
||||
FAILED=1
|
||||
fi
|
||||
done
|
||||
|
||||
# Check Tier-2 services — warn only
|
||||
for svc in ${builtins.toString cfg.tier2Services}; do
|
||||
if systemctl is-active --quiet "$svc" 2>/dev/null; then
|
||||
echo " [OK] Tier-2: $svc"
|
||||
else
|
||||
echo " [WARN] Tier-2: $svc is NOT active"
|
||||
logger -t "$SYSLOG_IDENT" -p user.warn "Tier-2 WARNING: $svc is not active"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "=== Sentinel result: $([ "$FAILED" -eq 0 ] && echo 'PASS' || echo 'FAIL') ==="
|
||||
echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] sentinel $([ "$FAILED" -eq 0 ] && echo 'PASS' || echo 'FAIL')" >> "$LOGFILE"
|
||||
exit $FAILED
|
||||
'';
|
||||
|
||||
# Rollback script — package the companion shell script from this directory.
|
||||
# Uses builtins.readFile to embed the content at evaluation time.
|
||||
rollbackScript = pkgs.writeShellScriptBin "nixos-rollback.sh" (builtins.readFile ./nixos-rollback.sh);
|
||||
|
||||
# Resolve rollback flags from config
|
||||
rollbackFlags =
|
||||
if cfg.rollbackMode == "dry-run" then "--dry-run"
|
||||
else if cfg.rollbackMode == "rollback-now" then "--rollback-now"
|
||||
else "";
|
||||
|
||||
in {
|
||||
options.services.rollbackSentinel = {
|
||||
enable = mkEnableOption "NixOS Rollback Sentinel — auto-rollback on critical service failure";
|
||||
|
||||
tier1Services = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ "sshd" "docker" "traefik" "authelia" ];
|
||||
description = ''
|
||||
Tier-1 services whose failure triggers an automatic systemd-boot rollback.
|
||||
On boot, the sentinel waits ${cfg.bootDelay} seconds, then checks each
|
||||
service. If ANY service in this list is inactive, it runs the rollback
|
||||
script which sets the previous NixOS generation as the default boot entry.
|
||||
'';
|
||||
};
|
||||
|
||||
tier2Services = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [
|
||||
"gitea" "hermes" "ollama" "synapse" "nextcloud"
|
||||
"vaultwarden" "wireguard" "homeassistant" "fail2ban"
|
||||
];
|
||||
description = ''
|
||||
Tier-2 services whose failure is logged as a warning but does NOT trigger
|
||||
an automatic rollback. Useful for detecting non-critical service issues.
|
||||
'';
|
||||
};
|
||||
|
||||
tier3InfoServices = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [
|
||||
"act_runner" "syncthing" "restic" "fava"
|
||||
"homer" "cups" "fstrim"
|
||||
];
|
||||
description = ''
|
||||
Tier-3 informational checks (log-only, no warning). These are services
|
||||
that the sentinel will note the status of for diagnostics.
|
||||
'';
|
||||
};
|
||||
|
||||
bootDelay = mkOption {
|
||||
type = types.str;
|
||||
default = "120";
|
||||
description = ''
|
||||
Seconds to wait after multi-user.target before running the boot-time
|
||||
sentinel check. This gives Tier-1 services time to start before
|
||||
the sentinel decides they've failed.
|
||||
'';
|
||||
};
|
||||
|
||||
rollbackMode = mkOption {
|
||||
type = types.enum [ "set-default" "rollback-now" "dry-run" ];
|
||||
default = "set-default";
|
||||
description = ''
|
||||
Rollback strategy when Tier-1 failures are detected:
|
||||
- set-default: Write the previous generation to loader.conf (next reboot).
|
||||
- rollback-now: Also run nixos-rebuild switch --rollback for immediate fix.
|
||||
- dry-run: Log what would happen but take no action (testing).
|
||||
'';
|
||||
};
|
||||
|
||||
enablePostRebuild = mkOption {
|
||||
type = types.bool;
|
||||
default = true;
|
||||
description = ''
|
||||
When enabled, the sentinel check runs after every nixos-rebuild switch
|
||||
activation. If a newly deployed generation has Tier-1 failures, it
|
||||
triggers rollback immediately.
|
||||
'';
|
||||
};
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
# ── Deploy scripts to PATH ───────────────────────────────────────────────
|
||||
environment.systemPackages = [ sentinelCheck rollbackScript ];
|
||||
|
||||
# Ensure log directory exists
|
||||
systemd.tmpfiles.rules = [
|
||||
"d /var/log/nixos-sentinel 0755 root root -"
|
||||
];
|
||||
|
||||
# ── Boot-time sentinel service ───────────────────────────────────────────
|
||||
# Runs after multi-user.target with a configurable delay, checks Tier-1
|
||||
# services, and triggers rollback if any are down.
|
||||
systemd.services.nixos-sentinel = {
|
||||
description = "NixOS Boot Sentinel — check critical services, roll back on failure";
|
||||
after = [ "network.target" "multi-user.target" ];
|
||||
wants = [ "network.target" ];
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
|
||||
path = with pkgs; [ coreutils gawk gnused systemd ];
|
||||
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
RemainAfterExit = true;
|
||||
ExecStartPre = "${pkgs.coreutils}/bin/sleep ${cfg.bootDelay}";
|
||||
ExecStart = "${sentinelCheck}/bin/sentinel-check.sh";
|
||||
ExecStartPost = "${rollbackScript}/bin/nixos-rollback.sh ${rollbackFlags}";
|
||||
};
|
||||
};
|
||||
|
||||
# ── Post-rebuild sentinel service (triggered by activation script) ──────
|
||||
systemd.services.nixos-sentinel-rebuild = mkIf cfg.enablePostRebuild {
|
||||
description = "NixOS Post-Rebuild Sentinel — check services after nixos-rebuild";
|
||||
after = [ "network.target" ];
|
||||
|
||||
path = with pkgs; [ coreutils gawk gnused systemd ];
|
||||
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
ExecStart = "${sentinelCheck}/bin/sentinel-check.sh";
|
||||
ExecStartPost = "${rollbackScript}/bin/nixos-rollback.sh ${rollbackFlags}";
|
||||
};
|
||||
};
|
||||
|
||||
# Activation script — fires after every nixos-rebuild switch
|
||||
system.activationScripts.rollback-sentinel = mkIf cfg.enablePostRebuild ''
|
||||
# Start the post-rebuild sentinel in the background.
|
||||
# This runs on every activation (boot + nixos-rebuild). On boot the
|
||||
# boot-time service handles it, so this is primarily for nixos-rebuild,
|
||||
# but running twice is safe (idempotent rollback).
|
||||
systemctl start nixos-sentinel-rebuild.service --no-block 2>/dev/null || true
|
||||
'';
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user