Files
infra/modules/nixos/services/nixos-rollback.sh
Hermes 15f70019d5 feat: integrate rollback sentinel as NixOS module
Add rollback-sentinel NixOS module that:
- Deploys sentinel-check.sh (inline) and nixos-rollback.sh (from file) as
  system packages
- Runs a boot-time systemd oneshot service after multi-user.target with
  configurable delay — checks Tier-1 services, triggers rollback on failure
- Runs a post-rebuild service via activation script after every
  nixos-rebuild switch
- Exposes options for tier1Services, tier2Services, tier3InfoServices,
  bootDelay, rollbackMode (set-default/rollback-now/dry-run), and
  enablePostRebuild

Module wired into flake.nix for lazyworkhorse and enabled in
configuration.nix with standard Tier-1/2 service lists and 120s delay.
2026-05-25 00:08:39 -04:00

401 lines
14 KiB
Bash
Executable File

#!/usr/bin/env bash
# =============================================================================
# nixos-rollback.sh — NixOS systemd-boot Rollback Script
#
# Run after a failed NixOS generation has been detected (critical services not
# starting): sets the previous generation as the default boot option for
# systemd-boot. Failure detection itself is done by sentinel-check.sh.
# Logs all actions to syslog/journald and a local logfile. Fails safely when
# no previous generation exists or required files are missing.
#
# Integration with the boot sentinel:
# sentinel-check.sh → detects Tier-1 service failures (sshd, docker,
# traefik, authelia) after a boot
# nixos-rollback.sh ← called when sentinel exits nonzero; sets previous
# generation as default for next boot
#
# Usage:
# nixos-rollback.sh # auto-detect & set previous gen
# nixos-rollback.sh --dry-run # show what would be done
# nixos-rollback.sh --rollback-now # also run nixos-rebuild switch
# # --rollback for immediate fix
# nixos-rollback.sh --help # full help text
#
# Exit codes:
# 0 — rollback applied (or dry-run would apply)
# 1 — preflight or usage failure (missing files, permissions, unknown
#     option, unparseable loader.conf)
# 2 — no previous generation available
# 3 — nixos-rebuild --rollback failed (only with --rollback-now)
#
# Installation on NixOS:
# Place in /usr/local/bin/nixos-rollback.sh and make executable.
# Add a systemd oneshot service to run it after sentinel-check detects
# failures, or invoke directly from a sentinel timer.
# =============================================================================
set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
# Each path can be overridden through the environment, which allows the script
# to be exercised against mock files instead of the real /boot.
LOADER_CONF="${NIXOS_ROLLBACK_LOADER_CONF:-/boot/loader/loader.conf}"
ENTRIES_DIR="${NIXOS_ROLLBACK_ENTRIES_DIR:-/boot/loader/entries}"
LOGFILE="${NIXOS_ROLLBACK_LOGFILE:-/var/log/nixos-rollback.log}"
SYSLOG_IDENT="nixos-rollback"

# ── CLI flags ────────────────────────────────────────────────────────────────
# Defaults; main() switches these to "true" while parsing the command line.
DRY_RUN=false
ROLLBACK_NOW=false

# ── Colors (disabled when not a terminal) ────────────────────────────────────
# ANSI-C quoting ($'...') stores the real ESC byte in each variable. With plain
# single quotes the variables would hold the literal text "\033[...m", which
# `echo` (without -e) and heredocs print verbatim instead of coloring output.
if [ -t 1 ]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  CYAN=$'\033[0;36m'
  NC=$'\033[0m' # No Color
else
  RED=''
  GREEN=''
  YELLOW=''
  CYAN=''
  NC=''
fi
# =============================================================================
# Help
# =============================================================================
# usage: print the full help text to stdout.
# Globals read: CYAN, NC, LOADER_CONF, ENTRIES_DIR, LOGFILE
usage() {
  # The color variables may hold either real escape bytes or the literal text
  # "\033[...m". A heredoc prints variables verbatim, so convert them with
  # printf '%b' first; real escape bytes and empty strings pass through as-is.
  local hl rst
  printf -v hl '%b' "${CYAN}"
  printf -v rst '%b' "${NC}"
  cat <<EOF
${hl}nixos-rollback.sh${rst} — Set the previous NixOS generation as systemd-boot default
${hl}USAGE${rst}
nixos-rollback.sh [OPTIONS]
${hl}OPTIONS${rst}
--dry-run Show what would be done without making changes
--rollback-now Also run 'nixos-rebuild switch --rollback' for
immediate fix of the running system (requires
nixos-rebuild on PATH)
-h, --help Show this help text
${hl}DESCRIPTION${rst}
Reads the current default boot entry from ${LOADER_CONF},
determines the previous generation number, and writes it as the
new default. The script only modifies systemd-boot config —
it does NOT touch the Nix store or system profile unless
--rollback-now is passed.
Designed as the rollback half of a boot sentinel:
1. System boots into generation N
2. sentinel-check.sh detects Tier-1 service failures
3. nixos-rollback.sh sets default to generation N-1
4. Next reboot uses the working generation
${hl}EXIT CODES${rst}
0 Rollback applied (or dry-run would apply)
1 Preflight failure (missing files, permissions)
2 No previous generation available (only one generation)
3 nixos-rebuild --rollback failed (with --rollback-now)
${hl}FILES${rst}
${LOADER_CONF} systemd-boot loader configuration
${ENTRIES_DIR}/ generation entry .conf files
${LOGFILE} action log (append-only)
EOF
}
# =============================================================================
# Logging
# =============================================================================
# log: record one message on the console, in the logfile and in syslog.
# Arguments: $1 - level (INFO, WARN or ERROR); remaining args - message text
# Globals read: LOGFILE, SYSLOG_IDENT, RED, YELLOW, GREEN, NC
# Outputs: ERROR/WARN go to stderr, INFO goes to stdout
log() {
  local level="$1"; shift
  local msg="$*"
  local timestamp
  timestamp="$(date '+%Y-%m-%d %H:%M:%S')"

  # Console first, so the operator sees the message even when the other sinks
  # are unavailable. '%b' renders the color variables whether they hold real
  # escape bytes or the literal "\033[...m" text; the message itself is passed
  # through '%s' so backslashes in it are never interpreted.
  case "${level}" in
    ERROR) printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${msg}" >&2 ;;
    WARN) printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "${msg}" >&2 ;;
    INFO) printf ' %b[INFO]%b %s\n' "${GREEN}" "${NC}" "${msg}" ;;
  esac

  # Logfile and syslog are best-effort: under `set -e` a failed append (for
  # example a missing log directory) or a missing/failing `logger` would
  # otherwise abort the script in the middle of reporting a problem.
  {
    printf '%s [%s] %s\n' "${timestamp}" "${level}" "${msg}" >> "${LOGFILE}"
  } 2>/dev/null || true

  # Map the script's level names to canonical syslog priority names.
  local priority
  case "${level}" in
    ERROR) priority="err" ;;
    WARN) priority="warning" ;;
    *) priority="info" ;;
  esac
  # `--` keeps a message starting with '-' from being parsed as an option.
  logger -t "${SYSLOG_IDENT}" -p "user.${priority}" -- "${msg}" 2>/dev/null || true
}
# Level-specific front ends for log(); each forwards its arguments unchanged.
info() {
  log "INFO" "$@"
}

warn() {
  log "WARN" "$@"
}

error() {
  log "ERROR" "$@"
}
# =============================================================================
# Preflight checks
# =============================================================================
# preflight: verify that everything the rollback needs is in place.
# Globals read: LOGFILE, LOADER_CONF, ENTRIES_DIR, ROLLBACK_NOW,
#               NIXOS_ROLLBACK_SKIP_ROOT_CHECK
# Exits with status 1 on the first failed check; returns 0 otherwise.
preflight() {
  # Make sure the log directory exists before anything is logged: warn/error
  # append to ${LOGFILE}, so logging before the directory exists would fail.
  local log_dir
  log_dir="$(dirname "${LOGFILE}")"
  if [ ! -d "${log_dir}" ]; then
    if ! mkdir -p "${log_dir}" 2>/dev/null; then
      # The logfile is unusable here, so report directly on stderr.
      printf '[ERROR] Cannot create log directory %s\n' "${log_dir}" >&2
      exit 1
    fi
    warn "Log directory ${log_dir} did not exist; created it"
  fi

  # Must run as root (need to write to /boot), unless overridden for testing
  if [ -z "${NIXOS_ROLLBACK_SKIP_ROOT_CHECK:-}" ] && [ "$(id -u)" -ne 0 ]; then
    error "This script must be run as root (needs write access to /boot/loader)"
    error "Set NIXOS_ROLLBACK_SKIP_ROOT_CHECK=1 for testing against mock paths."
    exit 1
  fi

  # Directories and files
  if [ ! -d "${ENTRIES_DIR}" ]; then
    error "Boot entries directory not found: ${ENTRIES_DIR}"
    exit 1
  fi
  if [ ! -f "${LOADER_CONF}" ]; then
    error "Loader config not found: ${LOADER_CONF}"
    exit 1
  fi
  if [ ! -r "${LOADER_CONF}" ]; then
    error "Cannot read loader config: ${LOADER_CONF}"
    exit 1
  fi

  # The directory holding loader.conf must be writable: the in-place edit
  # creates a backup file next to it.
  local loader_dir
  loader_dir="$(dirname "${LOADER_CONF}")"
  if [ ! -w "${loader_dir}" ]; then
    error "Cannot write to ${loader_dir} (insufficient permissions)"
    exit 1
  fi

  # Check --rollback-now dependencies
  if [ "${ROLLBACK_NOW}" = true ]; then
    if ! command -v nixos-rebuild &>/dev/null; then
      error "nixos-rebuild not found on PATH (required for --rollback-now)"
      exit 1
    fi
  fi
}
# =============================================================================
# Generation helpers
# =============================================================================
# get_current_default: print the entry named on the "default" line(s) of
# ${LOADER_CONF}, e.g. "nixos-generation-N.conf".
# Prints nothing when there is no such line or the file cannot be read.
# Always returns 0.
get_current_default() {
  local default_lines
  # No match or unreadable file: produce no output and report success.
  default_lines="$(grep -E '^default\s+' "${LOADER_CONF}" 2>/dev/null)" || return 0
  # Second whitespace-separated field of each matching line.
  awk '{print $2}' <<<"${default_lines}" || true
}
# extract_gen_number: strip the "nixos-generation-" prefix and the ".conf"
# suffix from an entry filename and print what remains.
# Example: "nixos-generation-367.conf" -> "367"
extract_gen_number() {
  local entry_name="$1"
  sed -e 's/nixos-generation-//' -e 's/\.conf//' <<<"${entry_name}"
}
# get_all_gen_numbers: print the generation numbers found in ${ENTRIES_DIR},
# one per line, sorted numerically in ascending order.
# Only entries named exactly "nixos-generation-<digits>.conf" are counted.
# Names with extra text after the number (e.g.
# "nixos-generation-12-specialisation-foo.conf") are skipped, because a
# non-numeric value would break the numeric comparisons done on this list.
# Returns: 0 when at least one generation was found, 1 otherwise.
get_all_gen_numbers() {
  local -a gens=()
  local f n
  for f in "${ENTRIES_DIR}"/nixos-generation-*.conf; do
    # An unmatched glob stays literal; the -f test filters that case out too.
    [ -f "${f}" ] || continue
    n="${f##*/}"
    n="${n#nixos-generation-}"
    n="${n%.conf}"
    [[ "${n}" =~ ^[0-9]+$ ]] || continue
    gens+=("${n}")
  done
  if [ "${#gens[@]}" -eq 0 ]; then
    return 1
  fi
  # Sort numerically and output
  printf '%s\n' "${gens[@]}" | sort -n
}
# get_previous_gen: print the largest generation number that is lower than
# the current one.
# Arguments: $1 - current generation number
#            remaining args - candidate generation numbers, in any order
# Non-numeric candidates are ignored.
# Returns: 0 and prints the number when a lower generation exists; 1 (no
#          output) when there is none or $1 is not a number.
get_previous_gen() {
  local current="${1-}"
  shift || true
  local prev=""
  local g
  [[ "${current}" =~ ^[0-9]+$ ]] || return 1
  for g in "$@"; do
    # Skip anything that is not a plain number so the comparisons cannot fail.
    [[ "${g}" =~ ^[0-9]+$ ]] || continue
    [ "${g}" -lt "${current}" ] || continue
    # Keep the maximum, so the result does not depend on the input order.
    if [ -z "${prev}" ] || [ "${g}" -gt "${prev}" ]; then
      prev="${g}"
    fi
  done
  if [ -z "${prev}" ]; then
    return 1
  fi
  echo "${prev}"
}
# =============================================================================
# Main rollback logic
# =============================================================================
# do_rollback: point systemd-boot's default entry at the previous generation.
# Globals read: LOADER_CONF, ENTRIES_DIR, DRY_RUN, ROLLBACK_NOW, SYSLOG_IDENT,
#               CYAN, YELLOW, GREEN, NC
# Side effects: rewrites ${LOADER_CONF} (original kept as ${LOADER_CONF}.bak);
#               with ROLLBACK_NOW=true also runs `nixos-rebuild switch --rollback`.
# Exits: 0 after a dry run, 1 on parse/rewrite failure, 2 when no earlier
#        generation exists, 3 when nixos-rebuild fails. Returns 0 on success.
do_rollback() {
  # Step 1: Read current default
  local current_entry
  current_entry="$(get_current_default)"
  if [ -z "${current_entry}" ]; then
    error "No 'default' entry found in ${LOADER_CONF}"
    error "Cannot determine current generation — aborting"
    exit 1
  fi
  info "Current default boot entry: ${current_entry}"

  # Step 2: Build sorted list of all available generations
  local -a all_gens=()
  mapfile -t all_gens < <(get_all_gen_numbers || true)
  if [ "${#all_gens[@]}" -eq 0 ]; then
    error "No NixOS generation .conf files found in ${ENTRIES_DIR}"
    exit 1
  fi
  info "Available generations: ${all_gens[*]}"

  # Step 3: Find current generation number
  local current_gen
  current_gen="$(extract_gen_number "${current_entry}")"
  # Verify current_gen is a valid number
  if ! [[ "${current_gen}" =~ ^[0-9]+$ ]]; then
    error "Could not parse generation number from '${current_entry}'"
    exit 1
  fi

  # Step 4: Find the previous generation
  local prev_gen
  prev_gen="$(get_previous_gen "${current_gen}" "${all_gens[@]}")" || {
    error "No previous generation found before generation ${current_gen}"
    error "This is the oldest available generation — cannot roll back further"
    exit 2
  }
  local prev_entry="nixos-generation-${prev_gen}.conf"
  local prev_conf_path="${ENTRIES_DIR}/${prev_entry}"
  if [ ! -f "${prev_conf_path}" ]; then
    error "Previous generation entry not found: ${prev_conf_path}"
    error "The .conf file for generation ${prev_gen} is missing — cannot roll back"
    exit 1
  fi
  info "Target rollback generation: ${prev_gen} (${prev_entry})"

  # Step 5: Apply the rollback
  # '%b' renders the color variables whether they hold real escape bytes or
  # the literal "\033[...m" text.
  if [ "${DRY_RUN}" = true ]; then
    printf '\n'
    printf ' %b[DRY RUN]%b Would change %s:\n' "${CYAN}" "${NC}" "${LOADER_CONF}"
    printf ' %b-%b default %s\n' "${YELLOW}" "${NC}" "${current_entry}"
    printf ' %b+%b default %s\n' "${GREEN}" "${NC}" "${prev_entry}"
    printf '\n'
    info "DRY RUN — no changes made"
    exit 0
  fi

  # Write new default, keeping the original as ${LOADER_CONF}.bak.
  # The pattern matches the whole "default" line rather than interpolating
  # ${current_entry} into the regex, so characters in the entry name can never
  # be read as regex syntax. ${prev_entry} is built from a validated number.
  if ! sed -i.bak -e "s/^default[[:space:]].*\$/default ${prev_entry}/" "${LOADER_CONF}"; then
    error "Failed to rewrite ${LOADER_CONF}"
    exit 1
  fi

  # Verify the change was applied
  local new_default
  new_default="$(get_current_default)"
  if [ "${new_default}" != "${prev_entry}" ]; then
    error "Failed to set default boot entry to ${prev_entry}"
    error "Current default is still: ${new_default}"
    # Attempt to restore backup
    if [ -f "${LOADER_CONF}.bak" ]; then
      cp "${LOADER_CONF}.bak" "${LOADER_CONF}"
      info "Restored backup from ${LOADER_CONF}.bak"
    fi
    exit 1
  fi
  info "Successfully set default boot entry to ${prev_entry} (generation ${prev_gen})"
  info "Backup of previous config saved to ${LOADER_CONF}.bak"

  # Step 6: Optionally run nixos-rebuild switch --rollback
  if [ "${ROLLBACK_NOW}" = true ]; then
    printf '\n'
    info "Running nixos-rebuild switch --rollback for immediate effect..."
    local line
    # The `if` sees the pipeline's status; with `pipefail` set at the top of
    # the script this reflects a failure of nixos-rebuild itself.
    if nixos-rebuild switch --rollback 2>&1 | while IFS= read -r line; do
      logger -t "${SYSLOG_IDENT}" "nixos-rebuild: ${line}"
      printf ' %s\n' "${line}"
    done; then
      info "nixos-rebuild switch --rollback completed successfully"
    else
      local rc=$?
      error "nixos-rebuild switch --rollback failed with exit code ${rc}"
      error "The boot default has been changed but the current system was NOT rolled back"
      error "Reboot to apply the rollback"
      exit 3
    fi
  fi

  info "Rollback complete. Next boot will use generation ${prev_gen}."
  if [ "${ROLLBACK_NOW}" = false ]; then
    printf '\n'
    printf ' %bNOTE:%b The current running system is unchanged.\n' "${YELLOW}" "${NC}"
    printf ' Reboot to boot into generation %s.\n' "${prev_gen}"
    printf ' Or re-run with --rollback-now for immediate effect.\n'
  fi
}
# =============================================================================
# Main
# =============================================================================
# main: parse the command line, run the preflight checks, then the rollback.
# Arguments: any of --dry-run, --rollback-now, -h/--help
# Exits 0 right after printing help, 1 on an unknown option.
main() {
  local arg
  # Options are handled in order; --help and unknown options stop at once.
  for arg in "$@"; do
    case "${arg}" in
      --dry-run)
        DRY_RUN=true
        ;;
      --rollback-now)
        ROLLBACK_NOW=true
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo >&2 "Unknown option: ${arg}"
        echo >&2 "Use --help for usage information."
        exit 1
        ;;
    esac
  done

  # Banner
  echo ""
  echo " ${CYAN}═══ NixOS systemd-boot Rollback ═══${NC}"
  echo ""

  preflight

  # Announce the selected modes before doing any work.
  if [ "${DRY_RUN}" = true ]; then
    info "DRY RUN mode — no changes will be made"
  fi
  if [ "${ROLLBACK_NOW}" = true ]; then
    info "ROLLBACK NOW mode — will also run nixos-rebuild switch --rollback"
  fi
  echo ""

  do_rollback
}
# Script entry point: forward all command-line arguments to main().
main "$@"