Add rollback-sentinel NixOS module that:
- Deploys sentinel-check.sh (inline) and nixos-rollback.sh (from file) as system packages
- Runs a boot-time systemd oneshot service after multi-user.target with a configurable delay; it checks Tier-1 services and triggers a rollback on failure
- Runs a post-rebuild service via an activation script after every nixos-rebuild switch
- Exposes options for tier1Services, tier2Services, tier3InfoServices, bootDelay, rollbackMode (set-default/rollback-now/dry-run), and enablePostRebuild

The module is wired into flake.nix for lazyworkhorse and enabled in configuration.nix with the standard Tier-1/2 service lists and a 120s delay.
401 lines
14 KiB
Bash
Executable File
401 lines
14 KiB
Bash
Executable File
#!/usr/bin/env bash
# =============================================================================
# nixos-rollback.sh — NixOS systemd-boot Rollback Script
#
# Detects a failed NixOS generation (critical services not starting) and sets
# the previous generation as the default boot option for systemd-boot.
# Logs all actions to syslog/journald and a local logfile. Fails safely when
# no previous generation exists or required files are missing.
#
# Integration with the boot sentinel:
#   sentinel-check.sh  → detects Tier-1 service failures (sshd, docker,
#                        traefik, authelia) after a boot
#   nixos-rollback.sh  ← called when sentinel exits nonzero; sets previous
#                        generation as default for next boot
#
# Usage:
#   nixos-rollback.sh                  # auto-detect & set previous gen
#   nixos-rollback.sh --dry-run        # show what would be done
#   nixos-rollback.sh --rollback-now   # also run nixos-rebuild switch
#                                      # --rollback for immediate fix
#   nixos-rollback.sh --help           # full help text
#
# Exit codes:
#   0 — rollback applied (or dry-run would apply)
#   1 — preflight failure (missing files, permissions)
#   2 — no previous generation available
#   3 — nixos-rebuild --rollback failed (only with --rollback-now)
#
# Installation on NixOS:
#   Place in /usr/local/bin/nixos-rollback.sh and make executable.
#   Add a systemd oneshot service to run it after sentinel-check detects
#   failures, or invoke directly from a sentinel timer.
# =============================================================================

# Strict mode: abort on unhandled errors, unset variables and failures in
# any stage of a pipeline.
set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
# Every path below falls back to its production default unless the matching
# NIXOS_ROLLBACK_* environment variable is set (intended for testing).
LOADER_CONF="${NIXOS_ROLLBACK_LOADER_CONF:-/boot/loader/loader.conf}"   # systemd-boot config
ENTRIES_DIR="${NIXOS_ROLLBACK_ENTRIES_DIR:-/boot/loader/entries}"       # one .conf per entry
LOGFILE="${NIXOS_ROLLBACK_LOGFILE:-/var/log/nixos-rollback.log}"        # local action log
SYSLOG_IDENT="nixos-rollback"                                           # tag passed to logger

# ── CLI flags ────────────────────────────────────────────────────────────────
# Defaults; main() switches them on for --dry-run / --rollback-now.
DRY_RUN=false
ROLLBACK_NOW=false

# ── Colors (disabled when not a terminal) ────────────────────────────────────
# The sequences are stored with ANSI-C quoting ($'...') so the variables hold
# the real ESC byte. The rest of the script prints them through plain `echo`
# and an unquoted here-document, neither of which interprets a literal
# backslash sequence such as \033.
if [ -t 1 ]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  CYAN=$'\033[0;36m'
  NC=$'\033[0m' # No Color
else
  RED=''
  GREEN=''
  YELLOW=''
  CYAN=''
  NC=''
fi

# =============================================================================
# Help
# =============================================================================

# usage: print the full help text to stdout.
# Globals (read): CYAN, NC, LOADER_CONF, ENTRIES_DIR, LOGFILE
# The here-document delimiter is unquoted on purpose so the colour variables
# and the configured paths are expanded into the text.
usage() {
  cat <<EOF
${CYAN}nixos-rollback.sh${NC} — Set the previous NixOS generation as systemd-boot default

${CYAN}USAGE${NC}
  nixos-rollback.sh [OPTIONS]

${CYAN}OPTIONS${NC}
  --dry-run         Show what would be done without making changes
  --rollback-now    Also run 'nixos-rebuild switch --rollback' for
                    immediate fix of the running system (requires
                    nixos-rebuild on PATH)
  -h, --help        Show this help text

${CYAN}DESCRIPTION${NC}
  Reads the current default boot entry from ${LOADER_CONF},
  determines the previous generation number, and writes it as the
  new default. The script only modifies systemd-boot config —
  it does NOT touch the Nix store or system profile unless
  --rollback-now is passed.

  Designed as the rollback half of a boot sentinel:
    1. System boots into generation N
    2. sentinel-check.sh detects Tier-1 service failures
    3. nixos-rollback.sh sets default to generation N-1
    4. Next reboot uses the working generation

${CYAN}EXIT CODES${NC}
  0   Rollback applied (or dry-run would apply)
  1   Preflight failure (missing files, permissions)
  2   No previous generation available (only one generation)
  3   nixos-rebuild --rollback failed (with --rollback-now)

${CYAN}FILES${NC}
  ${LOADER_CONF}    systemd-boot loader configuration
  ${ENTRIES_DIR}/    generation entry .conf files
  ${LOGFILE}    action log (append-only)
EOF
}

# =============================================================================
# Logging
# =============================================================================

# log: record one message in the logfile, in syslog and on the console.
# Arguments: $1 - level (INFO, WARN or ERROR); remaining args - message text
# Globals (read): LOGFILE, SYSLOG_IDENT, RED, YELLOW, GREEN, NC
# Outputs: ERROR/WARN go to stderr, INFO goes to stdout.
#
# The logfile and syslog sinks are best-effort. The script runs under
# `set -e`, so an unwritable logfile (missing directory, non-root user) or a
# missing/failing `logger` would otherwise terminate the script before the
# message is shown on the console.
log() {
  local level="$1"
  shift
  local msg="$*"
  local timestamp priority
  timestamp="$(date '+%Y-%m-%d %H:%M:%S')"

  # Sink 1: append-only local logfile (failure is tolerated, see above).
  { printf '%s [%s] %s\n' "${timestamp}" "${level}" "${msg}" >> "${LOGFILE}"; } 2>/dev/null || true

  # Sink 2: syslog. Translate the script's level names to the canonical
  # syslog priority names; anything unexpected is filed as "notice".
  case "${level}" in
    ERROR) priority="err" ;;
    WARN)  priority="warning" ;;
    INFO)  priority="info" ;;
    *)     priority="notice" ;;
  esac
  # `--` keeps a message that starts with '-' from being parsed as an option.
  logger -t "${SYSLOG_IDENT}" -p "user.${priority}" -- "${msg}" 2>/dev/null || true

  # Sink 3: console — stderr for ERROR/WARN, stdout for INFO.
  case "${level}" in
    ERROR) printf '%s\n' "${RED}[ERROR]${NC} ${msg}" >&2 ;;
    WARN)  printf '%s\n' "${YELLOW}[WARN]${NC} ${msg}" >&2 ;;
    INFO)  printf '%s\n' " ${GREEN}[INFO]${NC} ${msg}" ;;
  esac
}

# Level-specific shorthands: each forwards all of its arguments to log()
# with the level name prepended.
info() {
  log "INFO" "$@"
}

warn() {
  log "WARN" "$@"
}

error() {
  log "ERROR" "$@"
}

# =============================================================================
# Preflight checks
# =============================================================================

# preflight: verify that everything the rollback needs is in place.
# Globals (read): NIXOS_ROLLBACK_SKIP_ROOT_CHECK, ENTRIES_DIR, LOADER_CONF,
#                 LOGFILE, ROLLBACK_NOW
# Exits with status 1 (after logging an error) on the first failed check;
# returns 0 when all checks pass.
preflight() {
  # Must run as root (need to write to /boot), unless overridden for testing
  if [[ -z "${NIXOS_ROLLBACK_SKIP_ROOT_CHECK:-}" ]] && [[ "$(id -u)" -ne 0 ]]; then
    error "This script must be run as root (needs write access to /boot/loader)"
    error "Set NIXOS_ROLLBACK_SKIP_ROOT_CHECK=1 for testing against mock paths."
    exit 1
  fi

  # Directories and files
  if [[ ! -d "${ENTRIES_DIR}" ]]; then
    error "Boot entries directory not found: ${ENTRIES_DIR}"
    exit 1
  fi

  if [[ ! -f "${LOADER_CONF}" ]]; then
    error "Loader config not found: ${LOADER_CONF}"
    exit 1
  fi

  if [[ ! -r "${LOADER_CONF}" ]]; then
    error "Cannot read loader config: ${LOADER_CONF}"
    exit 1
  fi

  # Check write access to the directory holding loader.conf; the in-place
  # edit done later also writes a .bak file next to it.
  local loader_dir
  loader_dir="$(dirname "${LOADER_CONF}")"
  if [[ ! -w "${loader_dir}" ]]; then
    error "Cannot write to ${loader_dir} (insufficient permissions)"
    exit 1
  fi

  # Logfile directory must exist. Create it BEFORE logging anything about
  # it: the logging helpers append to ${LOGFILE}, which cannot succeed while
  # its directory is still missing.
  local log_dir
  log_dir="$(dirname "${LOGFILE}")"
  if [[ ! -d "${log_dir}" ]]; then
    if ! mkdir -p "${log_dir}" 2>/dev/null; then
      error "Cannot create log directory ${log_dir}"
      exit 1
    fi
    warn "Log directory ${log_dir} did not exist; created it"
  fi

  # Check --rollback-now dependencies
  if [[ "${ROLLBACK_NOW}" = true ]]; then
    if ! command -v nixos-rebuild &>/dev/null; then
      error "nixos-rebuild not found on PATH (required for --rollback-now)"
      exit 1
    fi
  fi
}

# =============================================================================
# Generation helpers
# =============================================================================

# get_current_default: print the value of the `default` setting(s) found in
# ${LOADER_CONF} — the second whitespace-separated field of every line that
# starts with "default" followed by whitespace.
# Prints nothing when there is no such line or the file cannot be read.
# Always returns 0.
get_current_default() {
  local default_lines
  # No match / unreadable file: nothing to print, but not an error either.
  default_lines="$(grep -E '^default\s+' "${LOADER_CONF}" 2>/dev/null)" || return 0
  awk '{print $2}' <<<"${default_lines}" || true
}

# extract_gen_number: reduce a boot-entry file name to its generation part.
# Arguments: $1 - entry name, e.g. "nixos-generation-367.conf"
# Outputs:   the name with the first "nixos-generation-" and the first
#            ".conf" removed, e.g. "367". The caller validates that the
#            result is numeric.
extract_gen_number() {
  local entry_name="$1"
  echo "${entry_name}" | sed -e 's/nixos-generation-//' -e 's/\.conf//'
}

# get_all_gen_numbers: list the generation numbers that have a boot entry.
# Globals (read): ENTRIES_DIR
# Outputs: one number per line, sorted numerically ascending.
# Returns: 1 (and prints nothing) when no usable entry exists.
#
# Only files named exactly nixos-generation-<digits>.conf are reported. The
# glob also matches names with extra text after the number (for example a
# suffixed variant of an entry); those are skipped because every consumer of
# this list compares the values as integers.
get_all_gen_numbers() {
  local -a gens=()
  local entry gen
  for entry in "${ENTRIES_DIR}"/nixos-generation-*.conf; do
    # An unmatched glob stays as the literal pattern; -f filters it out.
    [[ -f "${entry}" ]] || continue
    gen="${entry##*/}"              # strip directory
    gen="${gen#nixos-generation-}"  # strip prefix
    gen="${gen%.conf}"              # strip extension
    [[ "${gen}" =~ ^[0-9]+$ ]] || continue
    gens+=("${gen}")
  done

  if [[ "${#gens[@]}" -eq 0 ]]; then
    return 1
  fi

  # Sort numerically and output
  printf '%s\n' "${gens[@]}" | sort -n
}

# get_previous_gen: find the generation to roll back to.
# Arguments: $1     - current generation number
#            $2...  - available generation numbers (any order)
# Outputs:   the largest available number that is strictly lower than $1
# Returns:   1 (and prints nothing) when no such generation exists
#
# The result does not depend on the order of the list, and list items that
# are not plain non-negative integers are ignored instead of producing
# "integer expression expected" errors from the numeric comparison.
get_previous_gen() {
  local current="$1"
  shift

  local prev=""
  local g
  for g in "$@"; do
    [[ "${g}" =~ ^[0-9]+$ ]] || continue
    if [ "${g}" -lt "${current}" ]; then
      # Keep the closest predecessor seen so far.
      if [ -z "${prev}" ] || [ "${g}" -gt "${prev}" ]; then
        prev="${g}"
      fi
    fi
  done

  if [ -z "${prev}" ]; then
    return 1
  fi
  echo "${prev}"
}

# =============================================================================
# Main rollback logic
# =============================================================================

# do_rollback: point systemd-boot's default entry at the previous generation.
# Globals (read): LOADER_CONF, ENTRIES_DIR, DRY_RUN, ROLLBACK_NOW,
#                 SYSLOG_IDENT, CYAN, YELLOW, GREEN, NC
# Side effects:   rewrites ${LOADER_CONF} in place (keeping ${LOADER_CONF}.bak)
#                 and, with ROLLBACK_NOW=true, runs
#                 `nixos-rebuild switch --rollback`.
# Exits: 0 after a dry run; 1 on parse/verification failures; 2 when no older
#        generation exists; 3 when nixos-rebuild fails. Returns 0 on success.
do_rollback() {
  local active_entry active_gen target_gen target_entry target_path applied_entry
  local -a gen_list=()

  # Step 1: Read current default
  active_entry="$(get_current_default)"
  if [[ -z "${active_entry}" ]]; then
    error "No 'default' entry found in ${LOADER_CONF}"
    error "Cannot determine current generation — aborting"
    exit 1
  fi
  info "Current default boot entry: ${active_entry}"

  # Step 2: Build sorted list of all available generations
  # (a failing helper simply yields an empty list, handled right below).
  mapfile -t gen_list < <(get_all_gen_numbers || true)
  if [[ "${#gen_list[@]}" -eq 0 ]]; then
    error "No NixOS generation .conf files found in ${ENTRIES_DIR}"
    exit 1
  fi
  info "Available generations: ${gen_list[*]}"

  # Step 3: Find current generation number and make sure it is numeric
  active_gen="$(extract_gen_number "${active_entry}")"
  if [[ ! "${active_gen}" =~ ^[0-9]+$ ]]; then
    error "Could not parse generation number from '${active_entry}'"
    exit 1
  fi

  # Step 4: Find the previous generation
  if ! target_gen="$(get_previous_gen "${active_gen}" "${gen_list[@]}")"; then
    error "No previous generation found before generation ${active_gen}"
    error "This is the oldest available generation — cannot roll back further"
    exit 2
  fi

  target_entry="nixos-generation-${target_gen}.conf"
  target_path="${ENTRIES_DIR}/${target_entry}"
  if [[ ! -f "${target_path}" ]]; then
    error "Previous generation entry not found: ${target_path}"
    error "The .conf file for generation ${target_gen} is missing — cannot roll back"
    exit 1
  fi
  info "Target rollback generation: ${target_gen} → ${target_entry}"

  # Step 5: Apply the rollback (or only describe it in dry-run mode)
  if [[ "${DRY_RUN}" = true ]]; then
    echo ""
    echo " ${CYAN}[DRY RUN]${NC} Would change ${LOADER_CONF}:"
    echo " ${YELLOW}-${NC} default ${active_entry}"
    echo " ${GREEN}+${NC} default ${target_entry}"
    echo ""
    info "DRY RUN — no changes made"
    exit 0
  fi

  # Write new default; sed keeps the untouched file as ${LOADER_CONF}.bak
  sed -i.bak "s/^default\s\+${active_entry}/default ${target_entry}/" "${LOADER_CONF}"

  # Verify the change by reading the file back
  applied_entry="$(get_current_default)"
  if [[ "${applied_entry}" != "${target_entry}" ]]; then
    error "Failed to set default boot entry to ${target_entry}"
    error "Current default is still: ${applied_entry}"
    # Attempt to restore backup
    if [[ -f "${LOADER_CONF}.bak" ]]; then
      cp "${LOADER_CONF}.bak" "${LOADER_CONF}"
      info "Restored backup from ${LOADER_CONF}.bak"
    fi
    exit 1
  fi

  info "Successfully set default boot entry to ${target_entry} (generation ${target_gen})"
  info "Backup of previous config saved to ${LOADER_CONF}.bak"

  # Step 6: Optionally run nixos-rebuild switch --rollback
  if [[ "${ROLLBACK_NOW}" = true ]]; then
    local rebuild_rc=0
    local out_line
    echo ""
    info "Running nixos-rebuild switch --rollback for immediate effect..."
    # Relay every output line to syslog and the console. The status captured
    # here is that of the whole pipeline.
    nixos-rebuild switch --rollback 2>&1 | while IFS= read -r out_line; do
      logger -t "${SYSLOG_IDENT}" "nixos-rebuild: ${out_line}"
      echo " ${out_line}"
    done || rebuild_rc=$?

    if [[ "${rebuild_rc}" -ne 0 ]]; then
      error "nixos-rebuild switch --rollback failed with exit code ${rebuild_rc}"
      error "The boot default has been changed but the current system was NOT rolled back"
      error "Reboot to apply the rollback"
      exit 3
    fi
    info "nixos-rebuild switch --rollback completed successfully"
  fi

  info "Rollback complete. Next boot will use generation ${target_gen}."
  if [[ "${ROLLBACK_NOW}" = false ]]; then
    echo ""
    echo " ${YELLOW}NOTE:${NC} The current running system is unchanged."
    echo " Reboot to boot into generation ${target_gen}."
    echo " Or re-run with --rollback-now for immediate effect."
  fi
}

# =============================================================================
# Main
# =============================================================================

# main: parse the command line, run the preflight checks, then roll back.
# Arguments: the script's command-line arguments
#   --dry-run        sets DRY_RUN=true
#   --rollback-now   sets ROLLBACK_NOW=true
#   -h, --help       prints the help text and exits 0
#   anything else    is reported on stderr and exits 1
main() {
  # Arguments are handled strictly in order; the first --help or unknown
  # option ends the script immediately.
  local opt
  for opt in "$@"; do
    case "${opt}" in
      --dry-run)
        DRY_RUN=true
        ;;
      --rollback-now)
        ROLLBACK_NOW=true
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo >&2 "Unknown option: ${opt}"
        echo >&2 "Use --help for usage information."
        exit 1
        ;;
    esac
  done

  # Banner
  echo ""
  echo " ${CYAN}═══ NixOS systemd-boot Rollback ═══${NC}"
  echo ""

  preflight

  # Announce the selected modes (only after preflight succeeded).
  if [[ "${DRY_RUN}" = true ]]; then
    info "DRY RUN mode — no changes will be made"
  fi
  if [[ "${ROLLBACK_NOW}" = true ]]; then
    info "ROLLBACK NOW mode — will also run nixos-rebuild switch --rollback"
  fi

  echo ""
  do_rollback
}

# Entry point: hand every command-line argument to main unchanged.
main "$@"