Files
disk-health/old/alma-v2.4.sh
2025-10-22 04:35:22 +08:00

537 lines
20 KiB
Bash
Executable File

#!/bin/bash
# Disk Health Check Script for Alma Linux 9
# Enhanced with SAS/PERC H730P controller support
# Checks SSD TBW/lifespan and HDD health status
# Script identity, shown by usage() and the --version output.
SCRIPT_NAME=$(basename "$0")
VERSION="2.4"

# Color codes.
# tput reports an error on stderr and prints nothing when it cannot resolve
# the terminal (for example when TERM is unset). Its stderr is discarded so
# that in that case the variables are simply empty and output stays plain.
RED=$(tput setaf 1 2>/dev/null)
GREEN=$(tput setaf 2 2>/dev/null)
YELLOW=$(tput setaf 3 2>/dev/null)
BLUE=$(tput setaf 4 2>/dev/null)
CYAN=$(tput setaf 6 2>/dev/null)
NC=$(tput sgr0 2>/dev/null)
# Print a message wrapped in a color sequence.
# Arguments:
#   $1 - text emitted before the message (a color escape sequence)
#   $2 - the message
# Globals: NC is appended after the message.
# Outputs: one line on stdout; echo -e also expands backslash escapes.
print_color() {
  echo -e "$1$2${NC}"
}
# Report whether a command name resolves in the current shell.
# Arguments: $1 - command name to look up
# Returns:   0 when `command -v` finds it, 1 otherwise. Prints nothing.
command_exists() {
  local tool=$1
  if command -v "$tool" &>/dev/null; then
    return 0
  fi
  return 1
}
# Verify that the external tools this script relies on are installed.
# Globals:  RED (read)
# Outputs:  when something is missing, an error line plus a dnf install hint
# Exits:    with status 1 when at least one tool is missing; otherwise
#           returns 0 and prints nothing.
check_dependencies() {
  local -a missing=()
  local spec tool package

  # Each entry is "<command>:<package that provides it>".
  for spec in smartctl:smartmontools bc:bc; do
    tool=${spec%%:*}
    package=${spec#*:}
    command_exists "$tool" || missing+=("$package")
  done

  if (( ${#missing[@]} > 0 )); then
    # RED is quoted so that an empty color still occupies the first argument
    # and the message stays in the second.
    print_color "$RED" "Error: Missing required packages: ${missing[*]}"
    echo "Install with: sudo dnf install ${missing[*]}"
    exit 1
  fi
}
# Probe how much SMART data smartctl can read from a disk.
# SMART is only queried here, never switched on.
# Arguments:
#   $1 - device path handed to smartctl
#   $2 - optional smartctl device type for -d (empty for direct access)
# Outputs (stdout), exactly one of:
#   no_access          - `smartctl -i` exited non-zero
#   not_available      - the -i output has no "SMART support is: Available" line
#   disabled           - the -i output has no "SMART support is: Enabled" line
#   no_attributes      - `smartctl -A` printed nothing
#   limited_attributes - no Power_On_Hours / Power-On row with a 10th column
#   full_access        - every check above passed
test_smart_access() {
  local disk=$1
  local controller=$2

  # Build the command as an array so the controller stays a single argument.
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

  # A single -i call provides both the access check and the text to parse.
  local smart_info
  if ! smart_info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null); then
    echo "no_access"
    return
  fi

  # smartctl prints availability and enablement as two separate
  # "SMART support is:" lines, so each state is matched on its own line.
  # "Unavailable" does not match because the pattern is case-sensitive.
  if ! grep -Eq 'SMART support is:[[:space:]]+Available' <<<"$smart_info"; then
    echo "not_available"
    return
  fi
  if ! grep -Eq 'SMART support is:[[:space:]]+Enabled' <<<"$smart_info"; then
    echo "disabled"
    return
  fi

  # Test attribute reading
  local attributes
  attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
  if [[ -z "$attributes" ]]; then
    echo "no_attributes"
    return
  fi

  # Column 10 is where the raw value sits in an attribute table row.
  local power_on_hours
  power_on_hours=$(grep -i "Power_On_Hours\|Power-On" <<<"$attributes" \
    | awk '{print $10}' | head -1)
  if [[ -z "$power_on_hours" ]]; then
    echo "limited_attributes"
    return
  fi

  echo "full_access"
}
# Print the text after the first colon of the first line in $1 that matches
# the case-insensitive grep pattern $2, with leading blanks removed.
_smart_info_field() {
  grep -i -- "$2" <<<"$1" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1
}

# Print the last column of the first line in $1 that matches the
# case-insensitive grep pattern $2.
_smart_attr_raw() {
  grep -i -- "$2" <<<"$1" | awk '{print $NF}' | head -1
}

# Collect identity, health and wear data for one disk from smartctl.
# Arguments:
#   $1 - device path handed to smartctl
#   $2 - optional smartctl device type for -d (empty for direct access)
# Outputs: one line of 12 '|'-separated fields (missing values are empty):
#   model|serial|capacity|firmware|health_status|disk_type|power_on_hours|
#   reallocated_sectors|pending_sectors|total_written|host_writes_32mib|
#   media_wearout
#   - capacity is the text before the '[' of the capacity line
#   - disk_type is SSD, HDD or UNKNOWN
#   - total_written is the raw Total_LBAs_Written / Lifetime_Writes* value
#   - host_writes_32mib is the raw Host_Writes_32MiB value
#   - media_wearout is the normalized VALUE column of the wear attribute
get_disk_info() {
  local disk=$1
  local controller=$2

  # Build the command as an array so the controller stays a single argument.
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

  local info attributes health
  info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)
  attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
  health=$("${smart_cmd[@]}" -H "$disk" 2>/dev/null)

  # Identity fields, with alternative labels as fallbacks for SAS drives.
  local model vendor serial capacity firmware health_status
  model=$(_smart_info_field "$info" "Device Model:\|Product:")
  vendor=$(_smart_info_field "$info" "Vendor:")
  [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model"
  serial=$(_smart_info_field "$info" "Serial Number:\|Serial number:")
  capacity=$(_smart_info_field "$info" "User Capacity:\|Total NVM Capacity:" \
    | cut -d'[' -f1)
  firmware=$(_smart_info_field "$info" \
    "Firmware Version:\|Firmware revision:\|Revision:")
  # "SMART Health Status:" is the label SCSI/SAS devices use for the verdict.
  health_status=$(_smart_info_field "$health" \
    "result:\|SMART overall-health\|SMART Health Status:")

  # Get disk type with SAS support
  local disk_type="UNKNOWN"
  if grep -qi "Solid State Device" <<<"$info"; then
    disk_type="SSD"
  elif grep -qi "Rotation Rate" <<<"$info"; then
    disk_type="HDD"
  elif grep -qi "SCSI\|SAS" <<<"$info"; then
    # SAS drives often don't specify, so look for a spindle speed instead.
    if grep -qi "15000\|10000\|7200" <<<"$info"; then
      disk_type="HDD"
    else
      disk_type="SSD"
    fi
  fi

  local power_on_hours reallocated_sectors pending_sectors
  local total_written host_writes_32mib media_wearout
  power_on_hours=$(_smart_attr_raw "$attributes" "Power_On_Hours\|Power-On")
  # Keep only the leading digits so a raw value with a trailing suffix
  # (e.g. "12345h+30m+10.5s") remains usable in integer comparisons.
  power_on_hours=${power_on_hours%%[!0-9]*}
  reallocated_sectors=$(_smart_attr_raw "$attributes" \
    "Reallocated_Sector_Ct\|Reallocated_Event_Count")
  pending_sectors=$(_smart_attr_raw "$attributes" "Current_Pending_Sector")

  # Host_Writes_32MiB is deliberately not matched here: it is counted in
  # 32 MiB units and has its own field, while this field is consumed as a
  # sector count.
  # NOTE(review): Lifetime_Writes* attributes may use other units (e.g. GiB);
  # they are passed through unchanged - confirm against the drives in use.
  total_written=$(_smart_attr_raw "$attributes" \
    "Total_LBAs_Written\|Lifetime_Writes")
  host_writes_32mib=$(_smart_attr_raw "$attributes" "Host_Writes_32MiB")

  # Use the normalized VALUE column (4th), which the lifespan estimate treats
  # as percent remaining. "+ 0" drops leading zeros ("099" -> 99).
  media_wearout=$(grep -i "Media_Wearout_Indicator\|Wear_Leveling_Count" \
    <<<"$attributes" | awk '{print $4 + 0}' | head -1)

  echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout"
}
# Convert a drive's write counter into terabytes written (decimal TB).
# Arguments:
#   $1 - host writes counted in 32 MiB units (used only when $2 is unusable)
#   $2 - total writes counted in 512-byte sectors (preferred)
# Outputs: the amount as "N.NN" (truncated to two decimals), or "0" when
#          neither argument is a positive integer.
calculate_tbw() {
  local raw_value=$1
  local sectors=$2
  local bytes

  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if [[ "$sectors" =~ ^[0-9]+$ ]] && (( 10#$sectors > 0 )); then
    bytes=$(( 10#$sectors * 512 ))
  elif [[ "$raw_value" =~ ^[0-9]+$ ]] && (( 10#$raw_value > 0 )); then
    # One unit is 32 MiB = 32 * 1048576 bytes (binary, not 32 * 10^6).
    bytes=$(( 10#$raw_value * 32 * 1048576 ))
  else
    echo "0"
    return
  fi

  # Integer arithmetic only: 10^10 bytes is one hundredth of a terabyte.
  local hundredths=$(( bytes / 10000000000 ))
  printf '%d.%02d\n' "$(( hundredths / 100 ))" "$(( hundredths % 100 ))"
}
# Guess an SSD's rated write endurance from its model string and size.
# Arguments:
#   $1 - model string (matched case-insensitively against known name patterns)
#   $2 - capacity in whole GB
# Outputs: the estimated endurance in TB
estimate_ssd_endurance() {
  local disk_model=$1
  local capacity_gb=$2
  local -a tiers
  local fallback tier

  # Each tier is "<minimum capacity in GB>:<endurance in TB>", largest first;
  # fallback applies to drives below the smallest tier.
  if echo "$disk_model" | grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD"; then
    # Enterprise SAS SSDs - very high endurance (10PB / 6PB / 4PB, else 2PB)
    tiers=(1000:10000 600:6000 400:4000)
    fallback=2000
  elif echo "$disk_model" | grep -qi "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA"; then
    # Enterprise SATA SSDs (1.2PB / 600TB / 300TB, else 150TB)
    tiers=(1000:1200 480:600 240:300)
    fallback=150
  else
    # Consumer SSDs (600TB / 300TB / 150TB / 80TB, else 40TB)
    tiers=(1000:600 480:300 240:150 120:80)
    fallback=40
  fi

  for tier in "${tiers[@]}"; do
    if [[ $capacity_gb -ge ${tier%%:*} ]]; then
      echo "${tier#*:}"
      return
    fi
  done
  echo "$fallback"
}
# Estimate how much life an SSD has left.
# Arguments:
#   $1 - power-on hours (empty or zero means no estimate is made)
#   $2 - terabytes written so far (defaults to 0)
#   $3 - model string, forwarded to estimate_ssd_endurance
#   $4 - capacity in GB, forwarded to estimate_ssd_endurance
#   $5 - media wearout value, read as percent of life remaining
# Globals: RED, YELLOW, GREEN, NC (read)
# Outputs: "<percent>|<TBW remaining>|<status>|<source>" where source is
#          media_wearout, tbw or estimated; or "Unknown||Unknown||Unknown"
#          (fields 2 and 4 empty) when power-on hours are unknown.
estimate_ssd_lifespan() {
  local power_on_hours=$1
  local tbw_used=${2:-0}
  local disk_model=$3
  local capacity_gb=$4
  local media_wearout=$5

  if [[ -z "$power_on_hours" ]] \
    || { [[ "$power_on_hours" =~ ^[0-9]+$ ]] && (( 10#$power_on_hours == 0 )); }; then
    echo "Unknown||Unknown||Unknown"
    return
  fi

  local estimated_endurance tbw_remaining
  estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")
  tbw_remaining=$(echo "scale=2; $estimated_endurance - $tbw_used" | bc 2>/dev/null || echo "0")

  # If we have a usable media wearout value, prefer it over the TBW estimate.
  # Only a positive integer qualifies; anything else falls through to TBW.
  if [[ "$media_wearout" =~ ^[0-9]+$ ]] && (( 10#$media_wearout != 0 )); then
    local wear=$(( 10#$media_wearout ))
    local color status
    if (( wear <= 10 )); then
      color=$RED
      status="Critical wear"
    elif (( wear <= 30 )); then
      color=$YELLOW
      status="High wear"
    elif (( wear <= 70 )); then
      color=$YELLOW
      status="Moderate wear"
    else
      color=$GREEN
      status="Healthy"
    fi
    # The actual value is reported in every band, including the critical one.
    echo "${color}${wear}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|media_wearout"
    return
  fi

  if [[ $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then
    local lifespan_used lifespan_remaining
    lifespan_used=$(echo "scale=1; $tbw_used * 100 / $estimated_endurance" | bc 2>/dev/null || echo "0")
    lifespan_remaining=$(echo "scale=1; 100 - $lifespan_used" | bc 2>/dev/null || echo "100")
    if [[ $(echo "$lifespan_used >= 80" | bc 2>/dev/null) -eq 1 ]]; then
      echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}High wear${NC}|tbw"
    elif [[ $(echo "$lifespan_used >= 50" | bc 2>/dev/null) -eq 1 ]]; then
      echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|tbw"
    else
      echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|tbw"
    fi
  else
    # No write data at all: report the full estimated endurance.
    echo "Unknown|${estimated_endurance} TB|New|estimated"
  fi
}
# Produce a rough remaining-life verdict for a hard disk.
# Arguments:
#   $1 - power-on hours (empty means no verdict is possible)
#   $2 - reallocated sector count (defaults to 0)
#   $3 - pending sector count (defaults to 0)
# Globals: RED, YELLOW, GREEN, NC (read)
# Outputs: "Unknown", or a colored verdict followed by the reason in
#          parentheses. Checks run from most to least severe.
estimate_hdd_lifespan() {
  local hours=$1
  local realloc=${2:-0}
  local pending=${3:-0}

  if [[ -z "$hours" ]]; then
    echo "Unknown"
    return
  fi

  # Sectors waiting for reallocation outrank every other signal.
  if [[ "$pending" -gt 0 ]]; then
    echo "${RED}CRITICAL${NC} (Pending sectors: $pending)"
    return
  fi

  if [[ "$realloc" -gt 100 ]]; then
    echo "${RED}< 6 months${NC} (High reallocated sectors: $realloc)"
    return
  fi
  if [[ "$realloc" -gt 10 ]]; then
    echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $realloc)"
    return
  fi

  # No significant sector damage: grade by accumulated running time.
  if [[ "$hours" -gt 40000 ]]; then
    echo "${YELLOW}1-2 years${NC} (High usage: $hours hours)"
    return
  fi
  if [[ "$hours" -gt 25000 ]]; then
    echo "${GREEN}2-3 years${NC} (Moderate usage: $hours hours)"
    return
  fi
  echo "${GREEN}> 3 years${NC} (Low usage: $hours hours)"
}
# Convert a capacity description into whole gigabytes.
# Accepts "<n> GB", "<n> TB", or a leading byte count with optional thousands
# separators (e.g. "500,107,862,016 bytes"). Prints 0 when nothing matches.
_capacity_to_gb() {
  local capacity=$1
  local gb=""
  local byte_count_re='^[[:space:]]*([0-9][0-9,]*)'

  if grep -qi "GB" <<<"$capacity"; then
    gb=$(grep -o '[0-9.]*' <<<"$capacity" | head -1 | cut -d. -f1)
  elif grep -qi "TB" <<<"$capacity"; then
    gb=$(grep -o '[0-9.]*' <<<"$capacity" | head -1 \
      | awk '{print $1 * 1000}' | cut -d. -f1)
  elif [[ "$capacity" =~ $byte_count_re ]]; then
    # No unit suffix: treat the leading number as bytes. Commas are removed
    # and 10# keeps a leading zero from being read as octal.
    local bytes=${BASH_REMATCH[1]//,/}
    gb=$(( 10#$bytes / 1000000000 ))
  fi
  echo "${gb:-0}"
}

# Print a full health report for one disk.
# Arguments:
#   $1 - device path
#   $2 - optional smartctl device type (shown as "direct" when empty)
# Globals: CYAN, RED, YELLOW (read)
# Outputs: the report on stdout. When test_smart_access reports no_access,
#          not_available, disabled or no_attributes, an explanation is
#          printed and the function returns early.
check_disk() {
  local disk=$1
  local controller=$2

  print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
  echo "=================================================="

  # Test SMART access level
  local access_level
  access_level=$(test_smart_access "$disk" "$controller")
  case "$access_level" in
    no_access)
      print_color "$RED" "ERROR: Cannot access disk through controller"
      echo "Possible reasons:"
      echo " - Controller doesn't support SMART passthrough"
      echo " - Disk is part of a hardware RAID array"
      echo " - Insufficient permissions (try running as root)"
      echo " - Controller busy or offline"
      echo ""
      return
      ;;
    not_available)
      print_color "$YELLOW" "SMART not available on this disk"
      echo "This disk does not support SMART monitoring"
      echo ""
      return
      ;;
    disabled)
      print_color "$YELLOW" "SMART is disabled on this disk"
      echo "SMART is available but currently disabled"
      echo "To enable manually: smartctl -s on ${controller:+-d $controller} $disk"
      echo ""
      return
      ;;
    no_attributes)
      print_color "$YELLOW" "WARNING: Cannot read SMART attributes"
      echo "This is common with hardware RAID controllers like PERC H730P"
      echo "Try checking through the RAID management interface"
      echo ""
      return
      ;;
    limited_attributes)
      # Not fatal: report what is available.
      print_color "$YELLOW" "NOTE: Limited SMART data available"
      echo "Controller is filtering some SMART attributes"
      ;;
  esac

  # Get disk information. The fields are declared local so one disk's values
  # cannot leak into the global scope or into the next disk's report.
  local disk_info
  disk_info=$(get_disk_info "$disk" "$controller")
  local model serial capacity firmware health_status disk_type
  local power_on_hours reallocated_sectors pending_sectors
  local total_written host_writes_32mib media_wearout
  IFS='|' read -r model serial capacity firmware health_status disk_type \
    power_on_hours reallocated_sectors pending_sectors total_written \
    host_writes_32mib media_wearout <<< "$disk_info"

  # Display basic information
  echo "Model: ${model:-Unknown}"
  echo "Serial: ${serial:-Unknown}"
  echo "Type: $disk_type"
  echo "Capacity: ${capacity:-Unknown}"
  echo "Firmware: ${firmware:-Unknown}"
  echo "Health: ${health_status:-Unknown}"

  # Only show power on hours if available
  if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
    echo "Power On Hours: $power_on_hours"
  else
    echo "Power On Hours: Unknown"
  fi

  # Disk type specific analysis
  if [[ "$disk_type" == "SSD" ]]; then
    # Prefer the sector counter; fall back to the 32 MiB counter.
    local tbw_used=0
    if [[ -n "$total_written" && "$total_written" != "0" ]]; then
      tbw_used=$(calculate_tbw "" "$total_written")
    elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
      tbw_used=$(calculate_tbw "$host_writes_32mib" "")
    fi

    # Evaluated once and reused for both TBW lines below.
    local has_writes
    has_writes=$(echo "$tbw_used > 0" | bc 2>/dev/null)
    if [[ "$has_writes" -eq 1 ]]; then
      echo "TBW Used: ${tbw_used} TB"
    fi

    # Estimate capacity for endurance calculation
    local capacity_gb
    capacity_gb=$(_capacity_to_gb "$capacity")

    local lifespan_info
    lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$model" "$capacity_gb" "$media_wearout")
    # The trailing "_" absorbs any fields beyond the fourth.
    local lifespan_percent tbw_remaining wear_status wear_source _
    IFS='|' read -r lifespan_percent tbw_remaining wear_status wear_source _ \
      <<< "$lifespan_info"

    if [[ "$wear_source" != "media_wearout" && "$has_writes" -eq 1 ]]; then
      echo "TBW Remaining: $tbw_remaining"
    fi
    echo "Lifespan: $lifespan_percent ($wear_status)"

    # Show wear source if available
    case "$wear_source" in
      media_wearout) echo "Wear Source: Media Wearout Indicator" ;;
      tbw) echo "Wear Source: TBW Calculation" ;;
      estimated) echo "Wear Source: Estimated Endurance" ;;
    esac
  elif [[ "$disk_type" == "HDD" ]]; then
    if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
      echo "Realloc Sectors: $reallocated_sectors"
    fi
    if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
      echo "Pending Sectors: $pending_sectors"
    fi
    local lifespan
    lifespan=$(estimate_hdd_lifespan "$power_on_hours" "${reallocated_sectors:-0}" "${pending_sectors:-0}")
    echo "Lifespan: $lifespan"
  else
    print_color "$YELLOW" "Limited information available for this disk type"
    echo "This is normal for hardware RAID configurations like PERC H730P"
    echo "For detailed SAS drive information, use controller management tools"
  fi
  echo ""
}
# List whole-disk device nodes present on this machine (no partitions).
# Outputs: the matching paths on a single line, separated by spaces.
# Every candidate must pass the block-device test; a glob that matches
# nothing stays a literal pattern and is dropped by that same test.
detect_disks() {
  local disks=()

  # Candidates, in order: SATA/SAS, NVMe namespaces, SCSI generic nodes,
  # then virtio and Xen virtual disks.
  # NOTE(review): /dev/sg* nodes are normally character devices, so the
  # block-device test probably never accepts them - confirm the intent.
  for disk in /dev/sd[a-z] /dev/nvme[0-9]n[0-9] /dev/sg[0-9] /dev/vd[a-z] /dev/xvd[a-z]; do
    [[ -b "$disk" ]] && disks+=("$disk")
  done

  echo "${disks[@]}"
}
# Main function: print the banner, pick the disks and report on each one.
# Arguments: optional device paths; with none, disks are auto-detected.
# Globals:   BLUE, CYAN, RED, GREEN, YELLOW, VERSION, EUID (read)
# Exits:     with status 1 when no usable disk remains (check_dependencies
#            may also exit when a required tool is missing).
main() {
  print_color "$BLUE" "Alma Linux 9 Disk Health Check Script v$VERSION"
  print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
  print_color "$BLUE" "============================================"
  echo ""

  check_dependencies

  local -a disks=()
  local disk

  # If specific disks were provided, check only those.
  if [[ $# -gt 0 ]]; then
    for disk in "$@"; do
      # Character devices are accepted too, so SCSI generic nodes such as
      # /dev/sg0 (listed in the usage examples) can be checked.
      if [[ -b "$disk" || -c "$disk" ]]; then
        disks+=("$disk")
      else
        print_color "$RED" "Error: $disk is not a valid block or character device"
      fi
    done
  else
    # Auto-detect disks
    print_color "$CYAN" "Auto-detecting disks (excluding partitions)..."
    read -ra disks <<< "$(detect_disks)"
  fi

  if [[ ${#disks[@]} -eq 0 ]]; then
    print_color "$RED" "No disks found or accessible"
    echo "Try running as root or specifying disk paths manually"
    exit 1
  fi

  print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
  echo ""

  # Check if running as root, warn if not
  if [[ $EUID -ne 0 ]]; then
    print_color "$YELLOW" "Warning: Not running as root."
    print_color "$YELLOW" "Some disks/controllers may show limited information."
    echo "For complete results, run as: sudo $0"
    echo ""
  fi

  # Check each disk
  for disk in "${disks[@]}"; do
    check_disk "$disk"
  done

  print_color "$BLUE" "Check completed!"
  echo ""
  print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
  print_color "$CYAN" " - Install 'storcli' for detailed controller information"
  print_color "$CYAN" " - Use 'smartctl -d sat /dev/sgX' to try direct access"
  print_color "$CYAN" " - Hardware RAID controllers often limit SMART data access"
}
# Print command-line help.
# Globals: SCRIPT_NAME (read)
# Outputs: the usage text on stdout, one printf argument per line.
usage() {
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" \
    "" \
    "If no disks specified, auto-detects all available disks" \
    "" \
    "Examples:" \
    " $SCRIPT_NAME # Check all auto-detected disks" \
    " sudo $SCRIPT_NAME # Check all disks (as root)" \
    " $SCRIPT_NAME /dev/sda # Check specific disk" \
    " $SCRIPT_NAME /dev/sg0 # Check SAS disk directly" \
    " $SCRIPT_NAME /dev/sda /dev/nvme0n1 # Check multiple disks"
}
# Entry point: only the first argument is inspected for an option.
# -h/--help and -v/--version print and exit 0; anything else (including no
# arguments) is passed to main untouched.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
  usage
  exit 0
elif [[ "${1:-}" == "-v" || "${1:-}" == "--version" ]]; then
  echo "$SCRIPT_NAME version $VERSION"
  exit 0
else
  main "$@"
fi