#!/bin/bash
# Disk Health Check Script for Ubuntu 24.04
# Enhanced with SAS/PERC H730P controller support
# Checks SSD TBW/lifespan and HDD health status
#
# Requires: smartmontools (smartctl) and bc.
# Run with --help for usage.

SCRIPT_NAME=$(basename "$0")
VERSION="2.6"

# Color codes. tput fails when TERM is unset or unknown (cron, pipes); in that
# case the variables stay empty and output is simply uncolored.
RED=$(tput setaf 1 2>/dev/null || true)
GREEN=$(tput setaf 2 2>/dev/null || true)
YELLOW=$(tput setaf 3 2>/dev/null || true)
BLUE=$(tput setaf 4 2>/dev/null || true)
CYAN=$(tput setaf 6 2>/dev/null || true)
NC=$(tput sgr0 2>/dev/null || true)

#######################################
# Print a message wrapped in a color escape sequence.
# Arguments: $1 - color escape sequence (may be empty)
#            $2 - message
# Outputs:   the message on stdout
#######################################
print_color() {
  local color=$1
  local message=$2
  echo -e "${color}${message}${NC}"
}

#######################################
# Check whether a command is available on PATH.
# Arguments: $1 - command name
# Returns:   0 if found, non-zero otherwise
#######################################
command_exists() {
  command -v "$1" >/dev/null 2>&1
}

#######################################
# Verify that smartctl and bc are installed; exit 1 with an install hint
# when either is missing.
#######################################
check_dependencies() {
  local missing=()

  if ! command_exists smartctl; then
    missing+=("smartmontools")
  fi
  if ! command_exists bc; then
    missing+=("bc")
  fi

  if [[ ${#missing[@]} -gt 0 ]]; then
    print_color "$RED" "Error: Missing required packages: ${missing[*]}"
    echo "Install with: sudo apt update && sudo apt install ${missing[*]}"
    exit 1
  fi
}

#######################################
# Return the text after the first colon of the first line matching a pattern.
# Arguments: $1 - text to search
#            $2 - grep basic-regex pattern (matched case-insensitively)
# Outputs:   the field with leading whitespace removed (empty if no match)
#######################################
_smart_field() {
  grep -i -- "$2" <<<"$1" | head -1 | cut -d: -f2 | sed 's/^[[:space:]]*//'
}

#######################################
# Return one whitespace-separated column of the first attribute line
# matching a pattern.
# Arguments: $1 - output of 'smartctl -A'
#            $2 - grep basic-regex pattern (matched case-insensitively)
#            $3 - column number (4 = normalized VALUE, 10 = RAW_VALUE)
# Outputs:   the column text (empty if no match)
#######################################
_ata_attr() {
  grep -i -- "$2" <<<"$1" | head -1 | awk -v col="$3" '{print $col}'
}

#######################################
# Return the numeric value of a "Key: value" line from the NVMe health log.
# Anything from '[' onward (the human-readable size) is dropped and
# thousands separators / unit suffixes are removed.
# Arguments: $1 - output of 'smartctl -A'
#            $2 - key name, without the trailing colon
# Outputs:   digits only (empty if the key is absent)
#######################################
_nvme_number() {
  local value
  value=$(grep -i -- "^$2:" <<<"$1" | head -1 | cut -d: -f2 | cut -d'[' -f1)
  printf '%s' "${value//[!0-9]/}"
}

#######################################
# Convert the capacity text reported by smartctl into whole gigabytes.
# Accepts either a "<n> GB" / "<n> TB" figure or a plain byte count such as
# "480,103,981,056 bytes".
# Arguments: $1 - capacity text
# Outputs:   integer number of GB (0 when nothing can be parsed)
#######################################
_capacity_to_gb() {
  local capacity=$1
  local gb=0

  if [[ "$capacity" =~ ([0-9]+)(\.[0-9]+)?[[:space:]]*([GgTt])[Bb] ]]; then
    local whole=${BASH_REMATCH[1]}
    local frac=${BASH_REMATCH[2]}
    local unit=${BASH_REMATCH[3]}
    if [[ "$unit" == [Tt] ]]; then
      gb=$(echo "scale=0; (${whole}${frac} * 1000) / 1" | bc 2>/dev/null)
    else
      gb=$((10#$whole))
    fi
  else
    # No unit present: treat the digits as a byte count.
    local digits=${capacity//[!0-9]/}
    if [[ -n "$digits" ]]; then
      gb=$((10#$digits / 1000000000))
    fi
  fi

  echo "${gb:-0}"
}

#######################################
# Probe how much SMART data smartctl can read from a disk.
# Arguments: $1 - device path
#            $2 - optional smartctl device type (e.g. "megaraid,0")
# Outputs:   one of: no_access, not_available, disabled, no_attributes,
#            full_access
#######################################
test_smart_access() {
  local disk=$1
  local controller=$2
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

  # Basic identity query; any failure is reported as no access.
  local smart_info
  if ! smart_info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null); then
    echo "no_access"
    return
  fi

  # NVMe drives have no "SMART support is:" lines or ATA attribute table,
  # so they are judged by the health query alone.
  if grep -qi "NVMe" <<<"$smart_info"; then
    if "${smart_cmd[@]}" -H "$disk" &>/dev/null; then
      echo "full_access"
    else
      echo "no_attributes"
    fi
    return
  fi

  # SATA/SAS: SMART must be both available and enabled.
  local support_lines
  support_lines=$(grep "SMART support is:" <<<"$smart_info")
  if [[ -z "$support_lines" ]]; then
    echo "not_available"
    return
  fi
  if ! grep -q "Available" <<<"$support_lines"; then
    echo "not_available"
    return
  fi
  if ! grep -q "Enabled" <<<"$support_lines"; then
    echo "disabled"
    return
  fi

  # The attribute table must actually be readable.
  local attributes
  attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
  if ! grep -q "ATTRIBUTE_NAME" <<<"$attributes"; then
    echo "no_attributes"
    return
  fi

  echo "full_access"
}

#######################################
# Collect identity, health and wear data for one disk.
# Arguments: $1 - device path
#            $2 - optional smartctl device type (e.g. "megaraid,0")
# Outputs:   one line of '|'-separated fields:
#            model|serial|capacity|firmware|health_status|disk_type|
#            power_on_hours|reallocated_sectors|pending_sectors|
#            total_written|host_writes_32mib|media_wearout|written_attr
#            - total_written is the raw write counter and written_attr names
#              the attribute it came from (this determines its unit)
#            - media_wearout is the percentage of life remaining
#              (empty when the drive reports no wear indicator)
#######################################
get_disk_info() {
  local disk=$1
  local controller=$2
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

  local info attributes health
  info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)
  attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
  health=$("${smart_cmd[@]}" -H "$disk" 2>/dev/null)

  # Identity fields; each pattern lists the ATA, SCSI and NVMe spellings.
  local model vendor serial capacity firmware health_status
  model=$(_smart_field "$info" "Device Model:\|Product:\|Model Number:")
  vendor=$(_smart_field "$info" "Vendor:")
  [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model"
  serial=$(_smart_field "$info" "Serial Number:\|Serial number:")
  capacity=$(_smart_field "$info" \
    "User Capacity:\|Total NVM Capacity:\|Namespace 1 Size/Capacity:" \
    | cut -d'[' -f1)
  firmware=$(_smart_field "$info" \
    "Firmware Version:\|Firmware revision:\|Revision:")
  health_status=$(_smart_field "$health" \
    "result:\|SMART overall-health\|Health Status:")
  if [[ -z "$health_status" ]]; then
    health_status=$(grep -i "SMART overall-health" <<<"$health" \
      | awk -F'[' '{print $2}' | cut -d']' -f1)
  fi

  # Disk type
  local disk_type="UNKNOWN"
  if grep -qi "Solid State Device\|NVMe" <<<"$info"; then
    disk_type="SSD"
  elif grep -qi "Rotation Rate" <<<"$info"; then
    disk_type="HDD"
  elif grep -qi "SCSI\|SAS" <<<"$info"; then
    if grep -qi "15000\|10000\|7200" <<<"$info"; then
      disk_type="HDD"
    else
      disk_type="SSD"
    fi
  fi

  local power_on_hours="" reallocated_sectors="" pending_sectors=""
  local total_written="" host_writes_32mib="" media_wearout=""
  local written_attr=""

  if grep -qi "Data Units Written" <<<"$attributes"; then
    # NVMe health log: "Key: value" lines, not the ATA attribute table.
    power_on_hours=$(_nvme_number "$attributes" "Power On Hours")
    total_written=$(_nvme_number "$attributes" "Data Units Written")
    [[ -n "$total_written" ]] && written_attr="Data Units Written"

    # "Percentage Used" counts up and may exceed 100; convert it to
    # remaining life and clamp at 0.
    local used
    used=$(_nvme_number "$attributes" "Percentage Used")
    if [[ -n "$used" ]]; then
      media_wearout=$((100 - 10#$used))
      ((media_wearout < 0)) && media_wearout=0
    fi
  else
    # ATA attribute table: column 10 is RAW_VALUE, column 4 the normalized
    # VALUE.
    local raw_hours
    raw_hours=$(_ata_attr "$attributes" "Power_On_Hours" 10)
    # Raw hours may look like "12345h+30m+10.123s"; keep the leading digits.
    power_on_hours=${raw_hours%%[!0-9]*}

    reallocated_sectors=$(_ata_attr "$attributes" "Reallocated_Sector_Ct" 10)
    pending_sectors=$(_ata_attr "$attributes" "Current_Pending_Sector" 10)

    # First write counter present; remember its name so the unit is known.
    local written_line
    written_line=$(grep -i \
      "Total_LBAs_Written\|Lifetime_Writes_GiB\|Host_Writes_32MiB\|Flash_Writes_GiB" \
      <<<"$attributes" | head -1)
    total_written=$(awk '{print $10}' <<<"$written_line")
    written_attr=$(awk '{print $2}' <<<"$written_line")
    host_writes_32mib=$(_ata_attr "$attributes" "Host_Writes_32MiB" 10)

    # Wear indicators: use the normalized VALUE. The raw value is
    # vendor-specific (e.g. an erase count for Wear_Leveling_Count) and
    # cannot be read as a percentage.
    local wear
    wear=$(_ata_attr "$attributes" \
      "Media_Wearout_Indicator\|Wear_Leveling_Count\|SSD_Life_Left" 4)
    if [[ "$wear" =~ ^[0-9]+$ ]]; then
      # 10# avoids octal interpretation of zero-padded values like "099".
      media_wearout=$((10#$wear))
    fi
  fi

  echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout|$written_attr"
}

#######################################
# Convert a write counter into terabytes written (decimal TB).
# The unit is selected from the attribute name when it is recognized:
#   *Data Units Written*  512,000 bytes per unit (NVMe: 1000 x 512 bytes)
#   *Writes_GiB*          1 GiB per unit
#   *Writes_32MiB*        32 MiB per unit
#   *LBAs_Written*        512 bytes per unit
# With an unrecognized or empty attribute name: a Kingston model with a
# raw value is read as GiB, otherwise $2 is read as 512-byte sectors,
# otherwise $1 is read as 32 MiB units.
# Arguments: $1 - raw counter value in vendor units (may be empty)
#            $2 - sector/LBA count (may be empty)
#            $3 - disk model
#            $4 - attribute name the counter came from (may be empty)
# Outputs:   TB written with two decimals as printed by bc, or "0"
#######################################
calculate_tbw() {
  local raw_value=$1
  local sectors=$2
  local disk_model=$3
  local attribute_name=$4

  local value="" bytes_per_unit=""
  local attr_lc
  attr_lc=$(printf '%s' "$attribute_name" | tr '[:upper:]' '[:lower:]')

  case "$attr_lc" in
    *"data units written"*) bytes_per_unit=512000 ;;
    *writes_gib*) bytes_per_unit=1073741824 ;;
    *writes_32mib*) bytes_per_unit=33554432 ;;
    *lbas_written*) bytes_per_unit=512 ;;
  esac

  if [[ -n "$bytes_per_unit" ]]; then
    # Unit known from the attribute name; take whichever value was supplied.
    if [[ -n "$raw_value" && "$raw_value" != "0" ]]; then
      value=$raw_value
    else
      value=$sectors
    fi
  elif [[ -n "$raw_value" && "$raw_value" != "0" ]] \
    && grep -qi "KINGSTON" <<<"$disk_model"; then
    value=$raw_value
    bytes_per_unit=1073741824
  elif [[ -n "$sectors" && "$sectors" != "0" ]]; then
    value=$sectors
    bytes_per_unit=512
  else
    value=$raw_value
    bytes_per_unit=33554432
  fi

  # Tolerate thousands separators; reject anything else non-numeric.
  value=${value//[, ]/}
  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
    echo "0"
    return
  fi

  local tbw
  tbw=$(echo "scale=2; $value * $bytes_per_unit / 1000000000000" \
    | bc 2>/dev/null)
  echo "${tbw:-0}"
}

#######################################
# Estimate rated endurance from the model string and capacity.
# The figures are the script's built-in rough tiers, not vendor data.
# Arguments: $1 - disk model
#            $2 - capacity in GB (non-numeric input is treated as 0)
# Outputs:   estimated endurance in TB
#######################################
estimate_ssd_endurance() {
  local disk_model=$1
  local capacity_gb=$2

  capacity_gb=${capacity_gb%%.*}
  [[ "$capacity_gb" =~ ^[0-9]+$ ]] || capacity_gb=0
  capacity_gb=$((10#$capacity_gb))

  if grep -qi "KINGSTON.*SA400" <<<"$disk_model"; then
    # Kingston consumer SSDs
    if ((capacity_gb >= 960)); then
      echo "300" # 300TB for 960GB Kingston SA400
    elif ((capacity_gb >= 480)); then
      echo "150" # 150TB for 480GB Kingston
    else
      echo "80" # 80TB for smaller Kingston
    fi
  elif grep -qi "NVMe" <<<"$disk_model"; then
    # NVMe SSDs typically have higher endurance
    if ((capacity_gb >= 2000)); then
      echo "1200" # 1.2PB for 2TB+ NVMe
    elif ((capacity_gb >= 1000)); then
      echo "600" # 600TB for 1TB NVMe
    elif ((capacity_gb >= 500)); then
      echo "300" # 300TB for 500GB NVMe
    else
      echo "150" # 150TB for smaller NVMe
    fi
  elif grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD" <<<"$disk_model"; then
    # SAS SSDs typically have very high endurance
    if ((capacity_gb >= 1000)); then
      echo "10000" # 10PB for 1TB+ enterprise SAS SSD
    elif ((capacity_gb >= 600)); then
      echo "6000" # 6PB for 600GB enterprise SAS SSD
    elif ((capacity_gb >= 400)); then
      echo "4000" # 4PB for 400GB enterprise SAS SSD
    else
      echo "2000" # 2PB for smaller enterprise SAS SSD
    fi
  elif grep -qi \
    "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA\|WDC\|WESTERN DIGITAL" \
    <<<"$disk_model"; then
    # Enterprise SATA/NVMe SSDs
    if ((capacity_gb >= 1000)); then
      echo "1200" # 1.2PB for 1TB enterprise
    elif ((capacity_gb >= 480)); then
      echo "600" # 600TB for 480GB enterprise
    elif ((capacity_gb >= 240)); then
      echo "300" # 300TB for 240GB enterprise
    else
      echo "150" # 150TB for smaller enterprise
    fi
  else
    # Consumer SSDs
    if ((capacity_gb >= 1000)); then
      echo "600" # 600TB for 1TB consumer
    elif ((capacity_gb >= 480)); then
      echo "300" # 300TB for 480GB consumer
    elif ((capacity_gb >= 240)); then
      echo "150" # 150TB for 240GB consumer
    elif ((capacity_gb >= 120)); then
      echo "80" # 80TB for 120GB consumer
    else
      echo "40" # 40TB for smaller drives
    fi
  fi
}

#######################################
# Estimate remaining SSD life.
# Arguments: $1 - power-on hours (empty or 0 yields an all-Unknown result)
#            $2 - TB written so far
#            $3 - disk model
#            $4 - capacity in GB
#            $5 - percentage of life remaining as reported by the drive
#                 (empty or non-numeric = not available)
# Outputs:   "percent|tbw_remaining|status|source" where source is
#            media_wearout, tbw, estimated, or empty for Unknown. The first
#            three fields may carry color escape sequences.
#######################################
estimate_ssd_lifespan() {
  local power_on_hours=$1
  local tbw_used=${2:-0}
  local disk_model=$3
  local capacity_gb=$4
  local media_wearout=$5

  local clean_hours=${power_on_hours%%[!0-9]*}
  if [[ -z "$clean_hours" ]] || ((10#$clean_hours == 0)); then
    # Exactly four fields so the caller's cut -f2/-f3 line up.
    echo "Unknown|Unknown|Unknown|"
    return
  fi

  # bc prints values below 1 as ".45", hence the optional integer part.
  [[ "$tbw_used" =~ ^[0-9]*\.?[0-9]+$ ]] || tbw_used=0

  local estimated_endurance tbw_remaining
  estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")
  tbw_remaining=$(echo \
    "scale=2; r = $estimated_endurance - $tbw_used; if (r < 0) r = 0; r" \
    | bc 2>/dev/null)
  tbw_remaining=${tbw_remaining:-0}

  local color status

  # Prefer the drive's own wear indicator when it is available.
  if [[ "$media_wearout" =~ ^[0-9]+$ ]]; then
    local remaining=$((10#$media_wearout))
    if ((remaining <= 10)); then
      color=$RED
      status="Critical wear"
    elif ((remaining <= 30)); then
      color=$YELLOW
      status="High wear"
    elif ((remaining <= 70)); then
      color=$YELLOW
      status="Moderate wear"
    else
      color=$GREEN
      status="Healthy"
    fi
    echo "${color}${remaining}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|media_wearout"
    return
  fi

  # Otherwise compare bytes written against the estimated endurance.
  if [[ $(echo "$tbw_used > 0" | bc 2>/dev/null) == "1" ]]; then
    local lifespan_used lifespan_remaining
    lifespan_used=$(echo "scale=1; $tbw_used * 100 / $estimated_endurance" \
      | bc 2>/dev/null)
    lifespan_used=${lifespan_used:-0}
    lifespan_remaining=$(echo \
      "scale=1; r = 100 - $lifespan_used; if (r < 0) r = 0; r" \
      | bc 2>/dev/null)
    lifespan_remaining=${lifespan_remaining:-100}

    if [[ $(echo "$lifespan_used >= 80" | bc 2>/dev/null) == "1" ]]; then
      color=$RED
      status="High wear"
    elif [[ $(echo "$lifespan_used >= 50" | bc 2>/dev/null) == "1" ]]; then
      color=$YELLOW
      status="Moderate wear"
    else
      color=$GREEN
      status="Healthy"
    fi
    echo "${color}${lifespan_remaining}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|tbw"
  else
    echo "Unknown|${estimated_endurance} TB|New|estimated"
  fi
}

#######################################
# Estimate remaining HDD life from power-on hours and sector counts.
# Arguments: $1 - power-on hours (leading digits are used)
#            $2 - reallocated sector count
#            $3 - pending sector count
# Outputs:   a colored estimate with its reason, or "Unknown"
#######################################
estimate_hdd_lifespan() {
  local power_on_hours=$1
  local reallocated_sectors=$2
  local pending_sectors=$3

  # Keep only the leading digits of each value so the integer comparisons
  # below cannot fail on suffixes such as "h+30m".
  local clean_hours=${power_on_hours%%[!0-9]*}
  clean_hours=${clean_hours:-0}
  clean_hours=$((10#$clean_hours))

  if ((clean_hours == 0)); then
    echo "Unknown"
    return
  fi

  reallocated_sectors=${reallocated_sectors%%[!0-9]*}
  reallocated_sectors=$((10#${reallocated_sectors:-0}))
  pending_sectors=${pending_sectors%%[!0-9]*}
  pending_sectors=$((10#${pending_sectors:-0}))

  if ((pending_sectors > 0)); then
    echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)"
  elif ((reallocated_sectors > 100)); then
    echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)"
  elif ((reallocated_sectors > 10)); then
    echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)"
  elif ((clean_hours > 40000)); then
    echo "${YELLOW}1-2 years${NC} (High usage: $clean_hours hours)"
  elif ((clean_hours > 25000)); then
    echo "${GREEN}2-3 years${NC} (Moderate usage: $clean_hours hours)"
  else
    echo "${GREEN}> 3 years${NC} (Low usage: $clean_hours hours)"
  fi
}

#######################################
# Print the full health report for one disk.
# Arguments: $1 - device path
#            $2 - optional smartctl device type (e.g. "megaraid,0")
# Outputs:   human-readable report on stdout
#######################################
check_disk() {
  local disk=$1
  local controller=$2

  print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
  echo "=================================================="

  # Test SMART access level
  local access_level
  access_level=$(test_smart_access "$disk" "$controller")

  case "$access_level" in
    "no_access")
      print_color "$RED" "ERROR: Cannot access disk through controller"
      echo "Possible reasons:"
      echo " - Controller doesn't support SMART passthrough"
      echo " - Disk is part of a hardware RAID array"
      echo " - Insufficient permissions (try running as root)"
      echo " - Controller busy or offline"
      echo ""
      return
      ;;
    "not_available")
      print_color "$YELLOW" "SMART not available on this disk"
      echo "This disk does not support SMART monitoring"
      echo ""
      return
      ;;
    "disabled")
      print_color "$YELLOW" "SMART is disabled on this disk"
      echo "SMART is available but currently disabled"
      echo "To enable manually: smartctl -s on ${controller:+-d $controller} $disk"
      echo ""
      return
      ;;
    "no_attributes")
      print_color "$YELLOW" "WARNING: Cannot read SMART attributes"
      echo "This is common with hardware RAID controllers like PERC H730P"
      echo "Try checking through the RAID management interface"
      echo ""
      return
      ;;
    "limited_attributes")
      print_color "$YELLOW" "NOTE: Limited SMART data available"
      echo "Controller is filtering some SMART attributes"
      ;;
  esac

  # Get disk information
  local disk_info
  disk_info=$(get_disk_info "$disk" "$controller")

  local model serial capacity firmware health_status disk_type
  local power_on_hours reallocated_sectors pending_sectors
  local total_written host_writes_32mib media_wearout written_attr
  IFS='|' read -r model serial capacity firmware health_status disk_type \
    power_on_hours reallocated_sectors pending_sectors total_written \
    host_writes_32mib media_wearout written_attr <<<"$disk_info"

  # Display basic information
  echo "Model: ${model:-Unknown}"
  echo "Serial: ${serial:-Unknown}"
  echo "Type: $disk_type"
  echo "Capacity: ${capacity:-Unknown}"
  echo "Firmware: ${firmware:-Unknown}"
  echo "Health: ${health_status:-Unknown}"

  # Only show power on hours if available
  if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
    echo "Power On Hours: $power_on_hours"
  else
    echo "Power On Hours: Unknown"
  fi

  # Disk type specific analysis
  if [[ "$disk_type" == "SSD" ]]; then
    # The attribute name reported by get_disk_info tells calculate_tbw which
    # unit the counter is in.
    local tbw_used=0
    if [[ -n "$total_written" && "$total_written" != "0" ]]; then
      tbw_used=$(calculate_tbw "" "$total_written" "$model" "$written_attr")
    elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
      tbw_used=$(calculate_tbw "$host_writes_32mib" "" "$model" \
        "Host_Writes_32MiB")
    fi

    # Always show TBW information for SSDs
    echo "TBW Used: ${tbw_used} TB"

    # Capacity in GB for the endurance estimate
    local capacity_gb
    capacity_gb=$(_capacity_to_gb "$capacity")

    local lifespan_info lifespan_percent tbw_remaining wear_status wear_source
    lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" \
      "$model" "$capacity_gb" "$media_wearout")
    lifespan_percent=$(cut -d'|' -f1 <<<"$lifespan_info")
    tbw_remaining=$(cut -d'|' -f2 <<<"$lifespan_info")
    wear_status=$(cut -d'|' -f3 <<<"$lifespan_info")
    wear_source=$(cut -d'|' -f4 <<<"$lifespan_info")

    echo "TBW Remaining: $tbw_remaining"
    echo "Lifespan: $lifespan_percent ($wear_status)"

    # Show wear source if available
    case "$wear_source" in
      media_wearout) echo "Wear Source: Media Wearout Indicator" ;;
      tbw) echo "Wear Source: TBW Calculation" ;;
      estimated) echo "Wear Source: Estimated Endurance" ;;
    esac
  elif [[ "$disk_type" == "HDD" ]]; then
    if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
      echo "Realloc Sectors: $reallocated_sectors"
    fi
    if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
      echo "Pending Sectors: $pending_sectors"
    fi

    local lifespan
    lifespan=$(estimate_hdd_lifespan "$power_on_hours" \
      "${reallocated_sectors:-0}" "${pending_sectors:-0}")
    echo "Lifespan: $lifespan"
  else
    print_color "$YELLOW" "Limited information available for this disk type"
    echo "This is normal for hardware RAID configurations like PERC H730P"
    echo "For detailed SAS drive information, use controller management tools"
  fi

  echo ""
}

#######################################
# List whole-disk block devices (no partitions).
# /dev/sg* nodes are not listed here: they are character devices, so they
# never pass the block-device test. They can still be passed explicitly on
# the command line.
# Outputs:   space-separated device paths on one line
#######################################
detect_disks() {
  local disks=()
  local disk

  # SATA/SAS disks, including two-letter names (sdaa, sdab, ...)
  for disk in /dev/sd[a-z] /dev/sd[a-z][a-z]; do
    if [[ -b "$disk" ]]; then
      disks+=("$disk")
    fi
  done

  # NVMe namespaces; the regex excludes partitions such as nvme0n1p1
  for disk in /dev/nvme[0-9]*n[0-9]*; do
    if [[ -b "$disk" && "$disk" =~ ^/dev/nvme[0-9]+n[0-9]+$ ]]; then
      disks+=("$disk")
    fi
  done

  # Virtio and Xen disks
  for disk in /dev/vd[a-z] /dev/xvd[a-z]; do
    if [[ -b "$disk" ]]; then
      disks+=("$disk")
    fi
  done

  echo "${disks[@]}"
}

#######################################
# Probe for physical disks behind RAID controllers.
# For each controller type and slot 0..31, tries a fixed set of device
# nodes and records the first one smartctl can identify.
# Outputs:   space-separated "device:controller,slot" entries on one line
#######################################
detect_raid_controllers() {
  local controllers=("megaraid" "cciss" "areca" "3ware" "hpt")
  local raid_disks=()
  local controller i base_disk

  for controller in "${controllers[@]}"; do
    for i in {0..31}; do
      # Try different disk devices for each controller
      for base_disk in "/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1"; do
        # sd* nodes are block devices, sg* nodes are character devices.
        if [[ -b "$base_disk" || -c "$base_disk" ]]; then
          if smartctl -d "$controller,$i" -i "$base_disk" &>/dev/null; then
            raid_disks+=("$base_disk:$controller,$i")
            break
          fi
        fi
      done
    done
  done

  echo "${raid_disks[@]}"
}

#######################################
# Entry point: check the disks given as arguments, or auto-detect them.
# Arguments: optional device paths
# Returns:   exits 1 when no usable disk is found
#######################################
main() {
  print_color "$BLUE" "Ubuntu 24.04 Disk Health Check Script v$VERSION"
  print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
  print_color "$BLUE" "============================================"
  echo ""

  check_dependencies

  local disks=()
  local disk

  # If specific disk provided, check only that disk
  if [[ $# -gt 0 ]]; then
    for disk in "$@"; do
      # Accept character devices too so /dev/sgX works as shown in usage.
      if [[ -b "$disk" || -c "$disk" ]]; then
        disks+=("$disk")
      else
        print_color "$RED" "Error: $disk is not a valid block or character device"
      fi
    done
  else
    # Auto-detect disks
    print_color "$CYAN" "Auto-detecting disks (excluding partitions)..."
    local direct_disks=()
    read -ra direct_disks <<<"$(detect_disks)"

    print_color "$CYAN" "Scanning for RAID controllers..."
    local raid_disks=()
    read -ra raid_disks <<<"$(detect_raid_controllers)"

    # Combine both lists
    disks=("${direct_disks[@]}" "${raid_disks[@]}")
  fi

  if [[ ${#disks[@]} -eq 0 ]]; then
    print_color "$RED" "No disks found or accessible"
    echo "Try running as root or specifying disk paths manually"
    exit 1
  fi

  print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
  echo ""

  # Check if running as root, warn if not
  if [[ $EUID -ne 0 ]]; then
    print_color "$YELLOW" "Warning: Not running as root."
    print_color "$YELLOW" "Some disks/controllers may show limited information."
    echo "For complete results, run as: sudo $0"
    echo ""
  fi

  # Check each disk
  local disk_info controller
  for disk_info in "${disks[@]}"; do
    # Entries of the form "device:controller" come from RAID detection.
    if [[ "$disk_info" == *":"* ]]; then
      IFS=':' read -r disk controller <<<"$disk_info"
      check_disk "$disk" "$controller"
    else
      check_disk "$disk_info"
    fi
  done

  print_color "$BLUE" "Check completed!"
  echo ""
  print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
  print_color "$CYAN" " - Install 'storcli' for detailed controller information"
  print_color "$CYAN" " - Use 'smartctl -d sat /dev/sgX' to try direct access"
  print_color "$CYAN" " - Hardware RAID controllers often limit SMART data access"
  echo ""
  print_color "$CYAN" "Ubuntu-specific tips:"
  print_color "$CYAN" " - Use 'lsblk' to see all available block devices"
  print_color "$CYAN" " - Use 'lshw -class disk' for detailed disk information"
}

#######################################
# Print usage information.
#######################################
usage() {
  echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]"
  echo ""
  echo "If no disks specified, auto-detects all available disks"
  echo ""
  echo "Examples:"
  echo " $SCRIPT_NAME # Check all auto-detected disks"
  echo " sudo $SCRIPT_NAME # Check all disks (as root)"
  echo " $SCRIPT_NAME /dev/sda # Check specific disk"
  echo " $SCRIPT_NAME /dev/nvme0n1 # Check NVMe disk"
  echo " $SCRIPT_NAME /dev/sg0 # Check SAS disk directly"
  echo " $SCRIPT_NAME /dev/sda /dev/nvme0n1 # Check multiple disks"
}

# Parse command line arguments
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
  -v | --version)
    echo "$SCRIPT_NAME version $VERSION"
    exit 0
    ;;
  *)
    main "$@"
    ;;
esac