#!/bin/bash
# Disk Health Check Script for Ubuntu 24.04
# Enhanced with SAS/PERC H730P controller support
# Checks SSD TBW/lifespan and HDD health status
#
# Requires smartmontools (smartctl) and bc. Run with --help for usage.

SCRIPT_NAME=$(basename "$0")
VERSION="2.5"

# Color codes. tput fails when TERM is unset or unknown (cron, minimal
# containers); fall back to empty strings so the report is still readable.
RED=$(tput setaf 1 2>/dev/null || true)
GREEN=$(tput setaf 2 2>/dev/null || true)
YELLOW=$(tput setaf 3 2>/dev/null || true)
BLUE=$(tput setaf 4 2>/dev/null || true)
CYAN=$(tput setaf 6 2>/dev/null || true)
NC=$(tput sgr0 2>/dev/null || true)

# Print a message wrapped in a color code.
# Arguments: $1 - color escape sequence (may be empty), $2 - message
print_color() {
    local color=$1
    local message=$2
    echo -e "${color}${message}${NC}"
}

# Return 0 when the command named by $1 is available on PATH.
command_exists() {
    command -v "$1" >/dev/null 2>&1
}

# Abort with installation instructions when smartctl or bc is missing.
check_dependencies() {
    local missing=()

    if ! command_exists smartctl; then
        missing+=("smartmontools")
    fi

    if ! command_exists bc; then
        missing+=("bc")
    fi

    if [[ ${#missing[@]} -gt 0 ]]; then
        print_color "$RED" "Error: Missing required packages: ${missing[*]}"
        echo "Install with: sudo apt update && sudo apt install ${missing[*]}"
        exit 1
    fi
}

# Print the leading run of digits of $1 as a base-10 integer, or nothing when
# $1 does not start with a digit. Turns raw values such as "1234h+56m" or
# "099" into numbers that are safe inside [[ -gt ]] / (( )).
leading_int() {
    local digits=${1%%[!0-9]*}
    if [[ -n "$digits" ]]; then
        digits=$((10#$digits))
    fi
    printf '%s\n' "$digits"
}

# Evaluate a bc expression.
# Arguments: $1 - expression, $2 - fallback printed when bc yields nothing
#            (default 0)
# Outputs:   the result, with a leading zero added to bc's ".5" style output
bc_calc() {
    local expression=$1
    local fallback=${2:-0}
    local result

    result=$(bc 2>/dev/null <<< "$expression")
    [[ -n "$result" ]] || result=$fallback
    case "$result" in
        .*) result="0$result" ;;
        -.*) result="-0${result#-}" ;;
    esac
    printf '%s\n' "$result"
}

# Print the value of the first "Label: value" line in $1 whose label matches
# the case-insensitive grep pattern $2, with surrounding whitespace removed.
smart_field() {
    local text=$1
    local labels=$2
    grep -i "$labels" <<< "$text" \
        | cut -d: -f2 \
        | sed 's/^[[:space:]]*//; s/[[:space:]]*$//' \
        | head -1
}

# Print one column of the first row of the ATA attribute table $1 whose name
# matches the grep pattern $2.
# Arguments: $3 - awk column number (4 = normalized VALUE, 10 = RAW_VALUE)
attr_column() {
    local table=$1
    local names=$2
    local column=$3
    grep -i "$names" <<< "$table" | awk -v col="$column" '{print $col}' | head -1
}

# Convert a capacity string to whole decimal gigabytes.
# Accepts the byte count smartctl prints ("480,103,981,056 bytes") as well as
# "<n> GB" / "<n> TB" strings. Prints 0 when nothing can be parsed.
capacity_to_gb() {
    local capacity=$1
    local numeric_part=${capacity%%bytes*}
    local unit_re='([0-9]+(\.[0-9]+)?)[[:space:]]*([GgTt])[Bb]'
    local digits

    # A bare byte count: only digits and thousands separators before "bytes".
    if [[ "$numeric_part" != *[A-Za-z]* ]]; then
        digits=${numeric_part//[!0-9]/}
        if [[ -n "$digits" ]]; then
            echo $((10#$digits / 1000000000))
            return
        fi
    fi

    if [[ "$capacity" =~ $unit_re ]]; then
        case "${BASH_REMATCH[3]}" in
            [Gg]) echo "${BASH_REMATCH[1]%%.*}" ;;
            [Tt]) awk -v tb="${BASH_REMATCH[1]}" 'BEGIN { printf "%d\n", tb * 1000 }' ;;
        esac
        return
    fi

    echo "0"
}

# Classify how much SMART data can be read from a disk.
# Arguments: $1 - device path, $2 - optional smartctl -d controller spec
# Outputs:   one of no_access, not_available, disabled, no_attributes,
#            limited_attributes, full_access
test_smart_access() {
    local disk=$1
    local controller=$2
    local -a smart_cmd=(smartctl)
    local smart_info support_lines attributes
    local is_nvme=0

    [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

    # A non-zero exit from "smartctl -i" means the device could not be queried.
    if ! smart_info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null); then
        echo "no_access"
        return
    fi

    # NVMe identify output carries no "SMART support is:" lines, so those
    # checks would wrongly report the drive as lacking SMART.
    if [[ "$disk" == /dev/nvme* ]] || grep -q "NVMe Version:" <<< "$smart_info"; then
        is_nvme=1
    fi

    if (( ! is_nvme )); then
        support_lines=$(grep "SMART support is:" <<< "$smart_info")

        # "Unavailable" does not match: the comparison is case-sensitive.
        if [[ "$support_lines" != *Available* ]]; then
            echo "not_available"
            return
        fi

        if [[ "$support_lines" != *Enabled* ]]; then
            echo "disabled"
            return
        fi
    fi

    attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)

    if grep -q "ATTRIBUTE_NAME" <<< "$attributes"; then
        echo "full_access"
    elif grep -Eq "SMART/Health Information|Current Drive Temperature|Accumulated power on time|Elements in grown defect list" <<< "$attributes"; then
        # SCSI/SAS and NVMe devices report health data without the ATA
        # attribute table. NOTE(review): marker strings are taken from
        # smartctl's SCSI/NVMe output format - confirm on target hardware.
        echo "limited_attributes"
    else
        echo "no_attributes"
    fi
}

# Collect identity, health and wear data for one disk.
# Arguments: $1 - device path, $2 - optional smartctl -d controller spec
# Outputs:   one '|'-separated record:
#            model|serial|capacity|firmware|health|type|power_on_hours|
#            reallocated|pending|total_lbas_written|host_writes_32mib|
#            wear_percent_remaining|lifetime_writes_gib
#            Missing values are empty; numeric fields are plain integers.
get_disk_info() {
    local disk=$1
    local controller=$2
    local -a smart_cmd=(smartctl)
    local info attributes health
    local model vendor serial capacity firmware health_status
    local disk_type="UNKNOWN"
    local power_on_hours reallocated_sectors pending_sectors
    local total_written host_writes_32mib lifetime_writes_gib media_wearout

    [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

    info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)
    attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
    health=$("${smart_cmd[@]}" -H "$disk" 2>/dev/null)

    # Identity fields: ATA, SCSI/SAS and NVMe label them differently.
    model=$(smart_field "$info" "Device Model:\|Model Number:\|Product:")
    vendor=$(smart_field "$info" "Vendor:")
    [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model"
    serial=$(smart_field "$info" "Serial Number:")
    capacity=$(smart_field "$info" "User Capacity:\|Total NVM Capacity:")
    capacity=${capacity%%\[*}                              # drop "[480 GB]"
    capacity=${capacity%"${capacity##*[![:space:]]}"}      # trim trailing blanks
    firmware=$(smart_field "$info" "Firmware Version:\|Firmware revision:\|Revision:")

    # ATA/NVMe print "...test result: PASSED"; SCSI prints
    # "SMART Health Status: OK".
    health_status=$(smart_field "$health" "result:\|SMART overall-health\|SMART Health Status:")
    if [[ -z "$health_status" ]]; then
        health_status=$(grep -i "SMART overall-health" <<< "$health" \
            | awk -F'[' '{print $2}' | cut -d']' -f1)
    fi

    if [[ "$disk" == /dev/nvme* ]] \
        || grep -qi "NVMe Version:\|Solid State Device" <<< "$info"; then
        disk_type="SSD"
    elif grep -qi "Rotation Rate" <<< "$info"; then
        disk_type="HDD"
    elif grep -qi "SCSI\|SAS" <<< "$info"; then
        # SAS drives often don't specify; look for a typical spindle speed.
        if grep -qi "15000\|10000\|7200" <<< "$info"; then
            disk_type="HDD"
        else
            disk_type="SSD"
        fi
    fi

    # ATA attribute table columns: 4 = normalized VALUE, 10 = RAW_VALUE.
    power_on_hours=$(leading_int "$(attr_column "$attributes" "Power_On_Hours" 10)")
    reallocated_sectors=$(leading_int "$(attr_column "$attributes" "Reallocated_Sector_Ct" 10)")
    pending_sectors=$(leading_int "$(attr_column "$attributes" "Current_Pending_Sector" 10)")

    # Write counters are kept apart because each uses a different unit.
    total_written=$(leading_int "$(attr_column "$attributes" "Total_LBAs_Written" 10)")
    host_writes_32mib=$(leading_int "$(attr_column "$attributes" "Host_Writes_32MiB" 10)")
    # Prefer host writes; Flash_Writes_GiB is only used when that is absent.
    lifetime_writes_gib=$(leading_int "$(attr_column "$attributes" "Lifetime_Writes_GiB" 10)")
    if [[ -z "$lifetime_writes_gib" ]]; then
        lifetime_writes_gib=$(leading_int "$(attr_column "$attributes" "Flash_Writes_GiB" 10)")
    fi

    # The normalized VALUE column is read as the percentage of life left;
    # RAW_VALUE is vendor-specific for these attributes.
    # NOTE(review): confirm against the drive models in your fleet.
    media_wearout=$(leading_int "$(attr_column "$attributes" \
        "Media_Wearout_Indicator\|Wear_Leveling_Count\|SSD_Life_Left" 4)")

    echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout|$lifetime_writes_gib"
}

# Convert a write counter to terabytes written (decimal TB, two decimals).
# Arguments: $1 - raw counter in GiB or 32 MiB units (may be empty)
#            $2 - number of 512-byte sectors written (may be empty)
#            $3 - disk model
#            $4 - optional unit of $1: "gib" or "32mib"; when omitted, GiB is
#                 assumed for Kingston models and 32 MiB otherwise
# Outputs:   TB written, or 0 when no usable counter was supplied
calculate_tbw() {
    local raw_value=$1
    local sectors=$2
    local disk_model=$3
    local raw_unit=${4:-}

    [[ "$raw_value" =~ ^[0-9]+$ ]] || raw_value=""
    [[ "$sectors" =~ ^[0-9]+$ ]] || sectors=""

    if [[ -z "$raw_unit" ]]; then
        if grep -qi "KINGSTON" <<< "$disk_model"; then
            raw_unit="gib"
        else
            raw_unit="32mib"
        fi
    fi

    if [[ "$raw_unit" == "gib" && -n "$raw_value" && "$raw_value" != "0" ]]; then
        # 1 GiB = 1073741824 bytes
        bc_calc "scale=2; $raw_value * 1073741824 / 1000000000000"
    elif [[ -n "$sectors" && "$sectors" != "0" ]]; then
        bc_calc "scale=2; $sectors * 512 / 1000000000000"
    elif [[ -n "$raw_value" && "$raw_value" != "0" ]]; then
        # 32 MiB = 33554432 bytes
        bc_calc "scale=2; $raw_value * 33554432 / 1000000000000"
    else
        echo "0"
    fi
}

# Estimate rated endurance from model family and capacity.
# Arguments: $1 - disk model, $2 - capacity in GB
# Outputs:   estimated endurance in TB written (rough heuristic table)
estimate_ssd_endurance() {
    local disk_model=$1
    local capacity_gb=$2

    # An empty or non-numeric capacity selects the smallest tier.
    [[ "$capacity_gb" =~ ^[0-9]+$ ]] || capacity_gb=0

    # Kingston consumer SSDs
    if grep -qi "KINGSTON.*SA400" <<< "$disk_model"; then
        if [[ $capacity_gb -ge 960 ]]; then
            echo "300"   # 300TB for 960GB Kingston SA400
        elif [[ $capacity_gb -ge 480 ]]; then
            echo "150"   # 150TB for 480GB Kingston
        else
            echo "80"    # 80TB for smaller Kingston
        fi
    # SAS SSDs typically have very high endurance
    elif grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD" <<< "$disk_model"; then
        if [[ $capacity_gb -ge 1000 ]]; then
            echo "10000" # 10PB for 1TB+ enterprise SAS SSD
        elif [[ $capacity_gb -ge 600 ]]; then
            echo "6000"  # 6PB for 600GB enterprise SAS SSD
        elif [[ $capacity_gb -ge 400 ]]; then
            echo "4000"  # 4PB for 400GB enterprise SAS SSD
        else
            echo "2000"  # 2PB for smaller enterprise SAS SSD
        fi
    # Enterprise SATA/NVMe SSDs
    elif grep -qi "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA\|WDC\|WESTERN DIGITAL" <<< "$disk_model"; then
        if [[ $capacity_gb -ge 1000 ]]; then
            echo "1200"  # 1.2PB for 1TB enterprise
        elif [[ $capacity_gb -ge 480 ]]; then
            echo "600"   # 600TB for 480GB enterprise
        elif [[ $capacity_gb -ge 240 ]]; then
            echo "300"   # 300TB for 240GB enterprise
        else
            echo "150"   # 150TB for smaller enterprise
        fi
    # Consumer SSDs
    else
        if [[ $capacity_gb -ge 1000 ]]; then
            echo "600"   # 600TB for 1TB consumer
        elif [[ $capacity_gb -ge 480 ]]; then
            echo "300"   # 300TB for 480GB consumer
        elif [[ $capacity_gb -ge 240 ]]; then
            echo "150"   # 150TB for 240GB consumer
        elif [[ $capacity_gb -ge 120 ]]; then
            echo "80"    # 80TB for 120GB consumer
        else
            echo "40"    # 40TB for smaller drives
        fi
    fi
}

# Estimate remaining SSD life.
# Arguments: $1 - power-on hours, $2 - TB written so far, $3 - disk model,
#            $4 - capacity in GB, $5 - percentage of life left (0-100) as
#            reported by the drive, if any
# Outputs:   "lifespan|tbw_remaining|status|source" where source is
#            media_wearout, tbw, estimated, or empty when nothing is known.
#            The first three fields may contain color escape sequences.
estimate_ssd_lifespan() {
    local power_on_hours=$1
    local tbw_used=$2
    local disk_model=$3
    local capacity_gb=$4
    local media_wearout=$5
    local decimal_re='^[0-9]*\.?[0-9]+$'
    local estimated_endurance tbw_remaining color status
    local lifespan_used lifespan_remaining
    local has_tbw=0 has_wearout=0

    power_on_hours=$(leading_int "$power_on_hours")
    media_wearout=$(leading_int "$media_wearout")
    [[ "$tbw_used" =~ $decimal_re ]] || tbw_used=0

    if [[ $(bc_calc "$tbw_used > 0") -eq 1 ]]; then
        has_tbw=1
    fi
    # Values above 100 cannot be a percentage and are ignored.
    if [[ -n "$media_wearout" ]] && (( media_wearout <= 100 )); then
        has_wearout=1
    fi

    # Nothing to base an estimate on.
    if [[ -z "$power_on_hours" || "$power_on_hours" -eq 0 ]] \
        && (( ! has_tbw && ! has_wearout )); then
        echo "Unknown|Unknown|Unknown|"
        return
    fi

    estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")
    tbw_remaining=$(bc_calc "scale=2; $estimated_endurance - $tbw_used")
    # A drive past its estimated endurance has nothing left, not a negative
    # amount.
    if [[ $(bc_calc "$tbw_remaining < 0") -eq 1 ]]; then
        tbw_remaining=0
    fi

    # The drive's own wear indicator is more accurate than the TBW estimate.
    if (( has_wearout )); then
        if (( media_wearout <= 10 )); then
            color=$RED
            status="Critical wear"
        elif (( media_wearout <= 30 )); then
            color=$YELLOW
            status="High wear"
        elif (( media_wearout <= 70 )); then
            color=$YELLOW
            status="Moderate wear"
        else
            color=$GREEN
            status="Healthy"
        fi
        echo "${color}${media_wearout}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|media_wearout"
        return
    fi

    if (( has_tbw )); then
        lifespan_used=$(bc_calc "scale=1; $tbw_used * 100 / $estimated_endurance")
        lifespan_remaining=$(bc_calc "scale=1; 100 - $lifespan_used" 100)
        if [[ $(bc_calc "$lifespan_remaining < 0") -eq 1 ]]; then
            lifespan_remaining=0
        fi

        if [[ $(bc_calc "$lifespan_used >= 80") -eq 1 ]]; then
            color=$RED
            status="High wear"
        elif [[ $(bc_calc "$lifespan_used >= 50") -eq 1 ]]; then
            color=$YELLOW
            status="Moderate wear"
        else
            color=$GREEN
            status="Healthy"
        fi
        echo "${color}${lifespan_remaining}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|tbw"
    else
        echo "Unknown|${estimated_endurance} TB|New|estimated"
    fi
}

# Estimate remaining HDD life from sector health and usage.
# Arguments: $1 - power-on hours, $2 - reallocated sectors,
#            $3 - pending sectors (each may be empty)
# Outputs:   a human-readable estimate, possibly with color escape sequences
estimate_hdd_lifespan() {
    local power_on_hours
    local reallocated_sectors
    local pending_sectors

    power_on_hours=$(leading_int "$1")
    reallocated_sectors=$(leading_int "$2")
    pending_sectors=$(leading_int "$3")
    reallocated_sectors=${reallocated_sectors:-0}
    pending_sectors=${pending_sectors:-0}

    # Sector problems are reported even when the hour counter is unreadable.
    if [[ "$pending_sectors" -gt 0 ]]; then
        echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)"
    elif [[ "$reallocated_sectors" -gt 100 ]]; then
        echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)"
    elif [[ "$reallocated_sectors" -gt 10 ]]; then
        echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)"
    elif [[ -z "$power_on_hours" ]]; then
        echo "Unknown"
    elif [[ "$power_on_hours" -gt 40000 ]]; then
        echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)"
    elif [[ "$power_on_hours" -gt 25000 ]]; then
        echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)"
    else
        echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)"
    fi
}

# Print a health report for a single disk.
# Arguments: $1 - device path, $2 - optional smartctl -d controller spec
check_disk() {
    local disk=$1
    local controller=$2
    local access_level disk_info
    local model serial capacity firmware health_status disk_type
    local power_on_hours reallocated_sectors pending_sectors
    local total_written host_writes_32mib media_wearout lifetime_writes_gib

    print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
    echo "=================================================="

    # Test SMART access level
    access_level=$(test_smart_access "$disk" "$controller")

    case "$access_level" in
        "no_access")
            print_color "$RED" "ERROR: Cannot access disk through controller"
            echo "Possible reasons:"
            echo "  - Controller doesn't support SMART passthrough"
            echo "  - Disk is part of a hardware RAID array"
            echo "  - Insufficient permissions (try running as root)"
            echo "  - Controller busy or offline"
            echo ""
            return
            ;;
        "not_available")
            print_color "$YELLOW" "SMART not available on this disk"
            echo "This disk does not support SMART monitoring"
            echo ""
            return
            ;;
        "disabled")
            print_color "$YELLOW" "SMART is disabled on this disk"
            echo "SMART is available but currently disabled"
            echo "To enable manually: smartctl -s on ${controller:+-d $controller} $disk"
            echo ""
            return
            ;;
        "no_attributes")
            print_color "$YELLOW" "WARNING: Cannot read SMART attributes"
            echo "This is common with hardware RAID controllers like PERC H730P"
            echo "Try checking through the RAID management interface"
            echo ""
            return
            ;;
        "limited_attributes")
            print_color "$YELLOW" "NOTE: Limited SMART data available"
            echo "Controller is filtering some SMART attributes"
            ;;
    esac

    # Get disk information
    disk_info=$(get_disk_info "$disk" "$controller")
    IFS='|' read -r model serial capacity firmware health_status disk_type \
        power_on_hours reallocated_sectors pending_sectors total_written \
        host_writes_32mib media_wearout lifetime_writes_gib <<< "$disk_info"

    # Display basic information
    echo "Model:           ${model:-Unknown}"
    echo "Serial:          ${serial:-Unknown}"
    echo "Type:            $disk_type"
    echo "Capacity:        ${capacity:-Unknown}"
    echo "Firmware:        ${firmware:-Unknown}"
    echo "Health:          ${health_status:-Unknown}"

    # Only show power on hours if available
    if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
        echo "Power On Hours:  $power_on_hours"
    else
        echo "Power On Hours:  Unknown"
    fi

    # Disk type specific analysis
    if [[ "$disk_type" == "SSD" ]]; then
        local tbw_used=0
        local capacity_gb
        local lifespan_info lifespan_percent tbw_remaining wear_status wear_source

        # Each counter is converted according to its own unit.
        if [[ -n "$total_written" && "$total_written" != "0" ]]; then
            tbw_used=$(calculate_tbw "" "$total_written" "$model")
        elif [[ -n "$lifetime_writes_gib" && "$lifetime_writes_gib" != "0" ]]; then
            tbw_used=$(calculate_tbw "$lifetime_writes_gib" "" "$model" "gib")
        elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
            tbw_used=$(calculate_tbw "$host_writes_32mib" "" "$model" "32mib")
        fi

        if [[ $(bc_calc "$tbw_used > 0") -eq 1 ]]; then
            echo "TBW Used:        ${tbw_used} TB"
        fi

        # Capacity in GB selects the endurance tier.
        capacity_gb=$(capacity_to_gb "$capacity")

        lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" \
            "$model" "$capacity_gb" "$media_wearout")
        IFS='|' read -r lifespan_percent tbw_remaining wear_status wear_source \
            <<< "$lifespan_info"

        if [[ "$wear_source" != "media_wearout" ]] \
            && [[ $(bc_calc "$tbw_used > 0") -eq 1 ]]; then
            echo "TBW Remaining:   $tbw_remaining"
        fi

        echo "Lifespan:        $lifespan_percent ($wear_status)"

        # Show wear source if available
        if [[ "$wear_source" == "media_wearout" ]]; then
            echo "Wear Source:     Media Wearout Indicator"
        elif [[ "$wear_source" == "tbw" ]]; then
            echo "Wear Source:     TBW Calculation"
        elif [[ "$wear_source" == "estimated" ]]; then
            echo "Wear Source:     Estimated Endurance"
        fi
    elif [[ "$disk_type" == "HDD" ]]; then
        local lifespan

        if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
            echo "Realloc Sectors: $reallocated_sectors"
        fi

        if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
            echo "Pending Sectors: $pending_sectors"
        fi

        lifespan=$(estimate_hdd_lifespan "$power_on_hours" \
            "${reallocated_sectors:-0}" "${pending_sectors:-0}")
        echo "Lifespan:        $lifespan"
    else
        print_color "$YELLOW" "Limited information available for this disk type"
        echo "This is normal for hardware RAID configurations like PERC H730P"
        echo "For detailed SAS drive information, use controller management tools"
    fi

    echo ""
}

# List whole-disk block devices (no partitions).
# Outputs: space-separated device paths on one line (empty line when none)
#
# /dev/sgN nodes are not auto-detected: they are character devices that alias
# the /dev/sdX disks already listed. They can still be passed explicitly.
detect_disks() {
    local -a disks=()
    local disk
    local whole_disk_re='^/dev/(sd[a-z]+|nvme[0-9]+n[0-9]+|vd[a-z]+|xvd[a-z]+)$'

    # An unmatched glob stays literal and is rejected by the -b test.
    for disk in /dev/sd* /dev/nvme* /dev/vd* /dev/xvd*; do
        if [[ -b "$disk" && "$disk" =~ $whole_disk_re ]]; then
            disks+=("$disk")
        fi
    done

    echo "${disks[@]}"
}

# Probe for physical disks hidden behind RAID controllers.
# Outputs: space-separated "device:controller,id" entries on one line
detect_raid_controllers() {
    local controllers=("megaraid" "cciss" "areca" "3ware" "hpt")
    local -a raid_disks=()
    local -a handles=()
    local controller i base_disk

    # Device nodes usable as a handle to the controller. sg nodes are
    # character devices, so both node types are accepted.
    for base_disk in "/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1"; do
        if [[ -b "$base_disk" || -c "$base_disk" ]]; then
            handles+=("$base_disk")
        fi
    done

    for controller in "${controllers[@]}"; do
        for i in {0..31}; do
            for base_disk in "${handles[@]}"; do
                if smartctl -d "$controller,$i" -i "$base_disk" &>/dev/null; then
                    raid_disks+=("$base_disk:$controller,$i")
                    break   # first handle that answers is enough for this id
                fi
            done
        done
    done

    echo "${raid_disks[@]}"
}

# Entry point: check the disks given as arguments, or auto-detect them.
# Arguments: optional list of device paths
main() {
    local disks=()
    local disk disk_info controller

    print_color "$BLUE" "Ubuntu 24.04 Disk Health Check Script v$VERSION"
    print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
    print_color "$BLUE" "============================================"
    echo ""

    check_dependencies

    # If specific disk provided, check only that disk
    if [[ $# -gt 0 ]]; then
        for disk in "$@"; do
            # /dev/sgN (SCSI generic) nodes are character devices.
            if [[ -b "$disk" || -c "$disk" ]]; then
                disks+=("$disk")
            else
                print_color "$RED" "Error: $disk is not a valid block or character device"
            fi
        done
    else
        # Status messages are printed here, not inside the detect_* functions,
        # because their stdout is captured as the device list.
        print_color "$CYAN" "Auto-detecting disks (excluding partitions)..."
        local direct_disks=()
        read -ra direct_disks <<< "$(detect_disks)"

        print_color "$CYAN" "Scanning for RAID controllers..."
        local raid_disks=()
        read -ra raid_disks <<< "$(detect_raid_controllers)"

        # Combine both lists
        disks=("${direct_disks[@]}" "${raid_disks[@]}")
    fi

    if [[ ${#disks[@]} -eq 0 ]]; then
        print_color "$RED" "No disks found or accessible"
        echo "Try running as root or specifying disk paths manually"
        exit 1
    fi

    print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
    echo ""

    # Check if running as root, warn if not
    if [[ $EUID -ne 0 ]]; then
        print_color "$YELLOW" "Warning: Not running as root."
        print_color "$YELLOW" "Some disks/controllers may show limited information."
        echo "For complete results, run as: sudo $0"
        echo ""
    fi

    # Check each disk
    for disk_info in "${disks[@]}"; do
        # RAID entries carry their controller as "device:controller,id"
        if [[ "$disk_info" == *":"* ]]; then
            IFS=':' read -r disk controller <<< "$disk_info"
            check_disk "$disk" "$controller"
        else
            check_disk "$disk_info"
        fi
    done

    print_color "$BLUE" "Check completed!"
    echo ""
    print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
    print_color "$CYAN" "  - Install 'storcli' for detailed controller information"
    print_color "$CYAN" "  - Use 'smartctl -d sat /dev/sgX' to try direct access"
    print_color "$CYAN" "  - Hardware RAID controllers often limit SMART data access"
    echo ""
    print_color "$CYAN" "Ubuntu-specific tips:"
    print_color "$CYAN" "  - Use 'lsblk' to see all available block devices"
    print_color "$CYAN" "  - Use 'lshw -class disk' for detailed disk information"
}

# Print usage information.
usage() {
    echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]"
    echo ""
    echo "If no disks specified, auto-detects all available disks"
    echo ""
    echo "Examples:"
    echo "  $SCRIPT_NAME                    # Check all auto-detected disks"
    echo "  sudo $SCRIPT_NAME               # Check all disks (as root)"
    echo "  $SCRIPT_NAME /dev/sda           # Check specific disk"
    echo "  $SCRIPT_NAME /dev/nvme0n1       # Check NVMe disk"
    echo "  $SCRIPT_NAME /dev/sg0           # Check SAS disk directly"
    echo "  $SCRIPT_NAME /dev/sda /dev/nvme0n1  # Check multiple disks"
}

# Parse command line arguments
case "${1:-}" in
    -h|--help)
        usage
        exit 0
        ;;
    -v|--version)
        echo "$SCRIPT_NAME version $VERSION"
        exit 0
        ;;
    *)
        main "$@"
        ;;
esac