#!/bin/bash
#
# Disk Health Check Script for Harvester OS
# Enhanced with SAS/PERC H730P controller support
# Checks SSD TBW/lifespan and HDD health status
#
# Requires: smartctl (smartmontools), awk, grep, sed.
# Run with --help for usage.

SCRIPT_NAME=$(basename "$0")
VERSION="2.4"

# Color codes. Only query tput when stdout is a terminal and tput works;
# otherwise fall back to empty strings so piped/cron output stays clean and
# tput does not print errors when TERM is unset.
if [[ -t 1 ]] && command -v tput >/dev/null 2>&1 && tput setaf 1 >/dev/null 2>&1; then
  RED=$(tput setaf 1)
  GREEN=$(tput setaf 2)
  YELLOW=$(tput setaf 3)
  BLUE=$(tput setaf 4)
  CYAN=$(tput setaf 6)
  NC=$(tput sgr0)
else
  RED=""
  GREEN=""
  YELLOW=""
  BLUE=""
  CYAN=""
  NC=""
fi

# Function to print colored output
#   $1 - color escape sequence (may be empty; callers must quote it)
#   $2 - message
print_color() {
  local color=$1
  local message=$2
  printf '%s%s%s\n' "$color" "$message" "$NC"
}

# Check if command exists
#   $1 - command name
command_exists() {
  command -v "$1" >/dev/null 2>&1
}

# Run smartctl against a device, adding "-d TYPE" only for a real controller.
#   $1 - controller type ("" or "direct" means no -d option)
#   $@ - remaining smartctl arguments
# "direct" is this script's own marker for a plainly attached disk; it is not
# passed to smartctl.
run_smartctl() {
  local controller=$1
  shift
  local -a cmd=(smartctl)
  if [[ -n "$controller" && "$controller" != "direct" ]]; then
    cmd+=(-d "$controller")
  fi
  "${cmd[@]}" "$@"
}

# Succeeds when $1 is a number greater than zero (empty/non-numeric -> fails).
is_positive() {
  awk -v v="$1" 'BEGIN { exit !(v + 0 > 0) }'
}

# Print the value after the first "Label: value" match in smartctl -i output.
#   $1 - text to search
#   $2 - extended regex of labels, e.g. 'Device Model:|Product:'
info_field() {
  grep -Ei -- "$2" <<<"$1" \
    | head -n 1 \
    | cut -d: -f2- \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

# Print one column of the first ATA attribute-table row whose attribute name
# matches a pattern. Only the leading digits of the column are kept, so raw
# values such as "12345 (0 3 0)" or "12345h+10m" yield 12345.
#   $1 - smartctl -A output
#   $2 - lowercase regex matched against the attribute name (2nd column)
#   $3 - column number (default 10 = RAW_VALUE, 4 = normalized VALUE)
smart_attr_column() {
  awk -v pat="$2" -v col="${3:-10}" '
    $1 ~ /^[0-9]+$/ && tolower($2) ~ pat {
      v = $col
      sub(/[^0-9].*$/, "", v)
      if (v != "") printf "%.0f\n", v
      exit
    }' <<<"$1"
}

# Print the number that follows a literal label on the first matching line,
# with thousands separators removed (for "Label: 1,234" style lines).
#   $1 - text to search
#   $2 - literal label, including any trailing colon
labeled_number() {
  local text=$1
  local label=$2
  sed -n "s/^[[:space:]]*${label}[[:space:]]*\([0-9][0-9,.]*\).*/\1/p" <<<"$text" \
    | head -n 1 \
    | tr -cd '0-9'
}

# Extract power-on hours from smartctl -A output.
# Tries the ATA attribute table first, then the "Power On Hours:" line
# (NVMe-style output), then "Accumulated power on time, hours:minutes"
# (SCSI/SAS-style output). Prints nothing when none is present.
extract_power_on_hours() {
  local attributes=$1
  local hours
  hours=$(smart_attr_column "$attributes" '^power[_-]on_hours' 10)
  if [[ -z "$hours" ]]; then
    hours=$(labeled_number "$attributes" 'Power On Hours:')
  fi
  if [[ -z "$hours" ]]; then
    hours=$(labeled_number "$attributes" 'Accumulated power on time, hours:minutes')
  fi
  printf '%s\n' "$hours"
}

# Convert a capacity string to whole gigabytes (decimal, 10^9 bytes).
# Accepts either a value with a GB/TB unit ("600 GB", "1.92 TB") or a byte
# count with separators ("600,127,266,816 bytes"). Prints 0 when unparsable.
capacity_to_gb() {
  local capacity=$1
  local unit_re='([0-9]+([.,][0-9]+)?)[[:space:]]*([TtGg])[Bb]'
  local digits

  if [[ "$capacity" =~ $unit_re ]]; then
    awk -v n="${BASH_REMATCH[1]/,/.}" -v unit="${BASH_REMATCH[3]}" '
      BEGIN {
        if (tolower(unit) == "t") n *= 1000
        printf "%d\n", int(n)
      }'
    return
  fi

  digits=${capacity%%bytes*}
  digits=${digits//[!0-9]/}
  if [[ -n "$digits" ]]; then
    awk -v b="$digits" 'BEGIN { printf "%d\n", int(b / 1000000000) }'
  else
    echo "0"
  fi
}

# Function to test SMART access and get available data
#   $1 - device path, $2 - controller type ("" / "direct" / e.g. "megaraid,0")
# Prints one of: no_access, not_available, disabled, no_attributes,
# limited_attributes, full_access. SMART is never enabled here, only checked.
test_smart_access() {
  local disk=$1
  local controller=$2
  local smart_info attributes power_on_hours

  # Test basic SMART access
  if ! smart_info=$(run_smartctl "$controller" -i "$disk" 2>/dev/null); then
    echo "no_access"
    return
  fi

  # smartctl prints two "SMART support is:" lines (availability, then
  # enabled state), so each state is matched on its own line. Devices that
  # print no such line at all are not rejected here; the attribute read below
  # decides.
  if grep -Eq 'SMART support is:[[:space:]]+Unavailable' <<<"$smart_info"; then
    echo "not_available"
    return
  fi
  if grep -Eq 'SMART support is:[[:space:]]+Disabled' <<<"$smart_info"; then
    echo "disabled"
    return
  fi

  # Test attribute reading
  attributes=$(run_smartctl "$controller" -A "$disk" 2>/dev/null)
  if [[ -z "$attributes" ]]; then
    echo "no_attributes"
    return
  fi

  # Check if we have basic attributes
  power_on_hours=$(extract_power_on_hours "$attributes")
  if [[ -z "$power_on_hours" ]]; then
    echo "limited_attributes"
    return
  fi

  echo "full_access"
}

# Function to get disk information with enhanced SAS support
#   $1 - device path, $2 - controller type
# Prints a single '|'-separated record:
#   model|serial|capacity|firmware|health|type|power_on_hours|
#   reallocated|pending|total_written|host_writes_32mib|media_wearout
# total_written is a count of 512-byte sectors; media_wearout is the
# percentage of rated life remaining (empty when not reported).
get_disk_info() {
  local disk=$1
  local controller=$2
  local info attributes health

  info=$(run_smartctl "$controller" -i "$disk" 2>/dev/null)
  attributes=$(run_smartctl "$controller" -A "$disk" 2>/dev/null)
  health=$(run_smartctl "$controller" -H "$disk" 2>/dev/null)

  # Extract information with multiple fallbacks for SAS drives
  local model vendor serial capacity firmware health_status
  model=$(info_field "$info" 'Device Model:|Model Number:|Product:')
  vendor=$(info_field "$info" 'Vendor:')
  if [[ -n "$vendor" && -n "$model" ]]; then
    model="$vendor $model"
  fi
  serial=$(info_field "$info" 'Serial Number:')
  capacity=$(info_field "$info" 'User Capacity:|Total NVM Capacity:|Namespace 1 Size/Capacity:' \
    | cut -d'[' -f1 | sed 's/[[:space:]]*$//')
  firmware=$(info_field "$info" 'Firmware Version:|Firmware revision:|Revision:')
  # ATA/NVMe report "...test result: PASSED"; SCSI/SAS report
  # "SMART Health Status: OK".
  health_status=$(info_field "$health" 'result:|SMART Health Status:')
  if [[ -z "$health_status" ]]; then
    health_status=$(grep -i "SMART overall-health" <<<"$health" \
      | awk -F'[' '{print $2}' | cut -d']' -f1 | head -n 1)
  fi

  # Get disk type with SAS support
  local disk_type="UNKNOWN"
  if grep -qi "Solid State Device" <<<"$info"; then
    disk_type="SSD"
  elif grep -qi "Rotation Rate" <<<"$info"; then
    disk_type="HDD"
  elif [[ "$disk" == /dev/nvme* ]] || grep -qi "NVMe Version\|NVM Capacity" <<<"$info"; then
    disk_type="SSD"
  elif grep -qi "SCSI\|SAS" <<<"$info"; then
    # SAS drives often don't specify, check rotation rate
    if grep -qi "15000\|10000\|7200" <<<"$info"; then
      disk_type="HDD"
    else
      disk_type="SSD"
    fi
  fi

  # Extract SMART attributes with multiple field attempts for SAS
  local power_on_hours reallocated_sectors pending_sectors
  local total_written host_writes_32mib media_wearout
  local lifetime_gib data_units percent_used

  power_on_hours=$(extract_power_on_hours "$attributes")

  reallocated_sectors=$(smart_attr_column "$attributes" \
    '^(reallocated_sector_ct|reallocated_event_count)' 10)
  if [[ -z "$reallocated_sectors" ]]; then
    # SCSI/SAS style output reports grown defects instead of attribute 5.
    reallocated_sectors=$(labeled_number "$attributes" 'Elements in grown defect list:')
  fi

  pending_sectors=$(smart_attr_column "$attributes" '^current_pending_sector' 10)

  # Only Total_LBAs_Written is a sector count. Other write counters use
  # different units and are converted to 512-byte sectors so the record keeps
  # a single meaning for this field.
  total_written=$(smart_attr_column "$attributes" '^total_lbas_written' 10)
  if [[ -z "$total_written" ]]; then
    lifetime_gib=$(smart_attr_column "$attributes" '^lifetime_writes_gib' 10)
    if [[ -n "$lifetime_gib" ]]; then
      total_written=$((lifetime_gib * 2097152)) # GiB -> 512-byte sectors
    fi
  fi
  if [[ -z "$total_written" ]]; then
    # NVMe "Data Units Written" counts units of 1000 * 512 bytes.
    data_units=$(labeled_number "$attributes" 'Data Units Written:')
    if [[ -n "$data_units" ]]; then
      total_written=$((10#$data_units * 1000))
    fi
  fi
  host_writes_32mib=$(smart_attr_column "$attributes" '^host_writes_32mib' 10)

  # Wear level: use the normalized VALUE column (counts down from 100 as the
  # drive wears), not the raw value, which is vendor specific.
  media_wearout=$(smart_attr_column "$attributes" \
    '^(media_wearout_indicator|wear_leveling_count)' 4)
  if [[ -z "$media_wearout" ]]; then
    # NVMe / SAS SSDs report percentage *used*; convert to remaining.
    percent_used=$(labeled_number "$attributes" 'Percentage Used:')
    if [[ -z "$percent_used" ]]; then
      percent_used=$(labeled_number "$attributes" 'Percentage used endurance indicator:')
    fi
    if [[ -n "$percent_used" ]]; then
      media_wearout=$((100 - 10#$percent_used))
      # 0 means "no data" downstream, so a fully worn drive is reported as 1.
      if ((media_wearout < 1)); then
        media_wearout=1
      fi
    fi
  fi

  echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout"
}

# Function to calculate TBW for SSD
#   $1 - raw Host_Writes_32MiB count (used when $2 is empty or 0)
#   $2 - number of 512-byte sectors written
# Prints terabytes written (decimal TB, two decimals), or "0" without data.
calculate_tbw() {
  local raw_value=$1
  local sectors=$2

  if [[ "$sectors" =~ ^[0-9]+$ ]] && is_positive "$sectors"; then
    awk -v s="$sectors" 'BEGIN { printf "%.2f\n", s * 512 / 1e12 }'
  elif [[ "$raw_value" =~ ^[0-9]+$ ]] && is_positive "$raw_value"; then
    # Each unit is 32 MiB = 33554432 bytes.
    awk -v r="$raw_value" 'BEGIN { printf "%.2f\n", r * 33554432 / 1e12 }'
  else
    echo "0"
  fi
}

# Function to estimate SSD endurance based on model and capacity
#   $1 - model string, $2 - capacity in GB (non-numeric is treated as 0)
# Prints an estimated endurance rating in TB. The tiers are heuristics
# hard-coded here, not values read from the drive.
estimate_ssd_endurance() {
  local disk_model=$1
  local capacity_gb=$2

  [[ "$capacity_gb" =~ ^[0-9]+$ ]] || capacity_gb=0
  capacity_gb=$((10#$capacity_gb))

  # SAS SSDs typically have very high endurance
  if grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD" <<<"$disk_model"; then
    # Enterprise SAS SSDs - very high endurance
    if ((capacity_gb >= 1000)); then
      echo "10000" # 10PB for 1TB+ enterprise SAS SSD
    elif ((capacity_gb >= 600)); then
      echo "6000" # 6PB for 600GB enterprise SAS SSD
    elif ((capacity_gb >= 400)); then
      echo "4000" # 4PB for 400GB enterprise SAS SSD
    else
      echo "2000" # 2PB for smaller enterprise SAS SSD
    fi
  elif grep -qi "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA" <<<"$disk_model"; then
    # Enterprise SATA SSDs
    if ((capacity_gb >= 1000)); then
      echo "1200" # 1.2PB for 1TB enterprise
    elif ((capacity_gb >= 480)); then
      echo "600" # 600TB for 480GB enterprise
    elif ((capacity_gb >= 240)); then
      echo "300" # 300TB for 240GB enterprise
    else
      echo "150" # 150TB for smaller enterprise
    fi
  else
    # Consumer SSDs
    if ((capacity_gb >= 1000)); then
      echo "600" # 600TB for 1TB consumer
    elif ((capacity_gb >= 480)); then
      echo "300" # 300TB for 480GB consumer
    elif ((capacity_gb >= 240)); then
      echo "150" # 150TB for 240GB consumer
    elif ((capacity_gb >= 120)); then
      echo "80" # 80TB for 120GB consumer
    else
      echo "40" # 40TB for smaller drives
    fi
  fi
}

# Function to estimate SSD lifespan with TBW remaining
#   $1 - power-on hours, $2 - TB written so far, $3 - model,
#   $4 - capacity in GB, $5 - percent life remaining (empty or 0 = no data)
# Prints "percent|tbw_remaining|status|source" where source is one of
# media_wearout, tbw, estimated, unknown.
estimate_ssd_lifespan() {
  local power_on_hours=$1
  local tbw_used=$2
  local disk_model=$3
  local capacity_gb=$4
  local media_wearout=$5
  local estimated_endurance tbw_remaining color status
  local lifespan_remaining level

  estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")
  tbw_remaining=$(awk -v e="$estimated_endurance" -v u="$tbw_used" '
    BEGIN { r = e - u; if (r < 0) r = 0; printf "%.2f\n", r }')

  # If we have media wearout indicator, use it for more accurate estimation
  if [[ "$media_wearout" =~ ^[0-9]+$ ]] && ((10#$media_wearout > 0)); then
    media_wearout=$((10#$media_wearout))
    if ((media_wearout <= 10)); then
      color=$RED
      status="Critical wear"
    elif ((media_wearout <= 30)); then
      color=$YELLOW
      status="High wear"
    elif ((media_wearout <= 70)); then
      color=$YELLOW
      status="Moderate wear"
    else
      color=$GREEN
      status="Healthy"
    fi
    echo "${color}${media_wearout}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|media_wearout"
    return
  fi

  if is_positive "$tbw_used"; then
    read -r lifespan_remaining level < <(
      awk -v u="$tbw_used" -v e="$estimated_endurance" '
        BEGIN {
          used = u * 100 / e
          left = 100 - used
          if (left < 0) left = 0
          if (used >= 80) lvl = "high"
          else if (used >= 50) lvl = "moderate"
          else lvl = "healthy"
          printf "%.1f %s\n", left, lvl
        }'
    )
    case "$level" in
      high)
        color=$RED
        status="High wear"
        ;;
      moderate)
        color=$YELLOW
        status="Moderate wear"
        ;;
      *)
        color=$GREEN
        status="Healthy"
        ;;
    esac
    echo "${color}${lifespan_remaining}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|tbw"
    return
  fi

  # No wear or write data: only call the drive "New" when it reports hours.
  if [[ ! "$power_on_hours" =~ ^[0-9]+$ ]] || ((10#$power_on_hours == 0)); then
    echo "Unknown|Unknown|Unknown|unknown"
    return
  fi

  echo "Unknown|${estimated_endurance} TB|New|estimated"
}

# Function to estimate HDD lifespan
#   $1 - power-on hours, $2 - reallocated sectors, $3 - pending sectors
# Sector problems are reported even when power-on hours are unknown.
estimate_hdd_lifespan() {
  local power_on_hours=$1
  local reallocated_sectors=$2
  local pending_sectors=$3

  [[ "$reallocated_sectors" =~ ^[0-9]+$ ]] || reallocated_sectors=0
  [[ "$pending_sectors" =~ ^[0-9]+$ ]] || pending_sectors=0
  reallocated_sectors=$((10#$reallocated_sectors))
  pending_sectors=$((10#$pending_sectors))

  if ((pending_sectors > 0)); then
    echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)"
  elif ((reallocated_sectors > 100)); then
    echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)"
  elif ((reallocated_sectors > 10)); then
    echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)"
  elif [[ ! "$power_on_hours" =~ ^[0-9]+$ ]]; then
    echo "Unknown"
  elif ((10#$power_on_hours > 40000)); then
    echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)"
  elif ((10#$power_on_hours > 25000)); then
    echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)"
  else
    echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)"
  fi
}

# Function to check a single disk with enhanced error handling
#   $1 - device path, $2 - controller type ("" / "direct" / e.g. "megaraid,0")
# Prints a human-readable report for the disk to stdout.
check_disk() {
  local disk=$1
  local controller=$2
  local access_level type_hint=""
  local disk_info model serial capacity firmware health_status disk_type
  local power_on_hours reallocated_sectors pending_sectors
  local total_written host_writes_32mib media_wearout

  if [[ -n "$controller" && "$controller" != "direct" ]]; then
    type_hint="-d $controller "
  fi

  print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
  echo "=================================================="

  # Test SMART access level
  access_level=$(test_smart_access "$disk" "$controller")

  case "$access_level" in
    "no_access")
      print_color "$RED" "ERROR: Cannot access disk through controller"
      echo "Possible reasons:"
      echo " - Controller doesn't support SMART passthrough"
      echo " - Disk is part of a hardware RAID array"
      echo " - Insufficient permissions (try running as root)"
      echo " - Controller busy or offline"
      echo ""
      return
      ;;
    "not_available")
      print_color "$YELLOW" "SMART not available on this disk"
      echo "This disk does not support SMART monitoring"
      echo ""
      return
      ;;
    "disabled")
      print_color "$YELLOW" "SMART is disabled on this disk"
      echo "SMART is available but currently disabled"
      echo "To enable manually: smartctl -s on ${type_hint}$disk"
      echo ""
      return
      ;;
    "no_attributes")
      print_color "$YELLOW" "WARNING: Cannot read SMART attributes"
      echo "This is common with hardware RAID controllers like PERC H730P"
      echo "Try checking through the RAID management interface"
      echo ""
      return
      ;;
    "limited_attributes")
      print_color "$YELLOW" "NOTE: Limited SMART data available"
      echo "Controller is filtering some SMART attributes"
      ;;
  esac

  # Get disk information
  disk_info=$(get_disk_info "$disk" "$controller")
  IFS='|' read -r model serial capacity firmware health_status disk_type \
    power_on_hours reallocated_sectors pending_sectors \
    total_written host_writes_32mib media_wearout <<<"$disk_info"

  # Display basic information
  echo "Model: ${model:-Unknown}"
  echo "Serial: ${serial:-Unknown}"
  echo "Type: $disk_type"
  echo "Capacity: ${capacity:-Unknown}"
  echo "Firmware: ${firmware:-Unknown}"
  echo "Health: ${health_status:-Unknown}"

  # Only show power on hours if available
  if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
    echo "Power On Hours: $power_on_hours"
  else
    echo "Power On Hours: Unknown"
  fi

  # Disk type specific analysis
  if [[ "$disk_type" == "SSD" ]]; then
    local tbw_used=0
    local capacity_gb lifespan_info lifespan_percent tbw_remaining
    local wear_status wear_source

    if [[ -n "$total_written" && "$total_written" != "0" ]]; then
      tbw_used=$(calculate_tbw "" "$total_written")
    elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
      tbw_used=$(calculate_tbw "$host_writes_32mib" "")
    fi

    if is_positive "$tbw_used"; then
      echo "TBW Used: ${tbw_used} TB"
    fi

    # Estimate capacity for endurance calculation
    capacity_gb=$(capacity_to_gb "$capacity")

    lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" \
      "$model" "$capacity_gb" "$media_wearout")
    IFS='|' read -r lifespan_percent tbw_remaining wear_status wear_source \
      <<<"$lifespan_info"

    if [[ "$wear_source" != "media_wearout" ]] && is_positive "$tbw_used"; then
      echo "TBW Remaining: $tbw_remaining"
    fi

    echo "Lifespan: $lifespan_percent ($wear_status)"

    # Show wear source if available
    if [[ "$wear_source" == "media_wearout" ]]; then
      echo "Wear Source: Media Wearout Indicator"
    elif [[ "$wear_source" == "tbw" ]]; then
      echo "Wear Source: TBW Calculation"
    elif [[ "$wear_source" == "estimated" ]]; then
      echo "Wear Source: Estimated Endurance"
    fi
  elif [[ "$disk_type" == "HDD" ]]; then
    local lifespan

    if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
      echo "Realloc Sectors: $reallocated_sectors"
    fi

    if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
      echo "Pending Sectors: $pending_sectors"
    fi

    lifespan=$(estimate_hdd_lifespan "$power_on_hours" \
      "${reallocated_sectors:-0}" "${pending_sectors:-0}")
    echo "Lifespan: $lifespan"
  else
    print_color "$YELLOW" "Limited information available for this disk type"
    echo "This is normal for hardware RAID configurations like PERC H730P"
    echo "For detailed SAS drive information, use controller management tools"
  fi

  echo ""
}

# Function to detect RAID controllers and disks with PERC H730P support
# Prints one "device:controller" entry per line on stdout. Progress messages
# go to stderr so that callers capturing stdout receive only the entries.
detect_raid_disks() {
  # Controller types that take a single numeric index ("TYPE,N").
  local -a controllers=("megaraid" "cciss" "areca" "3ware")
  local -a disks=()
  local disk controller i base_disk

  # Check for direct disks first - only main devices, no partitions.
  # /dev/sg* nodes are not listed here: they are character devices and were
  # never matched by the block-device test. They can still be given
  # explicitly on the command line.
  for disk in /dev/sd[a-z] /dev/sd[a-z][a-z] /dev/nvme[0-9]n[0-9]; do
    if [[ -b "$disk" ]]; then
      disks+=("$disk:direct")
    fi
  done

  # Check for RAID controllers with enhanced detection
  for controller in "${controllers[@]}"; do
    print_color "$BLUE" "Scanning for $controller controllers..." >&2
    for i in {0..31}; do
      # Try different disk devices for each controller
      for base_disk in "/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1"; do
        if [[ -b "$base_disk" || -c "$base_disk" ]]; then
          if run_smartctl "$controller,$i" -i "$base_disk" &>/dev/null; then
            disks+=("$base_disk:$controller,$i")
            print_color "$GREEN" " Found $controller,$i on $base_disk" >&2
            break
          fi
        fi
      done
    done
  done

  # Special note for PERC H730P: "perc-h730p" is not a smartctl device type,
  # so no extra entry is added; disks behind it are probed by the megaraid
  # scan above.
  print_color "$BLUE" "Scanning for PERC H730P controllers..." >&2
  if command_exists storcli; then
    print_color "$GREEN" " storcli detected - PERC disks are probed via megaraid,N" >&2
  fi

  if ((${#disks[@]} > 0)); then
    printf '%s\n' "${disks[@]}"
  fi
}

# Main function
#   $@ - optional list of device paths; auto-detects when none are given
main() {
  local -a disks=()
  local disk disk_info controller

  print_color "$BLUE" "Harvester OS Disk Health Check Script v$VERSION"
  print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
  print_color "$BLUE" "============================================"
  echo ""

  if ! command_exists smartctl; then
    print_color "$RED" "Error: smartctl is not installed. Please install smartmontools package." >&2
    exit 1
  fi

  # If specific disk provided, check only that disk
  if [[ $# -gt 0 ]]; then
    for disk in "$@"; do
      # Accept character devices too so /dev/sgN can be checked directly.
      if [[ -b "$disk" || -c "$disk" ]]; then
        disks+=("$disk:direct")
      else
        print_color "$RED" "Error: $disk is not a valid block or character device" >&2
      fi
    done
  else
    # Auto-detect disks
    print_color "$CYAN" "Auto-detecting disks and RAID controllers..."
    mapfile -t disks < <(detect_raid_disks)
  fi

  if [[ ${#disks[@]} -eq 0 ]]; then
    print_color "$RED" "No disks found or accessible" >&2
    echo "Try running as root: sudo $0" >&2
    exit 1
  fi

  print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
  echo ""

  # Check if running as root
  if [[ $EUID -ne 0 ]]; then
    print_color "$YELLOW" "Warning: Not running as root."
    print_color "$YELLOW" "Some disks/controllers may show limited information."
    echo ""
  fi

  # Check each disk
  for disk_info in "${disks[@]}"; do
    IFS=':' read -r disk controller <<<"$disk_info"
    check_disk "$disk" "$controller"
  done

  print_color "$BLUE" "Check completed!"
  echo ""
  print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
  print_color "$CYAN" " - Use 'storcli /c0 show all' for detailed information"
  print_color "$CYAN" " - Use 'storcli /c0/eall/sall show' for physical disk status"
  print_color "$CYAN" " - Hardware RAID controllers often limit SMART data access"
}

# Usage information
usage() {
  echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]"
  echo ""
  echo "If no disks specified, auto-detects all available disks and RAID arrays"
  echo ""
  echo "Examples:"
  echo " sudo $SCRIPT_NAME # Check all disks (recommended)"
  echo " $SCRIPT_NAME /dev/sda # Check specific disk"
  echo " $SCRIPT_NAME /dev/sg0 # Check SAS disk directly"
  echo " $SCRIPT_NAME /dev/sda /dev/sdb # Check multiple disks"
}

# Parse command line arguments
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
  -v | --version)
    echo "$SCRIPT_NAME version $VERSION"
    exit 0
    ;;
  *)
    main "$@"
    ;;
esac