Files
disk-health/old/obsolete/harvester-v2.4.sh
2025-10-22 15:02:09 +08:00

543 lines
20 KiB
Bash

#!/bin/bash
# Disk Health Check Script for Harvester OS
# Enhanced with SAS/PERC H730P controller support
# Checks SSD TBW/lifespan and HDD health status
SCRIPT_NAME=$(basename -- "$0")
VERSION="2.4"

# Color codes.
# tput is only queried when it is installed and able to produce a color
# sequence for the current terminal. Otherwise every code is the empty string,
# so output stays plain and no tput error text is written to stderr.
if command -v tput >/dev/null 2>&1 && tput setaf 1 >/dev/null 2>&1; then
  RED=$(tput setaf 1)
  GREEN=$(tput setaf 2)
  YELLOW=$(tput setaf 3)
  BLUE=$(tput setaf 4)
  CYAN=$(tput setaf 6)
  NC=$(tput sgr0)
else
  RED=""
  GREEN=""
  YELLOW=""
  BLUE=""
  CYAN=""
  NC=""
fi
# Print a message wrapped in a color sequence.
# Arguments:
#   $1 - color sequence written before the message
#   $2 - message text (backslash escapes are interpreted by echo -e)
# Globals:
#   NC (read) - sequence appended after the message
print_color() {
  local tint=$1 text=$2
  local rendered="${tint}${text}${NC}"
  echo -e "$rendered"
}
# Succeed (status 0) when the name in $1 resolves to something runnable
# according to `command -v`; all output of the lookup is discarded.
command_exists() {
  local tool=$1
  command -v "$tool" &>/dev/null
}
# Stop immediately when smartctl cannot be found: print the error in red and
# leave with status 1.
command_exists smartctl || {
  print_color "$RED" "Error: smartctl is not installed. Please install smartmontools package."
  exit 1
}
# Function to test SMART access and get available data
#
# Probes how much SMART data smartctl can read from a device. Only read-only
# smartctl calls (-i and -A) are issued; SMART is never switched on here.
# Arguments:
#   $1 - device path
#   $2 - optional smartctl device type for -d (e.g. megaraid,0). Empty, or the
#        placeholder "direct" used in the script's disk list, means that no -d
#        option is passed.
# Outputs (stdout), exactly one of:
#   no_access          - `smartctl -i` returned a non-zero status
#   not_available      - no "SMART support is: Available" line in the -i output
#   disabled           - no "SMART support is: Enabled" line in the -i output
#   no_attributes      - `smartctl -A` printed nothing
#   limited_attributes - no Power_On_Hours / Power-On raw value was found
#   full_access        - everything above succeeded
test_smart_access() {
  local disk=$1
  local controller=$2

  # "direct" is a label for "no controller", not a smartctl device type.
  if [[ "$controller" == "direct" ]]; then
    controller=""
  fi

  # Build the command as an array so the device type stays a single argument.
  local -a smart_cmd=(smartctl)
  if [[ -n "$controller" ]]; then
    smart_cmd+=(-d "$controller")
  fi

  # Test basic SMART access and keep the output for the checks below.
  local smart_info
  if ! smart_info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null); then
    echo "no_access"
    return
  fi

  # Check if SMART is available and enabled (don't enable it, just check).
  # smartctl prints two separate "SMART support is:" lines, one carrying the
  # availability and one carrying the enabled state, so each is matched on its
  # own line. The match is case-sensitive so "Unavailable" does not count.
  if ! grep -q "SMART support is:[[:space:]]*Available" <<< "$smart_info"; then
    echo "not_available"
    return
  fi
  if ! grep -q "SMART support is:[[:space:]]*Enabled" <<< "$smart_info"; then
    echo "disabled"
    return
  fi

  # Test attribute reading
  local attributes
  attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
  if [[ -z "$attributes" ]]; then
    echo "no_attributes"
    return
  fi

  # Check if we have basic attributes (column 10 is where RAW_VALUE starts).
  local power_on_hours
  power_on_hours=$(grep -i "Power_On_Hours\|Power-On" <<< "$attributes" \
    | awk '{print $10}' | head -1)
  if [[ -z "$power_on_hours" ]]; then
    echo "limited_attributes"
    return
  fi

  echo "full_access"
}
# Helper: from the text in $1, take the first line matching the
# case-insensitive grep pattern $2 and print the text between its first and
# second ':' with leading blanks removed.
_smart_info_value() {
  grep -i -- "$2" <<< "$1" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1
}

# Helper: print whitespace-separated column $3 (default 10) of the first line
# of $1 matching the case-insensitive grep pattern $2. In smartctl's ATA
# attribute table column 4 is VALUE and column 10 is the start of RAW_VALUE.
_smart_attr_column() {
  grep -i -- "$2" <<< "$1" | awk -v col="${3:-10}" '{print $col}' | head -1
}

# Function to get disk information with enhanced SAS support
#
# Arguments:
#   $1 - device path
#   $2 - optional smartctl device type for -d; empty or the placeholder
#        "direct" means no -d option is passed
# Outputs (stdout): one line of 12 '|'-separated fields
#   model|serial|capacity|firmware|health_status|disk_type|power_on_hours|
#   reallocated_sectors|pending_sectors|total_written|host_writes_32mib|
#   media_wearout
# Fields that could not be found are left empty (disk_type is "UNKNOWN").
get_disk_info() {
  local disk=$1
  local controller=$2

  # "direct" is a label for "no controller", not a smartctl device type.
  if [[ "$controller" == "direct" ]]; then
    controller=""
  fi
  local -a smart_cmd=(smartctl)
  if [[ -n "$controller" ]]; then
    smart_cmd+=(-d "$controller")
  fi

  local info attributes health
  info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)
  attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
  health=$("${smart_cmd[@]}" -H "$disk" 2>/dev/null)

  # Extract information with multiple fallbacks for SAS drives
  local model vendor serial capacity firmware health_status
  model=$(_smart_info_value "$info" "Device Model:\|Product:")
  vendor=$(_smart_info_value "$info" "Vendor:")
  if [[ -n "$vendor" && -n "$model" ]]; then
    model="$vendor $model"
  fi
  serial=$(_smart_info_value "$info" "Serial Number:\|Serial number:")
  # Keep only the part before the bracketed human-readable size.
  capacity=$(_smart_info_value "$info" "User Capacity:\|Total NVM Capacity:" | cut -d'[' -f1)
  firmware=$(_smart_info_value "$info" "Firmware Version:\|Firmware revision:\|Revision:")
  # "SMART Health Status:" is the wording smartctl uses for SCSI/SAS devices.
  health_status=$(_smart_info_value "$health" "result:\|SMART overall-health\|SMART Health Status:")
  if [[ -z "$health_status" ]]; then
    health_status=$(grep -i "SMART overall-health" <<< "$health" \
      | awk -F'[' '{print $2}' | cut -d']' -f1)
  fi

  # Get disk type with SAS support
  local disk_type="UNKNOWN"
  if grep -qi "Solid State Device" <<< "$info"; then
    disk_type="SSD"
  elif grep -qi "Rotation Rate" <<< "$info"; then
    disk_type="HDD"
  elif grep -qi "SCSI\|SAS" <<< "$info"; then
    # SAS drives often don't specify, check rotation rate
    if grep -qi "15000\|10000\|7200" <<< "$info"; then
      disk_type="HDD"
    else
      disk_type="SSD"
    fi
  fi

  # Extract SMART attributes. Raw values are read from column 10 rather than
  # the last column, because a raw value may be followed by extra tokens.
  local power_on_hours reallocated_sectors pending_sectors
  power_on_hours=$(_smart_attr_column "$attributes" "Power_On_Hours\|Power-On")
  # Keep only the leading digits (e.g. "12345h+30m+10.5s" becomes "12345").
  power_on_hours=${power_on_hours%%[!0-9]*}
  reallocated_sectors=$(_smart_attr_column "$attributes" "Reallocated_Sector_Ct\|Reallocated_Event_Count")
  pending_sectors=$(_smart_attr_column "$attributes" "Current_Pending_Sector")

  # Host_Writes_32MiB is reported only in its own field: it counts 32 MiB
  # units and must not be mistaken for a sector count.
  # NOTE(review): a Lifetime_Writes* attribute may also not be in sectors -
  # confirm its unit for the drives in use.
  local total_written host_writes_32mib
  total_written=$(_smart_attr_column "$attributes" "Total_LBAs_Written\|Lifetime_Writes")
  host_writes_32mib=$(_smart_attr_column "$attributes" "Host_Writes_32MiB")

  # Wear indicators: use the normalized VALUE column (a remaining-life style
  # figure) instead of the raw counter, and drop leading zeros ("099" -> 99)
  # so later integer comparisons do not treat it as octal.
  local media_wearout
  media_wearout=$(_smart_attr_column "$attributes" "Media_Wearout_Indicator\|Wear_Leveling_Count" 4)
  if [[ "$media_wearout" =~ ^[0-9]+$ ]]; then
    media_wearout=$((10#$media_wearout))
  fi

  echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout"
}
# Function to calculate TBW for SSD
#
# Converts a write counter into decimal terabytes written, two decimals.
# Arguments:
#   $1 - counter in units of 32 MiB (used only when $2 is not usable)
#   $2 - counter in 512-byte sectors (takes precedence)
# Outputs (stdout): terabytes written such as "0.51", or "0" when neither
#   argument is a positive integer.
# The arithmetic is done with awk, which the script already relies on.
calculate_tbw() {
  local raw_value=$1
  local sectors=$2
  local positive_int='^0*[1-9][0-9]*$'

  if [[ "$sectors" =~ $positive_int ]]; then
    # sectors * 512 bytes, expressed in units of 10^12 bytes
    LC_ALL=C awk -v n="$sectors" \
      'BEGIN { printf "%.2f\n", n * 512 / 1000000000000 }'
  elif [[ "$raw_value" =~ $positive_int ]]; then
    # one unit is 32 MiB = 33554432 bytes
    LC_ALL=C awk -v n="$raw_value" \
      'BEGIN { printf "%.2f\n", n * 33554432 / 1000000000000 }'
  else
    echo "0"
  fi
}
# Estimate the rated endurance of an SSD in TB from its model and capacity.
# Arguments:
#   $1 - model string, matched case-insensitively against vendor patterns
#   $2 - capacity in GB
# Outputs (stdout): endurance figure in TB.
# The model selects one of three classes (enterprise SAS, enterprise SATA,
# consumer). Each class is a list of "minimum_gb:endurance_tb" tiers checked
# from largest to smallest, plus a value for drives below every tier.
estimate_ssd_endurance() {
  local disk_model=$1
  local capacity_gb=$2
  local -a tiers
  local below_all_tiers

  if echo "$disk_model" | grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD"; then
    # Enterprise SAS SSDs - very high endurance (10PB / 6PB / 4PB, else 2PB)
    tiers=(1000:10000 600:6000 400:4000)
    below_all_tiers=2000
  elif echo "$disk_model" | grep -qi "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA"; then
    # Enterprise SATA SSDs (1.2PB / 600TB / 300TB, else 150TB)
    tiers=(1000:1200 480:600 240:300)
    below_all_tiers=150
  else
    # Consumer SSDs (600TB / 300TB / 150TB / 80TB, else 40TB)
    tiers=(1000:600 480:300 240:150 120:80)
    below_all_tiers=40
  fi

  local tier
  for tier in "${tiers[@]}"; do
    if [[ $capacity_gb -ge ${tier%%:*} ]]; then
      echo "${tier#*:}"
      return
    fi
  done
  echo "$below_all_tiers"
}
# Function to estimate SSD lifespan with TBW remaining
#
# Arguments:
#   $1 - power-on hours; when empty or zero nothing is estimated
#   $2 - terabytes written so far (anything that is not a plain non-negative
#        number is treated as 0)
#   $3 - model string, passed to estimate_ssd_endurance
#   $4 - capacity in GB, passed to estimate_ssd_endurance
#   $5 - wear indicator as an integer, where small values mean heavy wear
#        (leading zeros are accepted)
# Globals: RED, YELLOW, GREEN, NC (read) - color sequences embedded in output
# Outputs (stdout): one record "lifespan|tbw_remaining|status|source" where
#   source is "media_wearout", "tbw", "estimated", or empty when unknown.
# Remaining TBW and remaining percentage are never reported below zero.
# The arithmetic is done with awk, which the script already relies on.
estimate_ssd_lifespan() {
  local power_on_hours=$1
  local tbw_used=$2
  local disk_model=$3
  local capacity_gb=$4
  local media_wearout=$5

  # Same four-field layout as every other result, so field positions line up.
  if [[ -z "$power_on_hours" || "$power_on_hours" =~ ^0+$ ]]; then
    echo "Unknown|Unknown|Unknown|"
    return
  fi

  local number_re='^[0-9]*[.]?[0-9]+$'
  [[ "$tbw_used" =~ $number_re ]] || tbw_used=0

  local estimated_endurance
  estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")
  if [[ "$estimated_endurance" =~ ^[0-9]+$ ]]; then
    estimated_endurance=$((10#$estimated_endurance))
  else
    estimated_endurance=0
  fi

  local tbw_remaining
  tbw_remaining=$(LC_ALL=C awk -v total="$estimated_endurance" -v used="$tbw_used" \
    'BEGIN { left = total - used; if (left < 0) left = 0; printf "%.2f", left }')

  local tint status

  # If we have media wearout indicator, use it for more accurate estimation
  if [[ "$media_wearout" =~ ^[0-9]+$ ]] && ((10#$media_wearout != 0)); then
    local wear=$((10#$media_wearout))
    if ((wear <= 10)); then
      tint=$RED
      status="Critical wear"
    elif ((wear <= 30)); then
      tint=$YELLOW
      status="High wear"
    elif ((wear <= 70)); then
      tint=$YELLOW
      status="Moderate wear"
    else
      tint=$GREEN
      status="Healthy"
    fi
    # Report the measured figure in every band, including the critical one.
    echo "${tint}${wear}%${NC}|${tint}${tbw_remaining} TB${NC}|${tint}${status}${NC}|media_wearout"
    return
  fi

  # Fall back to written volume versus estimated endurance. awk prints
  # nothing when no data has been written, which leaves "band" empty.
  local lifespan_remaining="" band=""
  if ((estimated_endurance > 0)); then
    read -r lifespan_remaining band < <(
      LC_ALL=C awk -v total="$estimated_endurance" -v used="$tbw_used" 'BEGIN {
        used += 0
        if (used <= 0) exit
        pct = used * 100 / total
        left = 100 - pct
        if (left < 0) left = 0
        band = "healthy"
        if (pct >= 80) band = "high"
        else if (pct >= 50) band = "moderate"
        printf "%.1f %s\n", left, band
      }'
    )
  fi

  case "$band" in
    high)
      tint=$RED
      status="High wear"
      ;;
    moderate)
      tint=$YELLOW
      status="Moderate wear"
      ;;
    healthy)
      tint=$GREEN
      status="Healthy"
      ;;
    *)
      echo "Unknown|${estimated_endurance} TB|New|estimated"
      return
      ;;
  esac
  echo "${tint}${lifespan_remaining}%${NC}|${tint}${tbw_remaining} TB${NC}|${tint}${status}${NC}|tbw"
}
# Give a rough remaining-life verdict for a hard disk.
# Arguments:
#   $1 - power-on hours; when empty the verdict is "Unknown"
#   $2 - reallocated sector count (defaults to 0)
#   $3 - pending sector count (defaults to 0)
# Globals: RED, YELLOW, GREEN, NC (read) - color sequences embedded in output
# Outputs (stdout): a colored verdict followed by the reason in parentheses.
# Checks run from most to least severe: pending sectors, then reallocated
# sectors (>100, >10), then power-on hours (>40000, >25000).
estimate_hdd_lifespan() {
  local power_on_hours=$1
  local reallocated_sectors=${2:-0}
  local pending_sectors=${3:-0}

  if [[ -z "$power_on_hours" ]]; then
    echo "Unknown"
    return
  fi

  local tint verdict reason
  if [[ "$pending_sectors" -gt 0 ]]; then
    tint=$RED
    verdict="CRITICAL"
    reason="Pending sectors: $pending_sectors"
  elif [[ "$reallocated_sectors" -gt 100 ]]; then
    tint=$RED
    verdict="< 6 months"
    reason="High reallocated sectors: $reallocated_sectors"
  elif [[ "$reallocated_sectors" -gt 10 ]]; then
    tint=$YELLOW
    verdict="6-12 months"
    reason="Reallocated sectors: $reallocated_sectors"
  elif [[ "$power_on_hours" -gt 40000 ]]; then
    tint=$YELLOW
    verdict="1-2 years"
    reason="High usage: $power_on_hours hours"
  elif [[ "$power_on_hours" -gt 25000 ]]; then
    tint=$GREEN
    verdict="2-3 years"
    reason="Moderate usage: $power_on_hours hours"
  else
    tint=$GREEN
    verdict="> 3 years"
    reason="Low usage: $power_on_hours hours"
  fi

  echo "${tint}${verdict}${NC} (${reason})"
}
# Helper: convert a capacity string to whole decimal gigabytes.
# Accepts "<n> GB", "<n> TB", or a byte count with any thousands separators
# (for example "500,107,862,016 bytes"). Prints 0 when no byte count is found.
_capacity_to_gb() {
  local capacity=$1
  local digits

  if grep -qi "GB" <<< "$capacity"; then
    grep -o '[0-9.]*' <<< "$capacity" | head -1 | cut -d. -f1
  elif grep -qi "TB" <<< "$capacity"; then
    grep -o '[0-9.]*' <<< "$capacity" | head -1 \
      | LC_ALL=C awk '{printf "%.0f\n", $1 * 1000}'
  else
    # No unit present: treat the digits as a byte count.
    digits=${capacity//[!0-9]/}
    if [[ -n "$digits" && ${#digits} -le 18 ]]; then
      echo $((10#$digits / 1000000000))
    else
      echo 0
    fi
  fi
}

# Helper: succeed when $1 is a plain decimal number greater than zero.
_is_positive_number() {
  local number_re='^[0-9]*[.]?[0-9]+$'
  [[ "$1" =~ $number_re ]] \
    && LC_ALL=C awk -v v="$1" 'BEGIN { exit !(v + 0 > 0) }'
}

# Function to check a single disk with enhanced error handling
#
# Prints a report for one device: identity, health, and a type-specific
# wear/lifespan section.
# Arguments:
#   $1 - device path
#   $2 - optional smartctl device type; empty or the placeholder "direct"
#        means the device is queried without a -d option
# Globals: CYAN, RED, YELLOW (read) - colors handed to print_color
# Outputs: the report on stdout. Returns early, after explaining why, when
#   test_smart_access reports no_access, not_available, disabled or
#   no_attributes.
check_disk() {
  local disk=$1
  local controller=$2
  print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
  echo "=================================================="

  # "direct" is only a display label; it is not a smartctl device type, so it
  # must not be forwarded to the helpers that build the smartctl command.
  if [[ "$controller" == "direct" ]]; then
    controller=""
  fi

  # Test SMART access level
  local access_level
  access_level=$(test_smart_access "$disk" "$controller")
  case "$access_level" in
    "no_access")
      print_color "$RED" "ERROR: Cannot access disk through controller"
      echo "Possible reasons:"
      echo " - Controller doesn't support SMART passthrough"
      echo " - Disk is part of a hardware RAID array"
      echo " - Insufficient permissions (try running as root)"
      echo " - Controller busy or offline"
      echo ""
      return
      ;;
    "not_available")
      print_color "$YELLOW" "SMART not available on this disk"
      echo "This disk does not support SMART monitoring"
      echo ""
      return
      ;;
    "disabled")
      print_color "$YELLOW" "SMART is disabled on this disk"
      echo "SMART is available but currently disabled"
      echo "To enable manually: smartctl -s on ${controller:+-d $controller} $disk"
      echo ""
      return
      ;;
    "no_attributes")
      print_color "$YELLOW" "WARNING: Cannot read SMART attributes"
      echo "This is common with hardware RAID controllers like PERC H730P"
      echo "Try checking through the RAID management interface"
      echo ""
      return
      ;;
    "limited_attributes")
      # Not fatal: continue and report whatever can be read.
      print_color "$YELLOW" "NOTE: Limited SMART data available"
      echo "Controller is filtering some SMART attributes"
      ;;
  esac

  # Get disk information (declared local so nothing leaks to the caller)
  local disk_info
  disk_info=$(get_disk_info "$disk" "$controller")
  local model serial capacity firmware health_status disk_type power_on_hours
  local reallocated_sectors pending_sectors total_written host_writes_32mib
  local media_wearout
  IFS='|' read -r model serial capacity firmware health_status disk_type \
    power_on_hours reallocated_sectors pending_sectors total_written \
    host_writes_32mib media_wearout <<< "$disk_info"

  # Display basic information
  echo "Model: ${model:-Unknown}"
  echo "Serial: ${serial:-Unknown}"
  echo "Type: $disk_type"
  echo "Capacity: ${capacity:-Unknown}"
  echo "Firmware: ${firmware:-Unknown}"
  echo "Health: ${health_status:-Unknown}"

  # Only show power on hours if available
  if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
    echo "Power On Hours: $power_on_hours"
  else
    echo "Power On Hours: Unknown"
  fi

  # Disk type specific analysis
  if [[ "$disk_type" == "SSD" ]]; then
    local tbw_used=0
    if [[ -n "$total_written" && "$total_written" != "0" ]]; then
      tbw_used=$(calculate_tbw "" "$total_written")
    elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
      tbw_used=$(calculate_tbw "$host_writes_32mib" "")
    fi

    local has_tbw=0
    if _is_positive_number "$tbw_used"; then
      has_tbw=1
      echo "TBW Used: ${tbw_used} TB"
    fi

    # Estimate capacity for endurance calculation
    local capacity_gb
    capacity_gb=$(_capacity_to_gb "$capacity")
    capacity_gb=${capacity_gb:-0}

    local lifespan_info
    lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$model" "$capacity_gb" "$media_wearout")
    # Fields 1-4 of the record; anything after the fourth '|' is ignored.
    local lifespan_percent tbw_remaining wear_status wear_source _
    IFS='|' read -r lifespan_percent tbw_remaining wear_status wear_source _ <<< "$lifespan_info"

    if [[ "$wear_source" != "media_wearout" ]] && ((has_tbw)); then
      echo "TBW Remaining: $tbw_remaining"
    fi
    echo "Lifespan: $lifespan_percent ($wear_status)"

    # Show wear source if available
    case "$wear_source" in
      media_wearout) echo "Wear Source: Media Wearout Indicator" ;;
      tbw) echo "Wear Source: TBW Calculation" ;;
      estimated) echo "Wear Source: Estimated Endurance" ;;
    esac
  elif [[ "$disk_type" == "HDD" ]]; then
    if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
      echo "Realloc Sectors: $reallocated_sectors"
    fi
    if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
      echo "Pending Sectors: $pending_sectors"
    fi
    local lifespan
    lifespan=$(estimate_hdd_lifespan "$power_on_hours" "${reallocated_sectors:-0}" "${pending_sectors:-0}")
    echo "Lifespan: $lifespan"
  else
    print_color "$YELLOW" "Limited information available for this disk type"
    echo "This is normal for hardware RAID configurations like PERC H730P"
    echo "For detailed SAS drive information, use controller management tools"
  fi
  echo ""
}
# Function to detect RAID controllers and disks with PERC H730P support
#
# Outputs (stdout): exactly one line holding space-separated
#   "device:controller" entries, where controller is "direct" or a smartctl
#   device type such as "megaraid,3". The line is empty when nothing is found.
# Progress messages are written to stderr, because the caller parses stdout
# and anything else printed there would be mistaken for disk entries.
detect_raid_disks() {
  local controllers=("megaraid" "cciss" "areca" "3ware" "hpt" "auto")
  local disks=()
  local disk controller i base_disk

  # Check for direct disks first - only main devices, no partitions
  for disk in /dev/sd[a-z]; do
    if [[ -b "$disk" ]]; then
      disks+=("$disk:direct")
    fi
  done

  # Check for NVMe disks - only main devices, no partitions
  for disk in /dev/nvme[0-9]n[0-9]; do
    if [[ -b "$disk" ]]; then
      disks+=("$disk:direct")
    fi
  done

  # Check for SAS disks directly via SCSI generic
  # NOTE(review): this only accepts block devices; if /dev/sg* nodes are
  # character devices on the target system nothing is added here - confirm
  # whether that is intended before changing the test.
  for disk in /dev/sg[0-9]; do
    if [[ -b "$disk" ]]; then
      disks+=("$disk:direct")
    fi
  done

  # Check for RAID controllers with enhanced detection
  for controller in "${controllers[@]}"; do
    print_color "$BLUE" "Scanning for $controller controllers..." >&2
    for i in {0..31}; do
      # Try different disk devices for each controller; the first device that
      # answers for this controller slot wins.
      for base_disk in "/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1"; do
        if [[ -b "$base_disk" ]]; then
          if smartctl -d "$controller,$i" -i "$base_disk" &>/dev/null; then
            disks+=("$base_disk:$controller,$i")
            print_color "$GREEN" " Found $controller,$i on $base_disk" >&2
            break
          fi
        fi
      done
    done
  done

  # Special detection for PERC H730P
  # Only a notice is printed. No "perc-h730p" entry is added to the list,
  # because every entry's controller part is later handed to `smartctl -d`
  # and that name would only produce an access error for /dev/sda.
  print_color "$BLUE" "Scanning for PERC H730P controllers..." >&2
  if command_exists storcli; then
    print_color "$GREEN" " storcli detected - checking PERC H730P" >&2
  fi

  echo "${disks[@]}"
}
# Main function
#
# Arguments: zero or more device paths. With none, disks are auto-detected
#   through detect_raid_disks.
# Globals: VERSION, BLUE, RED, GREEN, YELLOW, CYAN (read)
# Exits with status 1 when smartctl is missing or no usable disk remains.
main() {
  print_color "$BLUE" "Harvester OS Disk Health Check Script v$VERSION"
  print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
  print_color "$BLUE" "============================================"
  echo ""

  if ! command_exists smartctl; then
    print_color "$RED" "Error: smartctl is not installed. Please install smartmontools package."
    exit 1
  fi

  local disks=()
  local disk controller disk_info

  # If specific disk provided, check only that disk
  if [[ $# -gt 0 ]]; then
    for disk in "$@"; do
      # Accept character devices too, since the usage text offers /dev/sg0.
      if [[ -b "$disk" || -c "$disk" ]]; then
        disks+=("$disk:direct")
      else
        print_color "$RED" "Error: $disk is not a valid block or character device"
      fi
    done
  else
    # Auto-detect disks
    print_color "$CYAN" "Auto-detecting disks and RAID controllers..."
    # The disk list is the last line detect_raid_disks writes to stdout; any
    # earlier lines are progress text and must not be parsed as entries.
    read -ra disks <<< "$(detect_raid_disks | tail -n 1)"
  fi

  if [[ ${#disks[@]} -eq 0 ]]; then
    print_color "$RED" "No disks found or accessible"
    echo "Try running as root: sudo $0"
    exit 1
  fi

  print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
  echo ""

  # Check if running as root
  if [[ $EUID -ne 0 ]]; then
    print_color "$YELLOW" "Warning: Not running as root."
    print_color "$YELLOW" "Some disks/controllers may show limited information."
    echo ""
  fi

  # Check each disk
  for disk_info in "${disks[@]}"; do
    IFS=':' read -r disk controller <<< "$disk_info"
    # "direct" marks an entry without a controller; it is not a smartctl
    # device type, so it is passed on as an empty controller.
    if [[ "$controller" == "direct" ]]; then
      controller=""
    fi
    check_disk "$disk" "$controller"
  done

  print_color "$BLUE" "Check completed!"
  echo ""
  print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
  print_color "$CYAN" " - Use 'storcli /c0 show all' for detailed information"
  print_color "$CYAN" " - Use 'storcli /c0/eall/sall show' for physical disk status"
  print_color "$CYAN" " - Hardware RAID controllers often limit SMART data access"
}
# Print the command-line help text to stdout.
# Globals: SCRIPT_NAME (read) - name shown in the synopsis and the examples
usage() {
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" \
    "" \
    "If no disks specified, auto-detects all available disks and RAID arrays" \
    "" \
    "Examples:" \
    " sudo $SCRIPT_NAME # Check all disks (recommended)" \
    " $SCRIPT_NAME /dev/sda # Check specific disk" \
    " $SCRIPT_NAME /dev/sg0 # Check SAS disk directly" \
    " $SCRIPT_NAME /dev/sda /dev/sdb # Check multiple disks"
}
# Entry point: only the first argument is inspected for an option.
# -h/--help prints the usage text, -v/--version prints the version line, and
# anything else (including no arguments) hands all arguments to main.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
  usage
  exit 0
elif [[ "${1:-}" == "-v" || "${1:-}" == "--version" ]]; then
  echo "$SCRIPT_NAME version $VERSION"
  exit 0
else
  main "$@"
fi