Files
disk-health/harvester-v3.8.sh
2025-10-22 12:59:32 +08:00

870 lines
31 KiB
Bash

#!/bin/bash
# Disk Health Check Script for Harvester OS
# Checks SATA HDD, SATA SSD, SAS, NVMe, RAID controllers, and soft-raid
# Supports consumer and enterprise disk classification
# Created by Adam T. Lau
# Script identity, shown by the usage and version output.
# "--" keeps a $0 that starts with a dash from being read as an option.
SCRIPT_NAME=$(basename -- "$0")
VERSION="3.8"
# Color codes
# tput fails and complains on stderr when TERM is unset or unknown (for
# example under cron). Its diagnostics are discarded; the variable is then
# simply empty and the output carries no color.
RED=$(tput setaf 1 2>/dev/null)
GREEN=$(tput setaf 2 2>/dev/null)
YELLOW=$(tput setaf 3 2>/dev/null)
BLUE=$(tput setaf 4 2>/dev/null)
CYAN=$(tput setaf 6 2>/dev/null)
MAGENTA=$(tput setaf 5 2>/dev/null)
NC=$(tput sgr0 2>/dev/null)
# Print a message wrapped in a color escape sequence followed by the reset
# sequence held in NC.
#   $1 - color escape sequence
#   $2 - message text (backslash escapes are interpreted by echo -e)
print_color() {
  local tint=$1 text=$2
  echo -e "${tint}${text}${NC}"
}
# Succeed when the given name resolves to something runnable.
#   $1 - command name to look up
# Returns the status of `command -v`; prints nothing.
command_exists() {
  command -v "$1" &>/dev/null
}
# Abort early when smartctl is missing: every disk check below shells out to it.
if ! command_exists smartctl; then
  # The color is quoted so that an empty value (tput unavailable) does not
  # shift the message into the color argument; the error goes to stderr.
  print_color "$RED" "Error: smartctl is not installed. Please install smartmontools package." >&2
  exit 1
fi
# Fallback capacity table: drive model string -> capacity in GB.
# get_disk_capacity consults it only when no size probe produced a value.
declare -A MODEL_CAPACITIES=(
  [ST91000640NS]=1000
  [ST2000NM0033]=2000
  [ST4000NM0033]=4000
  [MB1000GCWCV]=1000
  [MB2000GCWDB]=2000
  [AL15SEB120N]=1200
  [AL15SEB600N]=600
  [HUC101212CSS600]=1200
  [HUC103012CSS600]=3000
  [HUC109090CSS600]=900
  [MAX3147RC]=147
  [ST3146356SS]=146
  [ST3146855SS]=146
  [ST33000650SS]=3000
  [ST3600057SS]=600
  [ST9146803SS]=146
  [ST973451SS]=73
  [AL13SXB300N]=300
  [KPM6XRUG960G]=960
  [MZILT3T8HBLS0D3]=3840
  [MZILT960HBHQ0D3]=960
  # Add more models as encountered
)
# TBW endurance standards (using lowest numbers)
# Consumer-class SSDs: capacity tier in GB -> assumed endurance in TB written.
declare -A CONSUMER_TBW=(
  [250]=150   [500]=300   [1000]=600
  [2000]=1200 [4000]=2400 [8000]=4800
)
# Enterprise-class SSDs: capacity tier in GB -> assumed endurance in TB written.
declare -A ENTERPRISE_TBW=(
  [250]=450   [500]=900   [1000]=1800
  [2000]=3600 [4000]=7200 [8000]=14400
)
# Map a capacity to the smallest endurance tier that can hold it.
#   $1 - capacity in GB (integer)
# Prints one of 250, 500, 1000, 2000, 4000, 8000. Capacities above the largest
# tier print 8000. Empty or non-integer input is treated as 0 and prints 250.
get_capacity_tier() {
  local capacity_gb=$1
  # Declared local so the loop variable does not leak into the caller's scope.
  local tier
  # A value such as "1.5" would otherwise raise an arithmetic error on every
  # comparison and fall through to the largest tier.
  [[ "$capacity_gb" =~ ^[0-9]+$ ]] || capacity_gb=0
  for tier in 250 500 1000 2000 4000 8000; do
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    if (( 10#$capacity_gb <= tier )); then
      echo "$tier"
      return
    fi
  done
  echo "8000"
}
# Reduce a power-on-hours field to its leading integer.
#   $1 - raw field, e.g. "12345" or "12345h+30m+10.5s"
# Prints the run of digits at the start of the value (after any leading
# whitespace), or 0 when the value does not start with a digit.
extract_numeric_hours() {
  local power_on_hours=$1
  # Strip leading whitespace first; without this a padded value such as
  # "  1234" would be reported as 0.
  power_on_hours=${power_on_hours#"${power_on_hours%%[![:space:]]*}"}
  # Cut at the first non-digit (parameter expansion, no subprocess needed).
  local numeric_hours=${power_on_hours%%[!0-9]*}
  if [[ -n "$numeric_hours" ]]; then
    echo "$numeric_hours"
  else
    echo "0"
  fi
}
# Classify a disk from its `smartctl -i` output.
#   $1 - device path
#   $2 - smartctl device type for a disk behind a controller (for example
#        "megaraid,0"); empty or "direct" means the disk is addressed as is
# Prints "<disk_type>|<transport>|<is_enterprise>". disk_type is one of NVMe,
# SAS SSD, SAS HDD, SATA SSD, SATA HDD, SSD or Unknown; transport may be
# empty; is_enterprise is true or false.
get_disk_info() {
  local disk=$1
  local controller=$2
  # Build the command as an array so the device path stays a single argument.
  local -a smart_cmd=(smartctl)
  # "direct" is this script's marker for "no controller". It is not a smartctl
  # device type: passing it makes smartctl print its usage text, which lists
  # "nvme" and would get every disk classified as NVMe.
  if [[ -n "$controller" && "$controller" != "direct" ]]; then
    smart_cmd+=(-d "$controller")
  fi
  # Drop the "Local Time is:" line before any keyword scan. It carries month
  # and timezone abbreviations ("Sep", "SAST") that match the EP and SAS
  # tokens below and would make the result depend on the date or locale.
  local info
  info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null | grep -vi "^Local Time is:")
  local transport=""
  local disk_type="UNKNOWN"
  local is_enterprise=false
  local model
  model=$(grep -i "Device Model:" <<< "$info" | cut -d: -f2 | sed 's/^[ \t]*//')

  if [[ "$disk" == /dev/nvme* ]] || grep -qi "NVMe" <<< "$info"; then
    disk_type="NVMe"
    transport="NVMe"
  elif grep -qi "SAS" <<< "$info"; then
    transport="SAS"
    is_enterprise=true
    # Determine if the SAS disk is an HDD or an SSD
    if grep -qi "Solid State Device\|SSD" <<< "$info"; then
      disk_type="SAS SSD"
    elif grep -qi "Rotation Rate" <<< "$info"; then
      disk_type="SAS HDD"
    else
      # Last resort: look at the model name. The SCSI-style output parsed
      # elsewhere in this script carries it under "Product:", so fall back to
      # that label when "Device Model:" is absent.
      local sas_model=$model
      if [[ -z "$sas_model" ]]; then
        sas_model=$(grep -i "Product:" <<< "$info" | cut -d: -f2 | sed 's/^[ \t]*//')
      fi
      if grep -qi "SSD\|Solid State" <<< "$sas_model"; then
        disk_type="SAS SSD"
      else
        disk_type="SAS HDD"
      fi
    fi
  elif grep -qi "Solid State Device\|SSD" <<< "$info"; then
    disk_type="SATA SSD"
    transport="SATA"
  elif grep -qi "Rotation Rate" <<< "$info"; then
    disk_type="SATA HDD"
    transport="SATA"
  fi

  # Check for enterprise features anywhere in the identity output.
  # NOTE(review): these are case-insensitive substring matches, so short
  # tokens such as "EP" can still hit unrelated text (e.g. inside a model
  # number) - confirm whether tighter matching is wanted.
  if grep -qi "ENTERPRISE\|EP\|SAS\|Xeon\|Xeons\|DualPort\|PowerLoss\|PLP" <<< "$info"; then
    is_enterprise=true
  fi

  # Check device class and type by model name
  if [[ -n "$model" ]]; then
    # Remove the "WDC " vendor prefix first; otherwise the "DC" token matches
    # the prefix itself and flags every model carrying it as enterprise.
    local model_tokens=${model#WDC }
    if grep -qi "PRO\|EP\|DC\|ENT\|ENTERPRISE" <<< "$model_tokens"; then
      is_enterprise=true
    fi
    if [[ "$disk_type" == "UNKNOWN" ]] && grep -qi "SSD\|Solid State" <<< "$model"; then
      disk_type="SSD"
      [[ "$transport" == "" ]] && transport="SATA"
    fi
  fi

  if [[ "$disk_type" == "UNKNOWN" ]]; then
    disk_type="Unknown"
  fi
  echo "$disk_type|$transport|$is_enterprise"
}
# Collect health attributes of a SAS disk from smartctl.
#   $1 - device path
#   $2 - smartctl device type for a disk behind a controller; empty or
#        "direct" means the disk is addressed as is
#   $3 - disk type, "SAS HDD" or "SAS SSD" (selects which counters are read)
# Prints one line with 13 '|'-separated fields:
#   power_on_hours|reallocated_sectors|pending_sectors|start_stop_count|
#   load_cycle_count|temperature|model|serial|firmware|total_written|
#   media_wearout|percent_lifetime_used|has_write_data
# Fields that could not be read are empty; pending_sectors is never filled in
# here; has_write_data is true or false.
get_sas_attributes() {
  local disk=$1
  local controller=$2
  local disk_type=$3
  # Array form keeps the device path a single argument.
  local -a smart_cmd=(smartctl)
  # "direct" is this script's marker for "no controller", not a smartctl
  # device type; passing it would make every query fail.
  if [[ -n "$controller" && "$controller" != "direct" ]]; then
    smart_cmd+=(-d "$controller")
  fi
  local power_on_hours=""
  local reallocated_sectors=""
  local pending_sectors=""
  local start_stop_count=""
  local load_cycle_count=""
  local total_written=""
  local temperature=""
  local model=""
  local serial=""
  local firmware=""
  local media_wearout=""
  local percent_lifetime_used=""
  local has_write_data=false

  # Try extended information first for SAS disks, then the standard report.
  local attributes
  attributes=$("${smart_cmd[@]}" -x "$disk" 2>/dev/null)
  if [[ -z "$attributes" ]]; then
    attributes=$("${smart_cmd[@]}" -a "$disk" 2>/dev/null)
  fi

  if [[ -n "$attributes" ]]; then
    # Identity fields. head -1 keeps each field on a single line so the
    # '|'-delimited record below cannot be broken by a second match.
    model=$(grep -i "Product:" <<< "$attributes" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')
    if [[ -z "$model" ]]; then
      model=$(grep -i "Device Model:" <<< "$attributes" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')
    fi
    serial=$(grep -i "Serial number:" <<< "$attributes" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')
    firmware=$(grep -i "Revision:" <<< "$attributes" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')

    # Power-on hours: "hours:minutes" form first, attribute-table form second.
    power_on_hours=$(grep -i "Accumulated power on time" <<< "$attributes" | grep -oE "[0-9]+:[0-9]+" | head -1)
    if [[ -n "$power_on_hours" ]]; then
      power_on_hours=${power_on_hours%%:*}
    else
      power_on_hours=$(grep -i "Power_On_Hours" <<< "$attributes" | awk '{print $NF}' | head -1)
    fi

    # Temperature
    temperature=$(grep -i "Current Drive Temperature" <<< "$attributes" | grep -oE "[0-9]+" | head -1)
    if [[ -z "$temperature" ]]; then
      temperature=$(grep -i "Temperature_Celsius" <<< "$attributes" | awk '{print $10}' | head -1)
    fi

    # Mechanical counters only make sense for spinning disks.
    if [[ "$disk_type" == "SAS HDD" ]]; then
      start_stop_count=$(grep -i "Accumulated start-stop cycles" <<< "$attributes" | grep -oE "[0-9]+" | head -1)
      load_cycle_count=$(grep -i "Accumulated load-unload cycles" <<< "$attributes" | grep -oE "[0-9]+" | head -1)
    fi

    # The grown defect list size is reported as the reallocated sector count.
    local error_count
    error_count=$(grep -i "Elements in grown defect list" <<< "$attributes" | grep -oE "[0-9]+" | head -1)
    if [[ -n "$error_count" ]]; then
      reallocated_sectors="$error_count"
    fi

    # For SAS SSDs, look for write volume and wear indicators.
    if [[ "$disk_type" == "SAS SSD" ]]; then
      # Try the known attribute names in order until one yields a value.
      local write_attr
      for write_attr in "Total_LBAs_Written" "Host_Writes_32MiB" "Lifetime_Writes" "NAND_Writes"; do
        total_written=$(grep -i "$write_attr" <<< "$attributes" | awk '{print $NF}' | head -1)
        [[ -n "$total_written" ]] && break
      done
      # Only a non-zero value counts as real write data.
      if [[ -n "$total_written" && "$total_written" != "0" ]]; then
        has_write_data=true
      fi
      media_wearout=$(grep -i "Media_Wearout_Indicator" <<< "$attributes" | awk '{print $NF}' | head -1)
      percent_lifetime_used=$(grep -i "Percent_Lifetime_Used" <<< "$attributes" | awk '{print $NF}' | head -1)
      if [[ -z "$percent_lifetime_used" ]]; then
        percent_lifetime_used=$(grep -i "Wear_Leveling_Count" <<< "$attributes" | awk '{print $NF}' | head -1)
      fi
    fi
  fi
  echo "$power_on_hours|$reallocated_sectors|$pending_sectors|$start_stop_count|$load_cycle_count|$temperature|$model|$serial|$firmware|$total_written|$media_wearout|$percent_lifetime_used|$has_write_data"
}
# Convert a raw write counter into whole terabytes written (decimal TB).
#   $1 - disk type (accepted for interface compatibility; the conversion does
#        not depend on it)
#   $2 - raw counter in units of 32 MiB (as in Host_Writes_32MiB); used only
#        when $3 is empty or zero
#   $3 - number of 512-byte sectors written; takes precedence over $2
# Prints the integer TB written, 0 when neither counter is usable.
calculate_tbw() {
  local disk_type=$1
  local raw_value=$2
  local sectors=$3
  local tbw=0
  # Anything that is not a plain integer would raise an arithmetic error
  # below, so treat it as "not provided".
  [[ "$sectors" =~ ^[0-9]+$ ]] || sectors=""
  [[ "$raw_value" =~ ^[0-9]+$ ]] || raw_value=""
  # 10# forces base 10 so values with leading zeros are not read as octal.
  if [[ -n "$sectors" ]] && (( 10#$sectors != 0 )); then
    # Calculate from sectors (most common for SATA SSDs)
    tbw=$(( 10#$sectors * 512 / 1000000000000 ))
  elif [[ -n "$raw_value" ]] && (( 10#$raw_value != 0 )); then
    # One unit is 32 MiB = 33554432 bytes. Treating it as 32 MB would
    # under-report the volume by about 5%.
    tbw=$(( 10#$raw_value * 33554432 / 1000000000000 ))
  fi
  echo "$tbw"
}
# Look up the assumed TBW endurance for a drive.
#   $1 - capacity in GB
#   $2 - "true" for enterprise-class drives; anything else means consumer
#   $3 - disk type
#   $4 - "false" when no write statistics were found
# Prints "N/A" for HDD types, "UNKNOWN" for a SAS SSD without write data, and
# otherwise the table value for the drive's capacity tier.
get_estimated_endurance() {
  local capacity_gb=$1
  local is_enterprise=$2
  local disk_type=$3
  local has_write_data=$4
  case "$disk_type" in
    "SATA HDD" | "SAS HDD" | "HDD")
      # HDDs don't have TBW
      echo "N/A"
      return
      ;;
    "SAS SSD")
      # Without write data an estimate would be unrealistic.
      if [[ "$has_write_data" == "false" ]]; then
        echo "UNKNOWN"
        return
      fi
      ;;
  esac
  local tier
  tier=$(get_capacity_tier "$capacity_gb")
  if [[ "$is_enterprise" != "true" ]]; then
    echo "${CONSUMER_TBW[$tier]}"
  else
    echo "${ENTERPRISE_TBW[$tier]}"
  fi
}
# Estimate remaining SSD life from the TB written and the endurance rating.
#   $1 - power-on hours (empty or 0 is reported as a new drive)
#   $2 - TB written so far; a decimal value is truncated to its integer part
#   $3 - endurance rating in TB, or "N/A" for drives without one
#   $4 - disk type
#   $5 - percent lifetime used (accepted for interface compatibility; not
#        used in the estimate)
#   $6 - "false" when no write statistics were found
# Prints "<remaining percent>|<remaining TB>|<status text>". The first two
# fields are wrapped in color escapes when a wear level could be computed.
# Both are clamped at 0 once the written volume exceeds the rating.
estimate_ssd_lifespan() {
  local power_on_hours=$1
  local tbw_used=$2
  local estimated_endurance=$3
  local disk_type=$4
  local percent_lifetime_used=$5
  local has_write_data=$6
  # For SAS SSDs without write data, be honest about limitations
  if [[ "$disk_type" == "SAS SSD" && "$has_write_data" == "false" ]]; then
    echo "N/A|N/A|Cannot determine - SAS SSD does not expose write statistics"
    return
  fi
  if [[ -z "$power_on_hours" || "$power_on_hours" -eq 0 ]]; then
    echo "Unknown|Unknown|New drive"
    return
  fi
  if [[ "$estimated_endurance" == "N/A" ]]; then
    echo "N/A|N/A|HDD - no endurance rating"
    return
  fi
  # The rating is the divisor below: refuse anything that is not a positive
  # integer instead of failing with an arithmetic error.
  if [[ ! "$estimated_endurance" =~ ^[0-9]+$ ]] || (( 10#$estimated_endurance == 0 )); then
    echo "Unknown|Unknown|No endurance rating available"
    return
  fi
  estimated_endurance=$(( 10#$estimated_endurance ))
  # Keep only the integer part of the written volume: shell arithmetic cannot
  # handle "12.5", and unit suffixes or separators must go as well.
  local clean_tbw_used=${tbw_used%%.*}
  clean_tbw_used=${clean_tbw_used//[!0-9]/}
  if [[ -z "$clean_tbw_used" ]]; then
    clean_tbw_used=0
  fi
  clean_tbw_used=$(( 10#$clean_tbw_used ))
  local tbw_remaining=$(( estimated_endurance - clean_tbw_used ))
  # A drive written past its rating has nothing left; never show negatives.
  if (( tbw_remaining < 0 )); then
    tbw_remaining=0
  fi
  if (( clean_tbw_used > 0 )); then
    local lifespan_used=$(( clean_tbw_used * 100 / estimated_endurance ))
    if (( lifespan_used > 100 )); then
      lifespan_used=100
    fi
    local lifespan_remaining=$(( 100 - lifespan_used ))
    if (( lifespan_used >= 80 )); then
      echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|High wear"
    elif (( lifespan_used >= 50 )); then
      echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|Moderate wear"
    else
      echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|Healthy"
    fi
  else
    echo "Unknown|${estimated_endurance} TB|New"
  fi
}
# Rate the expected remaining life of a hard disk with a simple risk score.
#   $1 - power-on hours      $2 - reallocated sectors   $3 - pending sectors
#   $4 - start/stop count    $5 - load cycle count      $6 - disk type
#   $7 - temperature (may be empty)
# Prints "Unknown" when no power-on hours are available, a CRITICAL line when
# pending sectors exist, otherwise a colored lifespan band with a short reason.
estimate_hdd_lifespan() {
  local power_on_hours=$1 reallocated_sectors=$2 pending_sectors=$3
  local start_stop_count=$4 load_cycle_count=$5
  local disk_type=$6 temperature=$7
  local hours
  hours=$(extract_numeric_hours "$power_on_hours")
  if [[ -z "$hours" || "$hours" -eq 0 ]]; then
    echo "Unknown"
    return
  fi

  # Pending sectors are reported immediately, without scoring anything else.
  if [[ "$pending_sectors" -gt 0 ]]; then
    echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)"
    return
  fi

  local risk=0

  # Reallocated sectors
  if [[ "$reallocated_sectors" -gt 100 ]]; then
    risk=$((risk + 3))
  elif [[ "$reallocated_sectors" -gt 10 ]]; then
    risk=$((risk + 2))
  elif [[ "$reallocated_sectors" -gt 0 ]]; then
    risk=$((risk + 1))
  fi

  # Temperature, only when a reading is present
  if [[ -n "$temperature" ]]; then
    if [[ "$temperature" -gt 50 ]]; then
      risk=$((risk + 2))
    elif [[ "$temperature" -gt 40 ]]; then
      risk=$((risk + 1))
    fi
  fi

  # Age in power-on hours
  if [[ "$hours" -gt 50000 ]]; then
    risk=$((risk + 3))
  elif [[ "$hours" -gt 30000 ]]; then
    risk=$((risk + 2))
  elif [[ "$hours" -gt 15000 ]]; then
    risk=$((risk + 1))
  fi

  # Mechanical wear counters apply to the two HDD types only
  case "$disk_type" in
    "SATA HDD" | "SAS HDD")
      if [[ "$start_stop_count" -gt 50000 ]]; then
        risk=$((risk + 2))
      elif [[ "$start_stop_count" -gt 20000 ]]; then
        risk=$((risk + 1))
      fi
      if [[ "$load_cycle_count" -gt 500000 ]]; then
        risk=$((risk + 2))
      elif [[ "$load_cycle_count" -gt 200000 ]]; then
        risk=$((risk + 1))
      fi
      ;;
  esac

  # Translate the score into a lifespan band
  if [[ $risk -lt 1 ]]; then
    echo "${GREEN}> 3 years${NC} (Healthy)"
  elif [[ $risk -lt 3 ]]; then
    echo "${YELLOW}1-3 years${NC} (Light wear)"
  elif [[ $risk -lt 5 ]]; then
    echo "${YELLOW}6-18 months${NC} (Moderate wear)"
  else
    echo "${RED}< 6 months${NC} (Multiple risk factors)"
  fi
}
# Report Linux software RAID (md) arrays.
# Reads array names from /proc/mdstat and, for each one that exists as a block
# device, prints a heading plus the RAID level, state and active device count
# taken from `mdadm --detail` when mdadm is available. Takes no arguments and
# prints nothing when no array is found.
check_mdraid() {
  local md_devices=()
  # Loop variables are local so they do not leak into the caller's scope.
  local line md md_info
  if [[ -f /proc/mdstat ]]; then
    while IFS= read -r line; do
      # Array lines start with the device name, e.g. "md0 : active raid1 ..."
      if [[ $line =~ ^md[0-9]+ ]]; then
        md_devices+=("/dev/${line%% *}")
      fi
    done < /proc/mdstat
  fi
  for md in "${md_devices[@]}"; do
    [[ -b "$md" ]] || continue
    # The color is quoted so an empty value does not shift the message into
    # the color argument.
    print_color "$MAGENTA" "Found software RAID: $md"
    command_exists mdadm || continue
    md_info=$(mdadm --detail "$md" 2>/dev/null)
    [[ -n "$md_info" ]] || continue
    echo "RAID Level: $(grep "Raid Level" <<< "$md_info" | cut -d: -f2 | sed 's/^[ \t]*//')"
    echo "State: $(grep "State" <<< "$md_info" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')"
    echo "Devices: $(grep "Active Devices" <<< "$md_info" | cut -d: -f2 | sed 's/^[ \t]*//')"
    echo ""
  done
}
# Determine the capacity of a disk.
#   $1 - device path
#   $2 - smartctl device type for a disk behind a controller; empty or
#        "direct" means the disk is addressed as is
#   $3 - disk type (accepted for interface compatibility; not used)
# Prints "<capacity_gb>|<human readable>", with capacity in decimal GB, or
# "0|Unknown" when no method produced a value.
# Sources are tried in this order until one yields a value:
#   0. smartctl "User Capacity:" (controller-attached disks only)
#   1. lsblk   2. fdisk   3. blockdev   4. MODEL_CAPACITIES lookup by model
get_disk_capacity() {
  local disk=$1
  local controller=$2
  local disk_type=$3
  local capacity_gb=0
  local capacity_human="Unknown"
  local -a smart_cmd=(smartctl)
  local behind_controller=false
  # "direct" is this script's marker for "no controller", not a smartctl
  # device type.
  if [[ -n "$controller" && "$controller" != "direct" ]]; then
    smart_cmd+=(-d "$controller")
    behind_controller=true
  fi

  # Method 0: for a disk addressed through a controller the block device in
  # $1 is the controller's volume, so its size is not the disk's size. Ask
  # the disk itself first.
  if [[ "$behind_controller" == true ]]; then
    local user_bytes
    user_bytes=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null \
      | grep -i "User Capacity:" | head -1 | cut -d: -f2 \
      | sed 's/bytes.*$//' | tr -cd '0-9')
    if [[ -n "$user_bytes" ]]; then
      capacity_gb=$(( 10#$user_bytes / 1000000000 ))
    fi
  fi

  # Method 1: lsblk. -d restricts the output to the disk itself; without it
  # every partition adds a line and the integer check below would fail.
  if [[ $capacity_gb -eq 0 ]] && command_exists lsblk; then
    local lsblk_output
    lsblk_output=$(lsblk -b -d -n -o SIZE "$disk" 2>/dev/null | head -1 | tr -d '[:space:]')
    if [[ "$lsblk_output" =~ ^[0-9]+$ ]]; then
      capacity_gb=$(( 10#$lsblk_output / 1000000000 ))
    fi
  fi

  # Method 2: fdisk. Only the exact byte count on the "Disk <dev>:" line is
  # used. The pattern lives in a variable because a backslash escape such as
  # \s written directly in [[ =~ ]] is taken as a literal character.
  if [[ $capacity_gb -eq 0 ]] && command_exists fdisk; then
    local fdisk_info
    fdisk_info=$(fdisk -l "$disk" 2>/dev/null | grep "Disk $disk")
    local bytes_re='([0-9]+)[[:space:]]+bytes'
    if [[ "$fdisk_info" =~ $bytes_re ]]; then
      capacity_gb=$(( 10#${BASH_REMATCH[1]} / 1000000000 ))
    fi
  fi

  # Method 3: blockdev
  if [[ $capacity_gb -eq 0 ]] && command_exists blockdev; then
    local blockdev_size
    blockdev_size=$(blockdev --getsize64 "$disk" 2>/dev/null)
    if [[ "$blockdev_size" =~ ^[0-9]+$ ]]; then
      capacity_gb=$(( 10#$blockdev_size / 1000000000 ))
    fi
  fi

  # Method 4: model-based lookup in the known-capacity table
  if [[ $capacity_gb -eq 0 ]]; then
    local smart_info model
    smart_info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)
    model=$(grep -i "Device Model:" <<< "$smart_info" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')
    if [[ -z "$model" ]]; then
      model=$(grep -i "Product:" <<< "$smart_info" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')
    fi
    if [[ -n "$model" && -n "${MODEL_CAPACITIES[$model]}" ]]; then
      capacity_gb="${MODEL_CAPACITIES[$model]}"
    fi
  fi

  # Human readable form. TB values keep one decimal when it is non-zero, so a
  # 1920 GB disk reads "1.9 TB" rather than "1 TB".
  if [[ $capacity_gb -gt 0 ]]; then
    if [[ $capacity_gb -ge 1000 ]]; then
      local tb_whole=$(( capacity_gb / 1000 ))
      local tb_tenth=$(( (capacity_gb % 1000) / 100 ))
      if [[ $tb_tenth -gt 0 ]]; then
        capacity_human="${tb_whole}.${tb_tenth} TB"
      else
        capacity_human="${tb_whole} TB"
      fi
    else
      capacity_human="${capacity_gb} GB"
    fi
  else
    capacity_human="Unknown"
  fi
  echo "$capacity_gb|$capacity_human"
}
# Helper for check_disk: print the value (text after the first colon, leading
# blanks removed) of the first line of $1 that contains the label $2,
# compared case-insensitively.
_check_disk_field() {
  grep -i -- "$2" <<< "$1" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//'
}

# Helper for check_disk: print the 10th whitespace-separated column of the
# first line of $1 that contains $2 (the raw-value column read from the
# `smartctl -A` attribute table).
_check_disk_attr_raw() {
  grep -- "$2" <<< "$1" | head -1 | awk '{print $10}'
}

# Print a health report for one disk.
#   $1 - device path
#   $2 - smartctl device type for a disk behind a controller (for example
#        "megaraid,0"); empty or "direct" means the disk is addressed as is
# Writes the report to stdout; returns early with a warning when smartctl
# cannot read the disk.
check_disk() {
  local disk=$1
  local controller=$2
  # "direct" is this script's marker for "no controller". It is not a smartctl
  # device type, so it must never reach `smartctl -d`.
  [[ "$controller" == "direct" ]] && controller=""
  # Array form keeps the device path a single argument.
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")
  print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
  echo "=================================================="

  # Get disk classification
  local disk_info disk_type transport is_enterprise
  disk_info=$(get_disk_info "$disk" "$controller")
  disk_type=$(cut -d'|' -f1 <<< "$disk_info")
  transport=$(cut -d'|' -f2 <<< "$disk_info")
  is_enterprise=$(cut -d'|' -f3 <<< "$disk_info")

  # Get basic disk information. Declaration and assignment are separate
  # because `local x=$(cmd)` always reports success and hides the exit status.
  local info info_rc health
  info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)
  info_rc=$?
  health=$("${smart_cmd[@]}" -H "$disk" 2>/dev/null)
  # Bits 0 and 1 of smartctl's exit status mean "command line did not parse"
  # and "device open failed"; the higher bits describe disk health and must
  # not prevent the report.
  if (( info_rc & 3 )) || [[ -z "$info" ]]; then
    print_color "$YELLOW" "Cannot read disk information. It may be offline, unsupported, or need controller specification."
    echo ""
    return
  fi

  # Extract identity fields (ATA, SCSI and NVMe use different labels)
  local model serial firmware
  model=$(_check_disk_field "$info" "Device Model:")
  [[ -z "$model" ]] && model=$(_check_disk_field "$info" "Product:")
  [[ -z "$model" ]] && model=$(_check_disk_field "$info" "Model Number:")
  serial=$(_check_disk_field "$info" "Serial Number:")
  firmware=$(_check_disk_field "$info" "Firmware Version:")
  [[ -z "$firmware" ]] && firmware=$(_check_disk_field "$info" "Revision:")

  # Get capacity
  local capacity_info capacity_gb capacity_human
  capacity_info=$(get_disk_capacity "$disk" "$controller" "$disk_type")
  capacity_gb=$(cut -d'|' -f1 <<< "$capacity_info")
  capacity_human=$(cut -d'|' -f2 <<< "$capacity_info")

  # Overall health verdict, in whichever wording smartctl used
  local health_status
  health_status=$(grep "result:" <<< "$health" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')
  [[ -z "$health_status" ]] && health_status=$(grep "SMART overall-health" <<< "$health" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')
  [[ -z "$health_status" ]] && health_status=$(grep "SMART Health Status" <<< "$health" | head -1 | cut -d: -f2 | sed 's/^[ \t]*//')

  # Extract SMART attributes based on disk type
  local power_on_hours=""
  local reallocated_sectors=""
  local pending_sectors=""
  local start_stop_count=""
  local load_cycle_count=""
  local total_written=""
  local host_writes_32mib=""
  local nvme_data_units=""
  local temperature=""
  local sas_model=""
  local sas_serial=""
  local sas_firmware=""
  local sas_total_written=""
  local media_wearout=""
  local percent_lifetime_used=""
  local has_write_data="false"
  if [[ "$disk_type" == "SAS HDD" || "$disk_type" == "SAS SSD" ]]; then
    local sas_attrs
    sas_attrs=$(get_sas_attributes "$disk" "$controller" "$disk_type")
    # 13 '|'-separated fields; IFS holds no whitespace, so empty fields and
    # embedded blanks are preserved.
    IFS='|' read -r power_on_hours reallocated_sectors pending_sectors \
      start_stop_count load_cycle_count temperature sas_model sas_serial \
      sas_firmware sas_total_written media_wearout percent_lifetime_used \
      has_write_data <<< "$sas_attrs"
    # Use SAS-extracted data if available
    [[ -n "$sas_model" ]] && model="$sas_model"
    [[ -n "$sas_serial" ]] && serial="$sas_serial"
    [[ -n "$sas_firmware" ]] && firmware="$sas_firmware"
    [[ -n "$sas_total_written" ]] && total_written="$sas_total_written"
  else
    local attributes
    attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
    if [[ -n "$attributes" ]]; then
      if [[ "$disk_type" == "NVMe" ]]; then
        # NVMe drives report a health log ("Label:   value") instead of an
        # attribute table; numbers may carry thousands separators.
        power_on_hours=$(_check_disk_field "$attributes" "Power On Hours:" | tr -cd '0-9')
        temperature=$(grep -E "^Temperature:" <<< "$attributes" | head -1 | awk '{print $2}')
        nvme_data_units=$(_check_disk_field "$attributes" "Data Units Written:" | sed 's/\[.*$//' | tr -cd '0-9')
      else
        power_on_hours=$(_check_disk_attr_raw "$attributes" "Power_On_Hours")
        reallocated_sectors=$(_check_disk_attr_raw "$attributes" "Reallocated_Sector_Ct")
        pending_sectors=$(_check_disk_attr_raw "$attributes" "Current_Pending_Sector")
        start_stop_count=$(_check_disk_attr_raw "$attributes" "Start_Stop_Count")
        load_cycle_count=$(_check_disk_attr_raw "$attributes" "Load_Cycle_Count")
        # The two write counters use different units and are kept apart: a
        # 32 MiB count must not be fed into the 512-byte sector conversion.
        total_written=$(_check_disk_attr_raw "$attributes" "Total_LBAs_Written")
        host_writes_32mib=$(_check_disk_attr_raw "$attributes" "Host_Writes_32MiB")
        temperature=$(_check_disk_attr_raw "$attributes" "Temperature_Celsius")
      fi
      # For non-SAS SSDs, write data is available if any counter was found
      if [[ "$disk_type" == "SATA SSD" || "$disk_type" == "NVMe" ]]; then
        if [[ -n "$total_written" || -n "$host_writes_32mib" || -n "$nvme_data_units" ]]; then
          has_write_data="true"
        fi
      fi
    fi
  fi

  # Clean up extracted values
  power_on_hours=$(extract_numeric_hours "$power_on_hours")
  reallocated_sectors=${reallocated_sectors:-0}
  pending_sectors=${pending_sectors:-0}
  start_stop_count=${start_stop_count:-0}
  load_cycle_count=${load_cycle_count:-0}

  # Display basic information
  local disk_class="Consumer"
  # Compare the flag instead of executing its value as a command.
  [[ "$is_enterprise" == "true" ]] && disk_class="Enterprise"
  echo "Model: ${model:-Unknown}"
  echo "Serial: ${serial:-Unknown}"
  echo "Type: $disk_type"
  echo "Interface: $transport"
  echo "Class: $disk_class"
  echo "Capacity: $capacity_human"
  echo "Firmware: ${firmware:-Unknown}"
  echo "Health: ${health_status:-Unknown}"
  # Show temperature if available
  if [[ -n "$temperature" && "$temperature" != "0" ]]; then
    echo "Temperature: ${temperature} C"
  fi
  # Only show Power On Hours if we have a valid value
  if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
    echo "Power On Hours: $power_on_hours"
  else
    echo "Power On Hours: Unknown"
  fi
  # Show wear indicators for SSDs if available
  if [[ "$disk_type" == "SAS SSD" || "$disk_type" == "SATA SSD" ]]; then
    if [[ -n "$media_wearout" && "$media_wearout" != "0" ]]; then
      echo "Media Wearout: $media_wearout"
    fi
    if [[ -n "$percent_lifetime_used" && "$percent_lifetime_used" != "0" ]]; then
      echo "Lifetime Used: ${percent_lifetime_used}%"
    fi
  fi

  # Disk type specific analysis
  if [[ "$disk_type" == "SATA HDD" || "$disk_type" == "SAS HDD" ]]; then
    echo "Realloc Sectors: $reallocated_sectors"
    echo "Pending Sectors: $pending_sectors"
    # Only show mechanical counters if we have values
    if [[ -n "$start_stop_count" && "$start_stop_count" != "0" ]]; then
      echo "Start/Stop Count: $start_stop_count"
    fi
    if [[ -n "$load_cycle_count" && "$load_cycle_count" != "0" ]]; then
      echo "Load Cycle Count: $load_cycle_count"
    fi
    local lifespan
    lifespan=$(estimate_hdd_lifespan "$power_on_hours" "$reallocated_sectors" "$pending_sectors" "$start_stop_count" "$load_cycle_count" "$disk_type" "$temperature")
    echo "Lifespan: $lifespan"
  elif [[ "$disk_type" == "SATA SSD" || "$disk_type" == "SAS SSD" || "$disk_type" == "NVMe" ]]; then
    local tbw_used=0
    if [[ -n "$nvme_data_units" && "$nvme_data_units" != "0" ]]; then
      # One NVMe data unit is 1000 blocks of 512 bytes = 512000 bytes.
      tbw_used=$(( 10#$nvme_data_units * 512000 / 1000000000000 ))
    elif [[ -n "$total_written" && "$total_written" != "0" ]]; then
      tbw_used=$(calculate_tbw "$disk_type" "" "$total_written")
    elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
      tbw_used=$(calculate_tbw "$disk_type" "$host_writes_32mib" "")
    fi
    local estimated_endurance
    estimated_endurance=$(get_estimated_endurance "$capacity_gb" "$is_enterprise" "$disk_type" "$has_write_data")
    # Handle SAS SSDs without write data specially
    if [[ "$disk_type" == "SAS SSD" && "$estimated_endurance" == "UNKNOWN" ]]; then
      echo "TBW Used: Not available"
      echo "TBW Endurance: Not available (SAS SSD does not expose write statistics)"
      # Claim "Healthy" only when the SMART verdict actually says so.
      if [[ "$health_status" == OK* || "$health_status" == PASSED* ]]; then
        echo "Lifespan: ${GREEN}Healthy${NC} (based on SMART health status)"
      else
        echo "Lifespan: ${YELLOW}Unknown${NC} (SMART health status: ${health_status:-Unknown})"
      fi
    else
      if [[ "$estimated_endurance" != "N/A" ]]; then
        echo "TBW Used: ${tbw_used} TB"
        echo "TBW Endurance: ${estimated_endurance} TB (Minimum guaranteed - actual may be higher)"
        local lifespan_info lifespan_percent tbw_remaining wear_status
        lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$estimated_endurance" "$disk_type" "$percent_lifetime_used" "$has_write_data")
        lifespan_percent=$(cut -d'|' -f1 <<< "$lifespan_info")
        tbw_remaining=$(cut -d'|' -f2 <<< "$lifespan_info")
        wear_status=$(cut -d'|' -f3 <<< "$lifespan_info")
        echo "TBW Remaining: $tbw_remaining"
        echo "Lifespan: $lifespan_percent ($wear_status)"
      else
        echo "TBW Used: ${tbw_used} TB"
        echo "Lifespan: Unknown (Cannot estimate without usage data)"
      fi
    fi
    if [[ "$disk_type" == "SAS SSD" ]]; then
      echo "Realloc Sectors: $reallocated_sectors"
      echo "Pending Sectors: $pending_sectors"
    fi
  else
    print_color "$YELLOW" "Unknown disk type - limited information available"
    echo "Realloc Sectors: $reallocated_sectors"
    echo "Pending Sectors: $pending_sectors"
  fi
  echo ""
}
# List the disks to check.
# Prints a single space-separated line of "<device>:<controller>" entries:
# first every existing /dev/sdX, /dev/sdXX and /dev/nvmeNnN block device with
# the controller marker "direct", then one entry per controller slot that
# answers a smartctl identity query.
detect_raid_disks() {
  local controllers=("megaraid" "cciss" "areca" "3ware" "hpt" "aacraid" "auto")
  local disks=()
  # Loop variables are local so they do not leak into the caller's scope.
  local disk controller i base_disk
  # Check for direct disks first. A glob without matches stays literal and is
  # filtered out by the block-device test. The last pattern covers two-digit
  # NVMe controller numbers.
  for disk in /dev/sd[a-z] /dev/sd[a-z][a-z] /dev/nvme[0-9]n[0-9] /dev/nvme[0-9][0-9]n[0-9]; do
    if [[ -b "$disk" ]]; then
      disks+=("$disk:direct")
    fi
  done
  # Probe controller slots 0-31 for every known controller type; the first
  # base device that answers for a slot wins.
  for controller in "${controllers[@]}"; do
    for (( i = 0; i <= 31; i++ )); do
      for base_disk in "/dev/sda" "/dev/sg$i" "/dev/sr$i"; do
        if smartctl -d "$controller,$i" -i "$base_disk" >/dev/null 2>&1; then
          disks+=("$base_disk:$controller,$i")
          break
        fi
      done
    done
  done
  echo "${disks[@]}"
}
# Main entry point.
#   $@ - optional device paths; without any, disks are auto-detected
# Prints the banner, reports software RAID arrays, runs check_disk for every
# selected disk and ends with general notes. Exits with status 1 when no disk
# can be checked.
main() {
  print_color "$BLUE" "Disk Health Check Script v$VERSION for Harvester OS"
  print_color "$BLUE" "Created by Adam T. Lau"
  print_color "$BLUE" "===================================================="
  echo ""
  local disks=()
  # Loop variables are local so they do not leak into the global scope.
  local disk disk_info controller
  # Check for soft-raid first
  check_mdraid
  if [[ $# -gt 0 ]]; then
    # Specific disks were requested: check only those.
    for disk in "$@"; do
      if [[ -b "$disk" ]]; then
        disks+=("$disk:direct")
      else
        print_color "$RED" "Error: $disk is not a valid block device" >&2
      fi
    done
  else
    # Auto-detect disks
    print_color "$CYAN" "Auto-detecting disks..."
    read -ra disks <<< "$(detect_raid_disks)"
    # The fallback probing belongs to auto-detection only. When the user
    # named disks and none was valid, silently checking other disks instead
    # would be surprising.
    if [[ ${#disks[@]} -eq 0 ]]; then
      print_color "$YELLOW" "No disks found via auto-detection"
      print_color "$CYAN" "Trying direct disk access..."
      for disk in /dev/sda /dev/sdb /dev/sdc /dev/nvme0n1; do
        if [[ -b "$disk" ]]; then
          disks+=("$disk:direct")
        fi
      done
    fi
  fi
  if [[ ${#disks[@]} -eq 0 ]]; then
    print_color "$RED" "No disks found or accessible"
    echo "Try running as root or specifying disk paths manually"
    exit 1
  fi
  print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
  echo ""
  # Check each disk
  for disk_info in "${disks[@]}"; do
    IFS=':' read -r disk controller <<< "$disk_info"
    # "direct" only marks a disk without a controller. It is not a smartctl
    # device type, so hand check_disk an empty controller instead.
    [[ "$controller" == "direct" ]] && controller=""
    check_disk "$disk" "$controller"
  done
  print_color "$BLUE" "Check completed!"
  echo ""
  print_color "$YELLOW" "Note: SAS SSDs often do not expose write statistics through SMART."
  print_color "$YELLOW" " TBW information may not be available for these drives."
  print_color "$YELLOW" " SSD/NVMe TBW endurance may be higher depending on the specific model."
}
# Print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" \
    "" \
    "If no disks specified, auto-detects all available disks and RAID arrays" \
    "" \
    "Examples:" \
    " $SCRIPT_NAME # Check all auto-detected disks" \
    " $SCRIPT_NAME /dev/sda # Check specific disk" \
    " $SCRIPT_NAME /dev/nvme0n1 # Check specific NVMe disk" \
    " $SCRIPT_NAME /dev/sda /dev/nvme0n1 # Check multiple disks" \
    "" \
    "Supported: SATA HDD/SSD, SAS HDD/SSD, NVMe, Hardware RAID, Software RAID" \
    "Created by Adam T. Lau"
}
# Entry point: answer the informational flags directly and hand every other
# invocation, including one without arguments, to main.
first_arg="${1:-}"
if [[ "$first_arg" == "-h" || "$first_arg" == "--help" ]]; then
  usage
  exit 0
elif [[ "$first_arg" == "-v" || "$first_arg" == "--version" ]]; then
  echo "$SCRIPT_NAME version $VERSION"
  echo "Created by Adam T. Lau"
  exit 0
else
  main "$@"
fi