Files
disk-health/ubuntu-v2.8.sh
2025-10-22 03:28:05 +08:00

495 lines
17 KiB
Bash
Executable File

#!/bin/bash
# Disk Health Check Script for Ubuntu 24.04
# Enhanced with SAS/PERC H730P controller support
# Checks SSD TBW/lifespan and HDD health status
# Script identity, used by usage() and --version.
# ${0##*/} strips the directory part without forking basename.
SCRIPT_NAME=${0##*/}
VERSION="2.8"
# Color codes.
# tput fails and complains on stderr when TERM is unset or unknown (for
# example under cron or a systemd unit). Its diagnostics are discarded on
# purpose: in that case the variables stay empty and output is plain text.
RED=$(tput setaf 1 2>/dev/null)
GREEN=$(tput setaf 2 2>/dev/null)
YELLOW=$(tput setaf 3 2>/dev/null)
BLUE=$(tput setaf 4 2>/dev/null)
CYAN=$(tput setaf 6 2>/dev/null)
NC=$(tput sgr0 2>/dev/null)
# Print one line of text wrapped in a terminal color sequence.
# Globals:   NC (read) - sequence that resets the color
# Arguments: $1 - color sequence to emit before the text
#            $2 - text to print
# Outputs:   color + text + reset on stdout; backslash escapes in the
#            combined string are interpreted (echo -e).
print_color() {
  local color=$1 message=$2
  local line="${color}${message}${NC}"
  echo -e "$line"
}
# Tell whether a command name resolves to something runnable.
# Arguments: $1 - command name to look up
# Returns:   0 if `command -v` finds it, 1 otherwise; prints nothing
command_exists() {
  local tool=$1
  if command -v "$tool" &>/dev/null; then
    return 0
  fi
  return 1
}
# Verify that the external tools this script relies on are installed.
# Globals:   RED (read)
# Outputs:   on failure, an error line and an apt install hint on stdout
# Returns:   0 when everything is present; exits the shell with status 1
#            when at least one tool is missing
check_dependencies() {
  # "command:package" pairs; the package name is what apt must install.
  local -a requirements=("smartctl:smartmontools" "bc:bc")
  local -a missing=()
  local entry

  for entry in "${requirements[@]}"; do
    if ! command_exists "${entry%%:*}"; then
      missing+=("${entry#*:}")
    fi
  done

  if (( ${#missing[@]} > 0 )); then
    # The color is quoted so that an empty color (no usable terminal) does
    # not shift the message into the color argument.
    print_color "$RED" "Error: Missing required packages: ${missing[*]}"
    echo "Install with: sudo apt update && sudo apt install ${missing[*]}"
    exit 1
  fi
}
# Probe whether smartctl can read identity information from a device.
# Arguments: $1 - device path
#            $2 - optional smartctl device type (passed as `-d TYPE`)
# Outputs:   "full_access" if `smartctl -i` succeeds, otherwise "no_access"
test_smart_access() {
  local disk=$1
  local controller=$2
  # The command is kept in an array so the controller value reaches smartctl
  # as exactly one argument, with no word splitting or glob expansion.
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

  if ! "${smart_cmd[@]}" -i "$disk" &>/dev/null; then
    echo "no_access"
    return
  fi
  echo "full_access"
}
# Helper: print the value of the first "Label: value" line whose text matches
# a case-insensitive extended regex.
# Arguments: $1 - smartctl output; $2 - ERE selecting the label(s)
# Outputs:   everything after the first colon, with surrounding whitespace
#            removed; empty when no line matches
_smart_info_field() {
  grep -iE -- "$2" <<< "$1" \
    | head -n 1 \
    | cut -d: -f2- \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

# Collect identity information for a disk from `smartctl -i`.
# Arguments: $1 - device path
#            $2 - optional smartctl device type (passed as `-d TYPE`)
# Outputs:   one line "model|serial|capacity|firmware|type" where type is
#            SSD, HDD or UNKNOWN. Capacity is the text before any "[...]"
#            suffix (normally the comma-grouped byte count).
get_basic_disk_info() {
  local disk=$1
  local controller=$2
  # Array form keeps the controller value as a single argument.
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

  local info
  info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)

  local model vendor serial capacity firmware
  model=$(_smart_info_field "$info" 'Device Model:|Product:|Model Number:')
  vendor=$(_smart_info_field "$info" 'Vendor:')
  [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model"
  serial=$(_smart_info_field "$info" 'Serial Number:')
  # "Size/Capacity:" covers NVMe drives that only report a per-namespace
  # capacity line instead of "Total NVM Capacity:".
  capacity=$(_smart_info_field "$info" \
    'User Capacity:|Total NVM Capacity:|Size/Capacity:')
  capacity=${capacity%%\[*}
  # Drop the whitespace that preceded the removed "[...]" suffix.
  capacity=${capacity%"${capacity##*[![:space:]]}"}
  firmware=$(_smart_info_field "$info" \
    'Firmware Version:|Firmware revision:|Revision:')

  # Get disk type. An NVMe device node or device type is treated as SSD even
  # when the identity text itself never mentions "NVMe".
  local disk_type="UNKNOWN"
  if [[ "$disk" == /dev/nvme* || "$controller" == nvme* ]] \
    || grep -qiE 'Solid State Device|NVMe' <<< "$info"; then
    disk_type="SSD"
  elif grep -qi 'Rotation Rate' <<< "$info"; then
    disk_type="HDD"
  fi

  echo "$model|$serial|$capacity|$firmware|$disk_type"
}
# Helper: pull one column from the first SMART attribute row that matches.
# Arguments: $1 - `smartctl -A` output
#            $2 - lower-case ERE matched against the lower-cased row
#            $3 - "raw" for column 10 (RAW_VALUE start) or "last" for the
#                 final column
# Outputs:   the selected column; empty when no row matches
_smart_attr_field() {
  awk -v pat="$2" -v fld="$3" '
    tolower($0) ~ pat { print (fld == "last" ? $NF : $10); exit }
  ' <<< "$1"
}

# Collect health and wear attributes for a SATA/SAS disk.
# Arguments: $1 - device path
#            $2 - optional smartctl device type (passed as `-d TYPE`)
# Outputs:   one line
#            "health|power_on_hours|reallocated|pending|written|wearout";
#            fields that cannot be found are left empty
get_sata_disk_details() {
  local disk=$1
  local controller=$2
  # Array form keeps the controller value as a single argument.
  local -a smart_cmd=(smartctl)
  [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

  local health attributes
  health=$("${smart_cmd[@]}" -H "$disk" 2>/dev/null)
  attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)

  # ATA reports "...test result: PASSED"; SAS/SCSI devices report
  # "SMART Health Status: OK", so both spellings are accepted.
  local health_status
  health_status=$(grep -iE 'result:|SMART overall-health|Health Status:' \
    <<< "$health" \
    | head -n 1 \
    | cut -d: -f2- \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')

  local power_on_hours reallocated_sectors pending_sectors
  power_on_hours=$(_smart_attr_field "$attributes" 'power_on_hours' raw)
  reallocated_sectors=$(_smart_attr_field "$attributes" \
    'reallocated_sector_ct' raw)
  pending_sectors=$(_smart_attr_field "$attributes" \
    'current_pending_sector' raw)

  # Kingston SA400 specific attributes
  local total_written media_wearout
  total_written=$(_smart_attr_field "$attributes" \
    'flash_writes_gib|lifetime_writes_gib' last)
  media_wearout=$(_smart_attr_field "$attributes" 'ssd_life_left' last)

  # Standard SATA attributes
  if [[ -z "$total_written" ]]; then
    total_written=$(_smart_attr_field "$attributes" 'total_lbas_written' raw)
  fi

  echo "$health_status|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$media_wearout"
}
# Helper: extract the numeric value of one "Label: value" line of NVMe
# health output.
# Arguments: $1 - `smartctl -A` output; $2 - label text (case-insensitive)
# Outputs:   the first token after the colon with thousands separators and a
#            trailing "%" removed, e.g. "12,345,678 [6.32 TB]" -> "12345678"
#            and "3%" -> "3"; empty when the label is absent
_nvme_numeric_field() {
  local value
  value=$(grep -i -- "$2" <<< "$1" | head -n 1 | cut -d: -f2-)
  value=${value//,/}
  # Keep only the first whitespace-delimited token; anything after it is a
  # human-readable annotation such as "[6.32 TB]".
  read -r value _ <<< "$value"
  echo "${value%\%}"
}

# Collect health and wear figures for an NVMe disk.
# Arguments: $1 - device path
# Outputs:   one line
#            "health|power_on_hours|0|0|data_units_written|percentage_used".
#            The two sector fields are always 0 so the record has the same
#            layout as get_sata_disk_details.
get_nvme_disk_details() {
  local disk=$1
  local health attributes
  health=$(smartctl -H "$disk" 2>/dev/null)
  attributes=$(smartctl -A "$disk" 2>/dev/null)

  # NVMe health is reported as "...self-assessment test result: PASSED";
  # "Health Status:" is still accepted for compatibility.
  local health_status
  health_status=$(grep -iE 'result:|Health Status:' <<< "$health" \
    | head -n 1 \
    | cut -d: -f2- \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')

  local power_on_hours data_units_written percentage_used
  power_on_hours=$(_nvme_numeric_field "$attributes" 'Power On Hours')
  data_units_written=$(_nvme_numeric_field "$attributes" 'Data Units Written')
  percentage_used=$(_nvme_numeric_field "$attributes" 'Percentage Used')

  echo "$health_status|$power_on_hours|0|0|$data_units_written|$percentage_used"
}
# Convert a raw "amount written" counter into terabytes written
# (1 TB = 10^12 bytes).
# Arguments: $1 - raw counter value (thousands separators are tolerated)
#            $2 - disk model string, used to recognise Kingston SA400 and
#                 NVMe drives
#            $3 - name of the attribute the counter came from
# Outputs:   terabytes written with two decimals, or "0" when the counter is
#            empty, zero or not a plain non-negative integer
calculate_tbw() {
  local raw_value=${1//,/}
  local disk_model=$2
  local attribute_name=$3

  # Anything that is not a positive integer cannot be converted.
  if [[ ! "$raw_value" =~ ^[0-9]+$ || "$raw_value" =~ ^0+$ ]]; then
    echo "0"
    return
  fi

  local bytes_per_unit
  if grep -qi "KINGSTON.*SA400" <<< "$disk_model"; then
    # Kingston SA400 SSDs use Flash_Writes_GiB (value in GiB = 2^30 bytes)
    bytes_per_unit=1073741824
  elif grep -qi "NVMe" <<< "$disk_model" \
    || [[ "$attribute_name" == *"Data Units Written"* ]]; then
    # NVMe drives report Data Units Written in units of 1000 blocks of
    # 512 bytes each, i.e. 512,000 bytes per unit.
    bytes_per_unit=512000
  else
    # Standard SATA SSDs with Total_LBAs_Written (512-byte sectors)
    bytes_per_unit=512
  fi

  # awk does the floating-point division and always prints a leading zero.
  awk -v raw="$raw_value" -v unit="$bytes_per_unit" \
    'BEGIN { printf "%.2f\n", raw * unit / 1e12 }'
}
# Look up an endurance rating for an SSD from its model family and size.
# Arguments: $1 - disk model string
#            $2 - capacity in GB
# Outputs:   the rating for the first capacity tier the disk reaches, or the
#            family's fallback rating when it reaches none
estimate_ssd_endurance() {
  local disk_model=$1
  local capacity_gb=$2
  # Each tier is "minimum_capacity:rating", ordered from largest to smallest.
  local -a tiers
  local fallback

  if grep -qi "KINGSTON.*SA400" <<< "$disk_model"; then
    tiers=(960:300 480:150)
    fallback=80
  elif grep -qi "KINGSTON.*SA2000" <<< "$disk_model"; then
    tiers=(2000:800 1000:400 500:200)
    fallback=100
  elif grep -qi "NVMe" <<< "$disk_model"; then
    tiers=(2000:1200 1000:600 500:300)
    fallback=150
  else
    tiers=(1000:600 480:300 240:150)
    fallback=80
  fi

  local tier
  for tier in "${tiers[@]}"; do
    if [[ $capacity_gb -ge ${tier%%:*} ]]; then
      echo "${tier##*:}"
      return
    fi
  done
  echo "$fallback"
}
# Estimate how much life an SSD has left.
# Globals:   RED, YELLOW, GREEN, NC (read)
# Arguments: $1 - terabytes written so far (non-numeric counts as 0)
#            $2 - disk model string; a model containing "NVMe" marks $4 as
#                 "percentage used" rather than "percentage remaining"
#            $3 - capacity in GB
#            $4 - optional wear indicator, with or without a trailing "%"
# Outputs:   "remaining%|remaining TB|status|source", where source is
#            media_wearout, tbw or estimated; "Unknown|Unknown|Unknown|" when
#            no endurance rating is available
estimate_ssd_lifespan() {
  local tbw_used=$1
  local disk_model=$2
  local capacity_gb=$3
  local media_wearout=${4%\%}

  # Normalise the inputs so the arithmetic below cannot fail on text such as
  # an empty string or "3%".
  local number_re='^[0-9]*\.?[0-9]+$'
  [[ "$tbw_used" =~ $number_re ]] || tbw_used=0
  if [[ "$media_wearout" =~ ^[0-9]+$ ]]; then
    media_wearout=$((10#$media_wearout)) # 10# avoids octal parsing of "08"
  else
    media_wearout=""
  fi

  local estimated_endurance
  estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")
  if [[ ! "$estimated_endurance" =~ ^[0-9]+$ ]] \
    || (( 10#$estimated_endurance == 0 )); then
    echo "Unknown|Unknown|Unknown|"
    return
  fi

  # Remaining write budget, never shown as a negative number.
  local tbw_remaining
  tbw_remaining=$(awk -v total="$estimated_endurance" -v used="$tbw_used" \
    'BEGIN { left = total - used; if (left < 0) left = 0
             printf "%.2f\n", left }')

  local color status

  # Use media wearout indicator if available
  local remaining_pct=""
  if [[ -n "$media_wearout" ]]; then
    if grep -qi "NVMe" <<< "$disk_model"; then
      # For NVMe, percentage_used needs to be converted to remaining. The
      # value may exceed 100 on a worn-out drive, which means nothing left.
      remaining_pct=$(( media_wearout >= 100 ? 0 : 100 - media_wearout ))
    elif (( media_wearout != 0 )); then
      # For other drives a zero indicator is treated as "not reported".
      remaining_pct=$media_wearout
    fi
  fi

  if [[ -n "$remaining_pct" ]]; then
    if (( remaining_pct <= 10 )); then
      color=$RED
      status="Critical wear"
    elif (( remaining_pct <= 30 )); then
      color=$YELLOW
      status="High wear"
    elif (( remaining_pct <= 70 )); then
      color=$YELLOW
      status="Moderate wear"
    else
      color=$GREEN
      status="Healthy"
    fi
    echo "${color}${remaining_pct}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|media_wearout"
    return
  fi

  # Fall back to TBW calculation: one awk call classifies the share of the
  # rating already consumed and computes the percentage left.
  local verdict tier lifespan_remaining
  verdict=$(awk -v used="$tbw_used" -v total="$estimated_endurance" 'BEGIN {
    used += 0
    if (used <= 0) { print "none"; exit }
    pct = used * 100 / total
    left = 100 - pct
    if (left < 0) left = 0
    tier = (pct >= 80) ? "high" : ((pct >= 50) ? "moderate" : "ok")
    printf "%s %.1f\n", tier, left
  }')
  read -r tier lifespan_remaining <<< "$verdict"

  case "$tier" in
    high)
      color=$RED
      status="High wear"
      ;;
    moderate)
      color=$YELLOW
      status="Moderate wear"
      ;;
    ok)
      color=$GREEN
      status="Healthy"
      ;;
    *)
      # Nothing written yet (or no usable counter): report the full rating.
      echo "Unknown|${estimated_endurance} TB|New|estimated"
      return
      ;;
  esac
  echo "${color}${lifespan_remaining}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|tbw"
}
# Give a rough remaining-life verdict for a hard disk.
# Globals:   RED, YELLOW, GREEN, NC (read)
# Arguments: $1 - power-on hours; only the leading digits are used, so a raw
#                 value such as "12345h+30m" is accepted
#            $2 - reallocated sector count (empty counts as 0)
#            $3 - pending sector count (empty counts as 0)
# Outputs:   a colored verdict with the figure that triggered it, or
#            "Unknown" when there are no bad sectors and no usable hours
estimate_hdd_lifespan() {
  local power_on_hours=$1
  local reallocated_sectors=$2
  local pending_sectors=$3

  # Keep only the leading digits of each input; 10# prevents values with
  # leading zeros from being parsed as octal.
  local clean_hours=${power_on_hours%%[!0-9]*}
  reallocated_sectors=${reallocated_sectors%%[!0-9]*}
  pending_sectors=${pending_sectors%%[!0-9]*}
  clean_hours=$((10#${clean_hours:-0}))
  reallocated_sectors=$((10#${reallocated_sectors:-0}))
  pending_sectors=$((10#${pending_sectors:-0}))

  # Sector problems are judged first so they are reported even when the
  # power-on hours could not be read.
  if (( pending_sectors > 0 )); then
    echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)"
  elif (( reallocated_sectors > 100 )); then
    echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)"
  elif (( reallocated_sectors > 10 )); then
    echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)"
  elif (( clean_hours == 0 )); then
    echo "Unknown"
  elif (( clean_hours > 40000 )); then
    echo "${YELLOW}1-2 years${NC} (High usage: $clean_hours hours)"
  elif (( clean_hours > 25000 )); then
    echo "${GREEN}2-3 years${NC} (Moderate usage: $clean_hours hours)"
  else
    echo "${GREEN}> 3 years${NC} (Low usage: $clean_hours hours)"
  fi
}
# Helper: convert a capacity description into whole gigabytes (10^9 bytes).
# Arguments: $1 - text such as "480,103,981,056 bytes", "480 GB" or "1.92 TB"
# Outputs:   capacity in GB; a leading number without a GB/TB unit is read
#            as a byte count; 0 when the text has no leading number
_capacity_to_gb() {
  # Thousands separators are removed first so the byte count is one number.
  local text=${1//,/}
  local re='^[[:space:]]*([0-9]+)(\.[0-9]+)?[[:space:]]*([A-Za-z]*)'
  if [[ ! "$text" =~ $re ]]; then
    echo 0
    return
  fi
  local whole=${BASH_REMATCH[1]}
  local fraction=${BASH_REMATCH[2]}
  local unit=${BASH_REMATCH[3]^^}
  case "$unit" in
    TB)
      # +0.5 rounds to the nearest GB and absorbs floating-point error.
      awk -v v="${whole}${fraction}" 'BEGIN { printf "%d\n", v * 1000 + 0.5 }'
      ;;
    GB)
      echo $((10#$whole))
      ;;
    *)
      echo $((10#$whole / 1000000000))
      ;;
  esac
}

# Print a health report for one disk.
# Globals:   CYAN, RED, YELLOW (read)
# Arguments: $1 - device path
#            $2 - optional smartctl device type (passed on as the controller)
# Outputs:   the report on stdout, followed by a blank line
check_disk() {
  local disk=$1
  local controller=$2
  # Targets of the `read` calls below; declared local so they do not leak
  # into the global scope.
  local model serial capacity firmware disk_type
  local health_status power_on_hours reallocated_sectors pending_sectors
  local total_written media_wearout

  # Colors are quoted so an empty color does not shift the message into the
  # color argument.
  print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
  echo "=================================================="

  local access_level
  access_level=$(test_smart_access "$disk" "$controller")
  if [[ "$access_level" != "full_access" ]]; then
    if [[ "$access_level" == "no_access" ]]; then
      print_color "$RED" "ERROR: Cannot access disk through controller"
      echo "Possible reasons:"
      echo " - Controller doesn't support SMART passthrough"
      echo " - Disk is part of a hardware RAID array"
      echo " - Insufficient permissions (try running as root)"
      echo " - Controller busy or offline"
    fi
    echo ""
    return
  fi

  # Get basic disk information
  local basic_info
  basic_info=$(get_basic_disk_info "$disk" "$controller")
  IFS='|' read -r model serial capacity firmware disk_type <<< "$basic_info"

  # NVMe is recognised by device node or device type as well as by model,
  # because many NVMe model strings do not contain the word "NVMe".
  local is_nvme=0
  if [[ "$disk" == /dev/nvme* || "$controller" == nvme* ]] \
    || grep -qi "NVMe" <<< "$model"; then
    is_nvme=1
    disk_type="SSD"
  fi

  # Get detailed information based on disk type
  local details=""
  if (( is_nvme )); then
    details=$(get_nvme_disk_details "$disk")
  else
    details=$(get_sata_disk_details "$disk" "$controller")
  fi
  IFS='|' read -r health_status power_on_hours reallocated_sectors \
    pending_sectors total_written media_wearout <<< "$details"

  # Display basic information
  echo "Model: ${model:-Unknown}"
  echo "Serial: ${serial:-Unknown}"
  echo "Type: $disk_type"
  echo "Capacity: ${capacity:-Unknown}"
  echo "Firmware: ${firmware:-Unknown}"
  echo "Health: ${health_status:-Unknown}"
  if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
    echo "Power On Hours: $power_on_hours"
  else
    echo "Power On Hours: Unknown"
  fi

  # Disk type specific analysis
  if [[ "$disk_type" == "SSD" ]]; then
    # Calculate capacity in GB
    local capacity_gb
    capacity_gb=$(_capacity_to_gb "$capacity")

    # Get attribute name for TBW calculation. wear_model is the model string
    # handed to the estimators, which recognise NVMe drives only by the word
    # "NVMe" in the model, so it is appended when the model lacks it.
    local attribute_name=""
    local wear_model=$model
    if grep -qi "KINGSTON.*SA400" <<< "$model"; then
      attribute_name="Flash_Writes_GiB"
    elif (( is_nvme )); then
      attribute_name="Data Units Written"
      grep -qi "NVMe" <<< "$model" || wear_model="$model NVMe"
    else
      attribute_name="Total_LBAs_Written"
    fi

    local tbw_used
    tbw_used=$(calculate_tbw "$total_written" "$wear_model" "$attribute_name")
    echo "TBW Used: ${tbw_used} TB"

    local lifespan_info
    lifespan_info=$(estimate_ssd_lifespan "$tbw_used" "$wear_model" \
      "$capacity_gb" "$media_wearout")
    local lifespan_percent tbw_remaining wear_status wear_source
    IFS='|' read -r lifespan_percent tbw_remaining wear_status wear_source \
      <<< "$lifespan_info"
    echo "TBW Remaining: $tbw_remaining"
    echo "Lifespan: $lifespan_percent ($wear_status)"
    case "$wear_source" in
      media_wearout) echo "Wear Source: Media Wearout Indicator" ;;
      tbw) echo "Wear Source: TBW Calculation" ;;
      estimated) echo "Wear Source: Estimated Endurance" ;;
    esac
  elif [[ "$disk_type" == "HDD" ]]; then
    if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
      echo "Realloc Sectors: $reallocated_sectors"
    fi
    if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
      echo "Pending Sectors: $pending_sectors"
    fi
    local lifespan
    lifespan=$(estimate_hdd_lifespan "$power_on_hours" \
      "${reallocated_sectors:-0}" "${pending_sectors:-0}")
    echo "Lifespan: $lifespan"
  else
    print_color "$YELLOW" "Limited information available for this disk type"
  fi
  echo ""
}
# List the whole-disk block devices present on the system.
# Outputs:   device paths separated by spaces on one line (an empty line
#            when none are found)
detect_disks() {
  local -a disks=()
  local disk
  # Whole disks only: any number of letters/digits is accepted (so /dev/sdaa
  # and /dev/nvme10n1 are found) while partitions such as /dev/sda1 or
  # /dev/nvme0n1p1 are rejected. /dev/sg* is not scanned because SCSI
  # generic nodes are character devices and never pass the -b test.
  local whole_disk_re='^/dev/(sd[a-z]+|nvme[0-9]+n[0-9]+|vd[a-z]+|xvd[a-z]+)$'

  for disk in /dev/sd* /dev/nvme* /dev/vd* /dev/xvd*; do
    # An unmatched glob stays literal and is discarded by the regex.
    [[ "$disk" =~ $whole_disk_re ]] || continue
    if [[ -b "$disk" ]]; then
      disks+=("$disk")
    fi
  done
  echo "${disks[@]}"
}
# Entry point: check the disks named on the command line, or every
# auto-detected disk when none are given.
# Globals:   VERSION, EUID, BLUE, CYAN, GREEN, RED, YELLOW (read)
# Arguments: optional list of block device paths
# Outputs:   banner, per-disk reports and status messages on stdout
# Returns:   exits with status 1 when no usable disk remains (or when a
#            dependency is missing, via check_dependencies)
main() {
  local disk
  local -a disks=()

  # Colors are quoted throughout so an empty color (no usable terminal) is
  # still passed as the first argument instead of disappearing.
  print_color "$BLUE" "Ubuntu 24.04 Disk Health Check Script v$VERSION"
  print_color "$BLUE" "============================================"
  echo ""
  check_dependencies

  if [[ $# -gt 0 ]]; then
    for disk in "$@"; do
      if [[ -b "$disk" ]]; then
        disks+=("$disk")
      else
        print_color "$RED" "Error: $disk is not a valid block device"
      fi
    done
  else
    print_color "$CYAN" "Auto-detecting disks..."
    read -ra disks <<< "$(detect_disks)"
  fi

  if [[ ${#disks[@]} -eq 0 ]]; then
    print_color "$RED" "No disks found or accessible"
    exit 1
  fi
  print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
  echo ""

  if [[ $EUID -ne 0 ]]; then
    print_color "$YELLOW" "Warning: Not running as root. Some information may be limited."
    echo ""
  fi

  for disk in "${disks[@]}"; do
    check_disk "$disk"
  done
  print_color "$BLUE" "Check completed!"
}
# Print the command-line help text.
# Globals:   SCRIPT_NAME (read)
# Outputs:   usage synopsis and examples on stdout, one argument per line
usage() {
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" \
    "" \
    "Examples:" \
    " $SCRIPT_NAME # Check all disks" \
    " sudo $SCRIPT_NAME # Check all disks (as root)" \
    " $SCRIPT_NAME /dev/sda # Check specific disk" \
    " $SCRIPT_NAME /dev/nvme0n1 # Check NVMe disk"
}
# Command-line dispatch: help and version are answered directly; any other
# first argument (or none) hands the full argument list to main.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
  usage
  exit 0
elif [[ "${1:-}" == "-v" || "${1:-}" == "--version" ]]; then
  echo "$SCRIPT_NAME version $VERSION"
  exit 0
else
  main "$@"
fi