diff --git a/alma-v2.1.sh b/alma-v2.1.sh new file mode 100755 index 0000000..4a2847b --- /dev/null +++ b/alma-v2.1.sh @@ -0,0 +1,398 @@ +#!/bin/bash + +# Disk Health Check Script for Alma Linux 9 +# Checks SSD TBW/lifespan and HDD health status + +SCRIPT_NAME=$(basename "$0") +VERSION="2.1" + +# Color codes +RED=$(tput setaf 1) +GREEN=$(tput setaf 2) +YELLOW=$(tput setaf 3) +BLUE=$(tput setaf 4) +CYAN=$(tput setaf 6) +NC=$(tput sgr0) + +# Function to print colored output +print_color() { + local color=$1 + local message=$2 + echo -e "${color}${message}${NC}" +} + +# Check if command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Check dependencies +check_dependencies() { + local missing=() + + if ! command_exists smartctl; then + missing+=("smartmontools") + fi + + if ! command_exists bc; then + missing+=("bc") + fi + + if [[ ${#missing[@]} -gt 0 ]]; then + print_color $RED "Error: Missing required packages: ${missing[*]}" + echo "Install with: sudo dnf install ${missing[*]}" + exit 1 + fi +} + +# Function to get disk type +get_disk_type() { + local disk=$1 + local info=$(smartctl -i "$disk" 2>/dev/null) + + if echo "$info" | grep -q "Solid State Device"; then + echo "SSD" + elif echo "$info" | grep -q "Rotation Rate"; then + echo "HDD" + else + echo "UNKNOWN" + fi +} + +# Function to calculate TBW for SSD +calculate_tbw() { + local raw_value=$1 + local sectors=$2 + + if [[ -n "$sectors" && "$sectors" != "0" ]]; then + local bytes=$((sectors * 512)) + local tbw=$(echo "scale=2; $bytes / 1000 / 1000 / 1000 / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + elif [[ -n "$raw_value" && "$raw_value" != "0" ]]; then + local tbw=$(echo "scale=2; $raw_value * 32 / 1000 / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + else + echo "0" + fi +} + +# Function to estimate SSD endurance based on model and capacity +estimate_ssd_endurance() { + local disk_model=$1 + local capacity_gb=$2 + + # Enterprise SSDs typically have higher endurance + if echo 
"$disk_model" | grep -qi "MTFDDAK480TDS\|MICRON\|INTEL\|SAMSUNG\|KIOXIA"; then + # Enterprise SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "1200" # 1.2PB for 1TB enterprise + elif [[ $capacity_gb -ge 480 ]]; then + echo "600" # 600TB for 480GB enterprise + elif [[ $capacity_gb -ge 240 ]]; then + echo "300" # 300TB for 240GB enterprise + else + echo "150" # 150TB for smaller enterprise + fi + else + # Consumer SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "600" # 600TB for 1TB consumer + elif [[ $capacity_gb -ge 480 ]]; then + echo "300" # 300TB for 480GB consumer + elif [[ $capacity_gb -ge 240 ]]; then + echo "150" # 150TB for 240GB consumer + elif [[ $capacity_gb -ge 120 ]]; then + echo "80" # 80TB for 120GB consumer + else + echo "40" # 40TB for smaller drives + fi + fi +} + +# Function to estimate SSD lifespan with TBW remaining +estimate_ssd_lifespan() { + local power_on_hours=$1 + local tbw_used=$2 + local disk_model=$3 + local capacity_gb=$4 + + if [[ -z "$power_on_hours" || "$power_on_hours" -eq 0 ]]; then + echo "Unknown||Unknown" + return + fi + + local estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb") + local tbw_remaining=$(echo "scale=2; $estimated_endurance - $tbw_used" | bc 2>/dev/null || echo "0") + + if [[ $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then + local lifespan_used=$(echo "scale=1; $tbw_used * 100 / $estimated_endurance" | bc 2>/dev/null || echo "0") + local lifespan_remaining=$(echo "scale=1; 100 - $lifespan_used" | bc 2>/dev/null || echo "100") + + if [[ $(echo "$lifespan_used >= 80" | bc 2>/dev/null) -eq 1 ]]; then + echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|High wear" + elif [[ $(echo "$lifespan_used >= 50" | bc 2>/dev/null) -eq 1 ]]; then + echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|Moderate wear" + else + echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|Healthy" + fi + else + echo 
"Unknown|${estimated_endurance} TB|New" + fi +} + +# Function to estimate HDD lifespan +estimate_hdd_lifespan() { + local power_on_hours=$1 + local reallocated_sectors=$2 + local pending_sectors=$3 + + if [[ -z "$power_on_hours" ]]; then + echo "Unknown" + return + fi + + # Convert to integers + power_on_hours=${power_on_hours:-0} + reallocated_sectors=${reallocated_sectors:-0} + pending_sectors=${pending_sectors:-0} + + if [[ "$pending_sectors" -gt 0 ]]; then + echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)" + elif [[ "$reallocated_sectors" -gt 100 ]]; then + echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)" + elif [[ "$reallocated_sectors" -gt 10 ]]; then + echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)" + elif [[ "$power_on_hours" -gt 40000 ]]; then + echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)" + elif [[ "$power_on_hours" -gt 25000 ]]; then + echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)" + else + echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)" + fi +} + +# Function to check disk SMART capabilities +check_smart_capability() { + local disk=$1 + local info=$(smartctl -i "$disk" 2>/dev/null) + + if echo "$info" | grep -q "SMART support is: Available"; then + if echo "$info" | grep -q "SMART support is: Enabled"; then + echo "enabled" + else + echo "available" + fi + else + echo "unavailable" + fi +} + +# Function to check a single disk +check_disk() { + local disk=$1 + + print_color $CYAN "Checking disk: $disk" + echo "==================================================" + + # Check if disk exists and is accessible + if [[ ! 
-b "$disk" ]]; then + print_color $RED "Error: $disk is not a valid block device" + echo "" + return + fi + + # Check SMART capability + local smart_status=$(check_smart_capability "$disk") + if [[ "$smart_status" == "unavailable" ]]; then + print_color $YELLOW "SMART not supported on $disk" + echo "" + return + elif [[ "$smart_status" == "available" ]]; then + print_color $YELLOW "SMART available but not enabled on $disk" + echo "Enable with: smartctl -s on $disk" + echo "" + return + fi + + # Get basic disk information + local info=$(smartctl -i "$disk" 2>/dev/null) + local health=$(smartctl -H "$disk" 2>/dev/null) + local attributes=$(smartctl -A "$disk" 2>/dev/null) + + # Check if smartctl command succeeded + if [[ $? -ne 0 ]]; then + print_color $RED "Error: Cannot read SMART data from $disk" + echo "You may need to run this script as root" + echo "" + return + fi + + # Extract disk information + local model=$(echo "$info" | grep "Device Model:" | cut -d: -f2 | sed 's/^[ \t]*//') + local serial=$(echo "$info" | grep "Serial Number:" | cut -d: -f2 | sed 's/^[ \t]*//') + local capacity=$(echo "$info" | grep "User Capacity:" | cut -d: -f2 | sed 's/^[ \t]*//' | cut -d'[' -f1) + local firmware=$(echo "$info" | grep "Firmware Version:" | cut -d: -f2 | sed 's/^[ \t]*//') + + # Extract capacity in GB for endurance calculation + local capacity_gb=0 + if echo "$capacity" | grep -qi "TB"; then + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | awk '{print $1 * 1000}' | bc 2>/dev/null | cut -d. -f1) + else + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | cut -d. 
-f1) + fi + + local disk_type=$(get_disk_type "$disk") + local health_status=$(echo "$health" | grep "result:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + # Extract SMART attributes + local power_on_hours=$(echo "$attributes" | grep "Power_On_Hours" | awk '{print $10}' | head -1) + local reallocated_sectors=$(echo "$attributes" | grep "Reallocated_Sector_Ct" | awk '{print $10}' | head -1) + local pending_sectors=$(echo "$attributes" | grep "Current_Pending_Sector" | awk '{print $10}' | head -1) + local total_written=$(echo "$attributes" | grep -E "Total_LBAs_Written|Host_Writes_32MiB" | awk '{print $10}' | head -1) + local host_writes_32mib=$(echo "$attributes" | grep "Host_Writes_32MiB" | awk '{print $10}' | head -1) + + # Display basic information + echo "Model: ${model:-Unknown}" + echo "Serial: ${serial:-Unknown}" + echo "Type: $disk_type" + echo "Capacity: ${capacity:-Unknown}" + echo "Firmware: ${firmware:-Unknown}" + echo "Health: ${health_status:-Unknown}" + echo "Power On Hours: ${power_on_hours:-Unknown}" + + # Disk type specific analysis + if [[ "$disk_type" == "SSD" ]]; then + local tbw_used=0 + if [[ -n "$total_written" && "$total_written" != "0" ]]; then + tbw_used=$(calculate_tbw "" "$total_written") + elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then + tbw_used=$(calculate_tbw "$host_writes_32mib" "") + fi + + echo "TBW Used: ${tbw_used} TB" + + local lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$model" "$capacity_gb") + local lifespan_percent=$(echo "$lifespan_info" | cut -d'|' -f1) + local tbw_remaining=$(echo "$lifespan_info" | cut -d'|' -f2) + local wear_status=$(echo "$lifespan_info" | cut -d'|' -f3) + + echo "TBW Remaining: $tbw_remaining" + echo "Lifespan: $lifespan_percent ($wear_status)" + + elif [[ "$disk_type" == "HDD" ]]; then + echo "Realloc Sectors: ${reallocated_sectors:-0}" + echo "Pending Sectors: ${pending_sectors:-0}" + + local lifespan=$(estimate_hdd_lifespan "$power_on_hours" 
"${reallocated_sectors:-0}" "${pending_sectors:-0}") + echo "Lifespan: $lifespan" + else + print_color $YELLOW "Limited information available for this disk type" + fi + + echo "" +} + +# Function to detect all disks +detect_disks() { + local disks=() + + # Check for SATA/SAS disks + for disk in /dev/sd[a-z] /dev/sd[a-z][a-z]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + # Check for NVMe disks + for disk in /dev/nvme[0-9]n[0-9] /dev/nvme[0-9]n[0-9]p[0-9]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + # Check for other disk types + for disk in /dev/vd[a-z] /dev/xvd[a-z]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + echo "${disks[@]}" +} + +# Main function +main() { + print_color $BLUE "Disk Health Check Script v$VERSION for Alma Linux 9" + print_color $BLUE "====================================================" + echo "" + + check_dependencies + + local disks=() + + # If specific disk provided, check only that disk + if [[ $# -gt 0 ]]; then + for disk in "$@"; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + else + print_color $RED "Error: $disk is not a valid block device" + fi + done + else + # Auto-detect disks + print_color $CYAN "Auto-detecting disks..." + read -ra disks <<< "$(detect_disks)" + fi + + if [[ ${#disks[@]} -eq 0 ]]; then + print_color $RED "No disks found or accessible" + echo "Try running as root or specifying disk paths manually" + exit 1 + fi + + print_color $GREEN "Found ${#disks[@]} disk(s) to check" + echo "" + + # Check if running as root, warn if not + if [[ $EUID -ne 0 ]]; then + print_color $YELLOW "Warning: Not running as root. Some disks may not be accessible." + echo "For complete results, run as: sudo $0" + echo "" + fi + + # Check each disk + for disk in "${disks[@]}"; do + check_disk "$disk" + done + + print_color $BLUE "Check completed!" 
+} + +# Usage information +usage() { + echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" + echo "" + echo "If no disks specified, auto-detects all available disks" + echo "" + echo "Examples:" + echo " $SCRIPT_NAME # Check all auto-detected disks" + echo " sudo $SCRIPT_NAME # Check all disks (as root)" + echo " $SCRIPT_NAME /dev/sda # Check specific disk" + echo " $SCRIPT_NAME /dev/sda /dev/nvme0n1 # Check multiple disks" +} + +# Parse command line arguments +case "${1:-}" in + -h|--help) + usage + exit 0 + ;; + -v|--version) + echo "$SCRIPT_NAME version $VERSION" + exit 0 + ;; + *) + main "$@" + ;; +esac diff --git a/alma-v2.4.sh b/alma-v2.4.sh new file mode 100755 index 0000000..9790690 --- /dev/null +++ b/alma-v2.4.sh @@ -0,0 +1,536 @@ +#!/bin/bash + +# Disk Health Check Script for Alma Linux 9 +# Enhanced with SAS/PERC H730P controller support +# Checks SSD TBW/lifespan and HDD health status + +SCRIPT_NAME=$(basename "$0") +VERSION="2.4" + +# Color codes +RED=$(tput setaf 1) +GREEN=$(tput setaf 2) +YELLOW=$(tput setaf 3) +BLUE=$(tput setaf 4) +CYAN=$(tput setaf 6) +NC=$(tput sgr0) + +# Function to print colored output +print_color() { + local color=$1 + local message=$2 + echo -e "${color}${message}${NC}" +} + +# Check if command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Check dependencies +check_dependencies() { + local missing=() + + if ! command_exists smartctl; then + missing+=("smartmontools") + fi + + if ! command_exists bc; then + missing+=("bc") + fi + + if [[ ${#missing[@]} -gt 0 ]]; then + print_color $RED "Error: Missing required packages: ${missing[*]}" + echo "Install with: sudo dnf install ${missing[*]}" + exit 1 + fi +} + +# Function to test SMART access and get available data +test_smart_access() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + # Test basic SMART access + if ! 
$smart_cmd -i "$disk" &>/dev/null; then + echo "no_access" + return + fi + + # Check if SMART is enabled (don't enable it, just check status) + local smart_info=$($smart_cmd -i "$disk" 2>/dev/null) + local smart_available=$(echo "$smart_info" | grep "SMART support is:" | awk '{print $4}') + local smart_enabled=$(echo "$smart_info" | grep "SMART support is:" | awk '{print $6}') + + if [[ "$smart_available" != "Available" ]]; then + echo "not_available" + return + fi + + if [[ "$smart_enabled" != "Enabled" ]]; then + echo "disabled" + return + fi + + # Test attribute reading + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + if [[ -z "$attributes" ]]; then + echo "no_attributes" + return + fi + + # Check if we have basic attributes + local power_on_hours=$(echo "$attributes" | grep -i "Power_On_Hours\|Power-On" | awk '{print $10}' | head -1) + if [[ -z "$power_on_hours" ]]; then + echo "limited_attributes" + return + fi + + echo "full_access" +} + +# Function to get disk information with enhanced SAS support +get_disk_info() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + local info=$($smart_cmd -i "$disk" 2>/dev/null) + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + local health=$($smart_cmd -H "$disk" 2>/dev/null) + + # Extract information with multiple fallbacks for SAS drives + local model=$(echo "$info" | grep -i "Device Model:\|Product:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + local vendor=$(echo "$info" | grep -i "Vendor:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model" + + local serial=$(echo "$info" | grep -i "Serial Number:\|Serial number:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local capacity=$(echo "$info" | grep -i "User Capacity:\|Total NVM Capacity:" | cut -d: -f2 | sed 's/^[ \t]*//' | cut -d'[' -f1 | head -1) + + local firmware=$(echo "$info" | grep -i "Firmware 
Version:\|Firmware revision:\|Revision:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local health_status=$(echo "$health" | grep -i "result:\|SMART overall-health" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -z "$health_status" ]] && health_status=$(echo "$health" | grep -i "SMART overall-health" | awk -F'[' '{print $2}' | cut -d']' -f1) + + # Get disk type with SAS support + local disk_type="UNKNOWN" + if echo "$info" | grep -qi "Solid State Device"; then + disk_type="SSD" + elif echo "$info" | grep -qi "Rotation Rate"; then + disk_type="HDD" + elif echo "$info" | grep -qi "SCSI\|SAS"; then + # SAS drives often don't specify, check rotation rate + if echo "$info" | grep -qi "15000\|10000\|7200"; then + disk_type="HDD" + else + disk_type="SSD" + fi + fi + + # Extract SMART attributes with multiple field attempts for SAS + local power_on_hours=$(echo "$attributes" | grep -i "Power_On_Hours\|Power-On" | awk '{print $NF}' | head -1) + [[ -z "$power_on_hours" ]] && power_on_hours=$(echo "$attributes" | grep -i "Power_On_Hours_and_Msec" | awk '{print $10}' | head -1) + + local reallocated_sectors=$(echo "$attributes" | grep -i "Reallocated_Sector_Ct\|Reallocated_Event_Count" | awk '{print $NF}' | head -1) + + local pending_sectors=$(echo "$attributes" | grep -i "Current_Pending_Sector" | awk '{print $NF}' | head -1) + + local total_written=$(echo "$attributes" | grep -i "Total_LBAs_Written\|Host_Writes_32MiB\|Lifetime_Writes" | awk '{print $NF}' | head -1) + local host_writes_32mib=$(echo "$attributes" | grep -i "Host_Writes_32MiB" | awk '{print $NF}' | head -1) + + # For SAS drives, try to get media wearout for SSDs + local media_wearout=$(echo "$attributes" | grep -i "Media_Wearout_Indicator\|Wear_Leveling_Count" | awk '{print $NF}' | head -1) + + echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout" +} + +# Function to calculate TBW for SSD 
# Converts a SMART write counter into terabytes written (decimal TB, 10^12 B).
# Arguments:
#   $1 - raw_value:   count of 32 MiB units (Host_Writes_32MiB); used only
#                     when $2 is empty, zero or not a plain number
#   $2 - sectors:     count of logical sectors written (Total_LBAs_Written)
#   $3 - sector_size: bytes per logical sector; optional, defaults to 512
# Outputs:
#   TB written, truncated to two decimals (e.g. "0.51"), or "0" when no
#   usable counter was supplied or the total is below 0.01 TB.
calculate_tbw() {
  local raw_value=$1
  local sectors=$2
  local sector_size=${3:-512}
  # Digits only, short enough to fit a signed 64-bit integer. Validating
  # before any arithmetic keeps SMART text out of $(( )) evaluation.
  local -r uint_re='^[0-9]{1,18}$'
  local -r per_centi_tb=10000000000   # bytes in 0.01 TB
  local count bytes_per_unit hundredths

  # Fall back to the conventional 512-byte sector on a bogus size.
  if ! [[ "$sector_size" =~ ^[0-9]{1,6}$ ]] || (( 10#$sector_size == 0 )); then
    sector_size=512
  fi

  # "10#" forces base 10 so counters with leading zeros are not read as octal.
  if [[ "$sectors" =~ $uint_re ]] && (( 10#$sectors > 0 )); then
    count=$(( 10#$sectors ))
    bytes_per_unit=$(( 10#$sector_size ))
  elif [[ "$raw_value" =~ $uint_re ]] && (( 10#$raw_value > 0 )); then
    count=$(( 10#$raw_value ))
    bytes_per_unit=33554432           # 32 MiB = 32 * 1024 * 1024 bytes
  else
    printf '%s\n' "0"
    return
  fi

  # floor(count * bytes_per_unit / per_centi_tb), split into quotient and
  # remainder parts so the intermediate product cannot overflow 64 bits.
  hundredths=$(( count / per_centi_tb * bytes_per_unit ))
  hundredths=$(( hundredths + count % per_centi_tb * bytes_per_unit / per_centi_tb ))

  if (( hundredths == 0 )); then
    printf '%s\n' "0"                 # keep the "0" sentinel callers test for
  else
    printf '%d.%02d\n' "$(( hundredths / 100 ))" "$(( hundredths % 100 ))"
  fi
}

# Estimates the rated write endurance of an SSD from its model and capacity.
# The figures are coarse per-class estimates, not values read from the drive.
# Arguments:
#   $1 - disk_model:  model string, matched case-insensitively
#   $2 - capacity_gb: capacity in whole GB; empty or non-numeric counts as 0
# Outputs:
#   Estimated endurance in TB written, as an integer.
estimate_ssd_endurance() {
  local disk_model=$1
  local capacity_gb=$2
  local -a tiers
  local fallback tier

  if ! [[ "$capacity_gb" =~ ^[0-9]{1,18}$ ]]; then
    capacity_gb=0
  fi
  capacity_gb=$(( 10#$capacity_gb ))

  # Each tier is "minimum_capacity_gb:endurance_tb", largest capacity first.
  if grep -qiE 'ST600MP|SEAGATE.*SSD|SAS.*SSD' <<<"$disk_model"; then
    # Enterprise SAS SSDs - very high endurance
    tiers=(1000:10000 600:6000 400:4000)
    fallback=2000
  elif grep -qiE 'MTFDDAK|MICRON|INTEL|SAMSUNG|KIOXIA' <<<"$disk_model"; then
    # Enterprise SATA SSDs
    tiers=(1000:1200 480:600 240:300)
    fallback=150
  else
    # Consumer SSDs
    tiers=(1000:600 480:300 240:150 120:80)
    fallback=40
  fi

  for tier in "${tiers[@]}"; do
    if (( capacity_gb >= ${tier%%:*} )); then
      printf '%s\n' "${tier##*:}"
      return
    fi
  done
  printf '%s\n' "$fallback"
}

# Estimates remaining SSD life and remaining write budget.
# Globals:
#   RED, YELLOW, GREEN, NC (read) - terminal colour sequences
# Arguments:
#   $1 - power_on_hours: empty or zero yields the all-"Unknown" record
#   $2 - tbw_used:       TB written so far, decimal (e.g. "12.34")
#   $3 - disk_model:     passed to estimate_ssd_endurance
#   $4 - capacity_gb:    passed to estimate_ssd_endurance
#   $5 - media_wearout:  percent of life remaining; used only when it is a
#                        whole number from 1 to 100
# Outputs:
#   One record "lifespan|tbw_remaining|wear_status|source", where source is
#   "media_wearout", "tbw", "estimated", or empty when nothing is known.
estimate_ssd_lifespan() {
  local power_on_hours=$1
  local tbw_used=$2
  local disk_model=$3
  local capacity_gb=$4
  local media_wearout=$5
  local -r decimal_re='^([0-9]{0,15})(\.([0-9]*))?$'
  local endurance whole frac color status wear
  local used_h=0 remaining_h tbw_remaining
  local used_tenths remaining_tenths lifespan_remaining

  # NOTE(review): power-on hours are not used in any calculation below; they
  # only gate the estimate. Confirm a drive that hides them should report
  # "Unknown" even when wear data is present.
  if [[ -z "$power_on_hours" || "$power_on_hours" =~ ^0+$ ]]; then
    # Four fields, like every other record this function emits.
    printf '%s\n' "Unknown|Unknown|Unknown|"
    return
  fi

  endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")

  # Parse tbw_used into hundredths of a TB; anything unparsable counts as 0.
  if [[ -n "${tbw_used//./}" && "$tbw_used" =~ $decimal_re ]]; then
    whole=${BASH_REMATCH[1]:-0}
    frac="${BASH_REMATCH[3]}00"
    used_h=$(( 10#$whole * 100 + 10#${frac:0:2} ))
  fi

  # Remaining write budget, never below zero once the estimate is exceeded.
  remaining_h=$(( endurance * 100 - used_h ))
  if (( remaining_h < 0 )); then
    remaining_h=0
  fi
  printf -v tbw_remaining '%d.%02d' \
    "$(( remaining_h / 100 ))" "$(( remaining_h % 100 ))"

  # Prefer the drive's own wear indicator when it is a plausible percentage.
  # NOTE(review): get_disk_info takes the last column of the attribute line;
  # confirm that column really is a percent-remaining figure.
  if [[ "$media_wearout" =~ ^[0-9]{1,3}$ ]] \
      && (( 10#$media_wearout >= 1 && 10#$media_wearout <= 100 )); then
    wear=$(( 10#$media_wearout ))
    if (( wear <= 10 )); then
      color=$RED
      status="Critical wear"
    elif (( wear <= 30 )); then
      color=$YELLOW
      status="High wear"
    elif (( wear <= 70 )); then
      color=$YELLOW
      status="Moderate wear"
    else
      color=$GREEN
      status="Healthy"
    fi
    printf '%s\n' "${color}${wear}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|media_wearout"
    return
  fi

  if (( used_h > 0 )); then
    # Percent of endurance consumed, in tenths of a percent, truncated.
    used_tenths=$(( used_h * 10 / endurance ))
    remaining_tenths=$(( 1000 - used_tenths ))
    if (( remaining_tenths < 0 )); then
      remaining_tenths=0
    fi
    printf -v lifespan_remaining '%d.%d' \
      "$(( remaining_tenths / 10 ))" "$(( remaining_tenths % 10 ))"

    if (( used_tenths >= 800 )); then
      color=$RED
      status="High wear"
    elif (( used_tenths >= 500 )); then
      color=$YELLOW
      status="Moderate wear"
    else
      color=$GREEN
      status="Healthy"
    fi
    printf '%s\n' "${color}${lifespan_remaining}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|tbw"
  else
    printf '%s\n' "Unknown|${endurance} TB|New|estimated"
  fi
}

# Function to estimate HDD lifespan
+estimate_hdd_lifespan() { + local power_on_hours=$1 + local reallocated_sectors=$2 + local pending_sectors=$3 + + if [[ -z "$power_on_hours" ]]; then + echo "Unknown" + return + fi + + power_on_hours=${power_on_hours:-0} + reallocated_sectors=${reallocated_sectors:-0} + pending_sectors=${pending_sectors:-0} + + if [[ "$pending_sectors" -gt 0 ]]; then + echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)" + elif [[ "$reallocated_sectors" -gt 100 ]]; then + echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)" + elif [[ "$reallocated_sectors" -gt 10 ]]; then + echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)" + elif [[ "$power_on_hours" -gt 40000 ]]; then + echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)" + elif [[ "$power_on_hours" -gt 25000 ]]; then + echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)" + else + echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)" + fi +} + +# Function to check a single disk with enhanced error handling +check_disk() { + local disk=$1 + local controller=$2 + + print_color $CYAN "Checking disk: $disk (Controller: ${controller:-direct})" + echo "==================================================" + + # Test SMART access level + local access_level=$(test_smart_access "$disk" "$controller") + + case $access_level in + "no_access") + print_color $RED "ERROR: Cannot access disk through controller" + echo "Possible reasons:" + echo " - Controller doesn't support SMART passthrough" + echo " - Disk is part of a hardware RAID array" + echo " - Insufficient permissions (try running as root)" + echo " - Controller busy or offline" + echo "" + return + ;; + "not_available") + print_color $YELLOW "SMART not available on this disk" + echo "This disk does not support SMART monitoring" + echo "" + return + ;; + "disabled") + print_color $YELLOW "SMART is disabled on this disk" + echo "SMART is available but currently disabled" + echo "To 
enable manually: smartctl -s on ${controller:+-d $controller} $disk" + echo "" + return + ;; + "no_attributes") + print_color $YELLOW "WARNING: Cannot read SMART attributes" + echo "This is common with hardware RAID controllers like PERC H730P" + echo "Try checking through the RAID management interface" + echo "" + return + ;; + "limited_attributes") + print_color $YELLOW "NOTE: Limited SMART data available" + echo "Controller is filtering some SMART attributes" + ;; + esac + + # Get disk information + local disk_info=$(get_disk_info "$disk" "$controller") + IFS='|' read -r model serial capacity firmware health_status disk_type power_on_hours reallocated_sectors pending_sectors total_written host_writes_32mib media_wearout <<< "$disk_info" + + # Display basic information + echo "Model: ${model:-Unknown}" + echo "Serial: ${serial:-Unknown}" + echo "Type: $disk_type" + echo "Capacity: ${capacity:-Unknown}" + echo "Firmware: ${firmware:-Unknown}" + echo "Health: ${health_status:-Unknown}" + + # Only show power on hours if available + if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then + echo "Power On Hours: $power_on_hours" + else + echo "Power On Hours: Unknown" + fi + + # Disk type specific analysis + if [[ "$disk_type" == "SSD" ]]; then + local tbw_used=0 + if [[ -n "$total_written" && "$total_written" != "0" ]]; then + tbw_used=$(calculate_tbw "" "$total_written") + elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then + tbw_used=$(calculate_tbw "$host_writes_32mib" "") + fi + + if [[ $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then + echo "TBW Used: ${tbw_used} TB" + fi + + # Estimate capacity for endurance calculation + local capacity_gb=0 + if echo "$capacity" | grep -qi "GB"; then + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | cut -d. -f1) + elif echo "$capacity" | grep -qi "TB"; then + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | awk '{print $1 * 1000}' | bc 2>/dev/null | cut -d. 
-f1) + fi + + local lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$model" "$capacity_gb" "$media_wearout") + local lifespan_percent=$(echo "$lifespan_info" | cut -d'|' -f1) + local tbw_remaining=$(echo "$lifespan_info" | cut -d'|' -f2) + local wear_status=$(echo "$lifespan_info" | cut -d'|' -f3) + local wear_source=$(echo "$lifespan_info" | cut -d'|' -f4) + + if [[ "$wear_source" != "media_wearout" && $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then + echo "TBW Remaining: $tbw_remaining" + fi + + echo "Lifespan: $lifespan_percent ($wear_status)" + + # Show wear source if available + if [[ "$wear_source" == "media_wearout" ]]; then + echo "Wear Source: Media Wearout Indicator" + elif [[ "$wear_source" == "tbw" ]]; then + echo "Wear Source: TBW Calculation" + elif [[ "$wear_source" == "estimated" ]]; then + echo "Wear Source: Estimated Endurance" + fi + + elif [[ "$disk_type" == "HDD" ]]; then + if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then + echo "Realloc Sectors: $reallocated_sectors" + fi + if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then + echo "Pending Sectors: $pending_sectors" + fi + + local lifespan=$(estimate_hdd_lifespan "$power_on_hours" "${reallocated_sectors:-0}" "${pending_sectors:-0}") + echo "Lifespan: $lifespan" + else + print_color $YELLOW "Limited information available for this disk type" + echo "This is normal for hardware RAID configurations like PERC H730P" + echo "For detailed SAS drive information, use controller management tools" + fi + + echo "" +} + +# Function to detect all disks with enhanced SAS support (no partitions) +detect_disks() { + local disks=() + + # Check for SATA/SAS disks - only main devices, no partitions + for disk in /dev/sd[a-z]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + # Check for NVMe disks - only main devices, no partitions + for disk in /dev/nvme[0-9]n[0-9]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done 
+ + # Check for SAS disks via SCSI generic - only main devices + for disk in /dev/sg[0-9]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + # Check for other disk types - only main devices + for disk in /dev/vd[a-z] /dev/xvd[a-z]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + echo "${disks[@]}" +} + +# Main function +main() { + print_color $BLUE "Alma Linux 9 Disk Health Check Script v$VERSION" + print_color $BLUE "Enhanced with PERC H730P and SAS Support" + print_color $BLUE "============================================" + echo "" + + check_dependencies + + local disks=() + + # If specific disk provided, check only that disk + if [[ $# -gt 0 ]]; then + for disk in "$@"; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + else + print_color $RED "Error: $disk is not a valid block device" + fi + done + else + # Auto-detect disks + print_color $CYAN "Auto-detecting disks (excluding partitions)..." + read -ra disks <<< "$(detect_disks)" + fi + + if [[ ${#disks[@]} -eq 0 ]]; then + print_color $RED "No disks found or accessible" + echo "Try running as root or specifying disk paths manually" + exit 1 + fi + + print_color $GREEN "Found ${#disks[@]} disk(s) to check" + echo "" + + # Check if running as root, warn if not + if [[ $EUID -ne 0 ]]; then + print_color $YELLOW "Warning: Not running as root." + print_color $YELLOW "Some disks/controllers may show limited information." + echo "For complete results, run as: sudo $0" + echo "" + fi + + # Check each disk + for disk in "${disks[@]}"; do + check_disk "$disk" + done + + print_color $BLUE "Check completed!" 
+ echo "" + print_color $CYAN "Note: For PERC H730P controllers with SAS drives:" + print_color $CYAN " - Install 'storcli' for detailed controller information" + print_color $CYAN " - Use 'smartctl -d sat /dev/sgX' to try direct access" + print_color $CYAN " - Hardware RAID controllers often limit SMART data access" +} + +# Usage information +usage() { + echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" + echo "" + echo "If no disks specified, auto-detects all available disks" + echo "" + echo "Examples:" + echo " $SCRIPT_NAME # Check all auto-detected disks" + echo " sudo $SCRIPT_NAME # Check all disks (as root)" + echo " $SCRIPT_NAME /dev/sda # Check specific disk" + echo " $SCRIPT_NAME /dev/sg0 # Check SAS disk directly" + echo " $SCRIPT_NAME /dev/sda /dev/nvme0n1 # Check multiple disks" +} + +# Parse command line arguments +case "${1:-}" in + -h|--help) + usage + exit 0 + ;; + -v|--version) + echo "$SCRIPT_NAME version $VERSION" + exit 0 + ;; + *) + main "$@" + ;; +esac diff --git a/harvester-v2.1.sh b/harvester-v2.1.sh new file mode 100755 index 0000000..d085f78 --- /dev/null +++ b/harvester-v2.1.sh @@ -0,0 +1,337 @@ +#!/bin/bash + +# Disk Health Check Script for Harvester OS +# Checks SSD TBW/lifespan and HDD health status +# Supports RAID controllers and direct disks + +SCRIPT_NAME=$(basename "$0") +VERSION="2.1" + +# Color codes +RED=$(tput setaf 1) +GREEN=$(tput setaf 2) +YELLOW=$(tput setaf 3) +BLUE=$(tput setaf 4) +CYAN=$(tput setaf 6) +NC=$(tput sgr0) + +# Function to print colored output +print_color() { + local color=$1 + local message=$2 + echo -e "${color}${message}${NC}" +} + +# Check if smartctl is installed +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +if ! command_exists smartctl; then + print_color $RED "Error: smartctl is not installed. Please install smartmontools package." 
+ exit 1 +fi + +# Function to get disk type +get_disk_type() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + smart_cmd+=" -i $disk" + + local info=$($smart_cmd 2>/dev/null) + + if echo "$info" | grep -q "Solid State Device"; then + echo "SSD" + elif echo "$info" | grep -q "Rotation Rate"; then + echo "HDD" + else + echo "UNKNOWN" + fi +} + +# Function to calculate TBW for SSD +calculate_tbw() { + local raw_value=$1 + local sectors=$2 + + if [[ -n "$sectors" && "$sectors" != "0" ]]; then + # Calculate from sectors (most common) + local bytes=$((sectors * 512)) + local tbw=$(echo "scale=2; $bytes / 1000 / 1000 / 1000 / 1000" | bc -l 2>/dev/null || echo "0") + echo "$tbw" + elif [[ -n "$raw_value" && "$raw_value" != "0" ]]; then + # Try to calculate from raw value (varies by manufacturer) + local tbw=$(echo "scale=2; $raw_value * 32 / 1000 / 1000" | bc -l 2>/dev/null || echo "0") + echo "$tbw" + else + echo "0" + fi +} + +# Function to estimate SSD endurance based on model and capacity +estimate_ssd_endurance() { + local disk_model=$1 + local capacity_gb=$2 + + # Enterprise SSDs typically have higher endurance + if echo "$disk_model" | grep -qi "MTFDDAK480TDS\|MICRON\|INTEL\|SAMSUNG"; then + # Enterprise SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "1200" # 1.2PB for 1TB enterprise + elif [[ $capacity_gb -ge 480 ]]; then + echo "600" # 600TB for 480GB enterprise + elif [[ $capacity_gb -ge 240 ]]; then + echo "300" # 300TB for 240GB enterprise + else + echo "150" # 150TB for smaller enterprise + fi + else + # Consumer SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "600" # 600TB for 1TB consumer + elif [[ $capacity_gb -ge 480 ]]; then + echo "300" # 300TB for 480GB consumer + elif [[ $capacity_gb -ge 240 ]]; then + echo "150" # 150TB for 240GB consumer + elif [[ $capacity_gb -ge 120 ]]; then + echo "80" # 80TB for 120GB consumer + else + echo "40" # 40TB for smaller drives + fi + 
fi +} + +# Function to estimate SSD lifespan with TBW remaining +estimate_ssd_lifespan() { + local power_on_hours=$1 + local tbw_used=$2 + local disk_model=$3 + local capacity_gb=$4 + + if [[ -z "$power_on_hours" || "$power_on_hours" -eq 0 ]]; then + echo "Unknown||Unknown" + return + fi + + local estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb") + local tbw_remaining=$(echo "scale=2; $estimated_endurance - $tbw_used" | bc -l 2>/dev/null || echo "0") + + if [[ $(echo "$tbw_used > 0" | bc -l 2>/dev/null) -eq 1 ]]; then + local lifespan_used=$(echo "scale=1; $tbw_used * 100 / $estimated_endurance" | bc -l 2>/dev/null || echo "0") + local lifespan_remaining=$(echo "scale=1; 100 - $lifespan_used" | bc -l 2>/dev/null || echo "100") + + if [[ $(echo "$lifespan_used >= 80" | bc -l) -eq 1 ]]; then + echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|High wear" + elif [[ $(echo "$lifespan_used >= 50" | bc -l) -eq 1 ]]; then + echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|Moderate wear" + else + echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|Healthy" + fi + else + echo "Unknown|${estimated_endurance} TB|New" + fi +} + +# Function to estimate HDD lifespan +estimate_hdd_lifespan() { + local power_on_hours=$1 + local reallocated_sectors=$2 + local pending_sectors=$3 + + if [[ -z "$power_on_hours" ]]; then + echo "Unknown" + return + fi + + # HDD lifespan estimation based on common failure patterns + if [[ "$pending_sectors" -gt 0 ]]; then + echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)" + elif [[ "$reallocated_sectors" -gt 100 ]]; then + echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)" + elif [[ "$reallocated_sectors" -gt 10 ]]; then + echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)" + elif [[ "$power_on_hours" -gt 40000 ]]; then + echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)" + elif 
[[ "$power_on_hours" -gt 25000 ]]; then + echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)" + else + echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)" + fi +} + +# Function to check a single disk +check_disk() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + print_color $CYAN "Checking disk: $disk (Controller: ${controller:-direct})" + echo "==================================================" + + # Get basic disk information + local info=$($smart_cmd -i "$disk" 2>/dev/null) + local health=$($smart_cmd -H "$disk" 2>/dev/null) + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + + # Extract disk information + local model=$(echo "$info" | grep "Device Model:" | cut -d: -f2 | sed 's/^[ \t]*//') + local serial=$(echo "$info" | grep "Serial Number:" | cut -d: -f2 | sed 's/^[ \t]*//') + local capacity=$(echo "$info" | grep "User Capacity:" | cut -d: -f2 | sed 's/^[ \t]*//' | cut -d'[' -f1) + local firmware=$(echo "$info" | grep "Firmware Version:" | cut -d: -f2 | sed 's/^[ \t]*//') + + # Extract capacity in GB for endurance calculation + local capacity_gb=0 + if [[ $capacity =~ \[([0-9.]+)\s+GB\] ]]; then + capacity_gb=${BASH_REMATCH[1]} + elif [[ $capacity =~ \[([0-9.]+)\s+TB\] ]]; then + capacity_gb=$(echo "${BASH_REMATCH[1]} * 1000" | bc -l 2>/dev/null | cut -d. 
-f1) + fi + + local disk_type=$(get_disk_type "$disk" "$controller") + local health_status=$(echo "$health" | grep "result:" | cut -d: -f2 | sed 's/^[ \t]*//') + + # Extract SMART attributes + local power_on_hours=$(echo "$attributes" | grep "Power_On_Hours" | awk '{print $10}') + local reallocated_sectors=$(echo "$attributes" | grep "Reallocated_Sector_Ct" | awk '{print $10}') + local pending_sectors=$(echo "$attributes" | grep "Current_Pending_Sector" | awk '{print $10}') + local total_written=$(echo "$attributes" | grep -E "Total_LBAs_Written|Host_Writes_32MiB" | awk '{print $10}') + + # For SSDs with Host_Writes_32MiB + local host_writes_32mib=$(echo "$attributes" | grep "Host_Writes_32MiB" | awk '{print $10}') + + # Display basic information + echo "Model: $model" + echo "Serial: $serial" + echo "Type: $disk_type" + echo "Capacity: $capacity" + echo "Firmware: $firmware" + echo "Health: $health_status" + echo "Power On Hours: $power_on_hours" + + # Disk type specific analysis + if [[ "$disk_type" == "SSD" ]]; then + local tbw_used=0 + if [[ -n "$total_written" && "$total_written" != "0" ]]; then + tbw_used=$(calculate_tbw "" "$total_written") + elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then + tbw_used=$(calculate_tbw "$host_writes_32mib" "") + fi + + echo "TBW Used: ${tbw_used} TB" + + local lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$model" "$capacity_gb") + local lifespan_percent=$(echo "$lifespan_info" | cut -d'|' -f1) + local tbw_remaining=$(echo "$lifespan_info" | cut -d'|' -f2) + local wear_status=$(echo "$lifespan_info" | cut -d'|' -f3) + + echo "TBW Remaining: $tbw_remaining" + echo "Lifespan: $lifespan_percent ($wear_status)" + + elif [[ "$disk_type" == "HDD" ]]; then + echo "Realloc Sectors: ${reallocated_sectors:-0}" + echo "Pending Sectors: ${pending_sectors:-0}" + + local lifespan=$(estimate_hdd_lifespan "$power_on_hours" "${reallocated_sectors:-0}" "${pending_sectors:-0}") + echo "Lifespan: 
$lifespan" + else + print_color $YELLOW "Unknown disk type - limited information available" + fi + + echo "" +} + +# Function to detect RAID controllers and disks +detect_raid_disks() { + local controllers=("megaraid" "cciss" "areca" "3ware" "hpt") + local disks=() + + # Check for direct disks first + for disk in /dev/sd[a-z] /dev/nvme[0-9]n[0-9]; do + if [[ -b "$disk" ]]; then + disks+=("$disk:direct") + fi + done + + # Check for RAID controllers + for controller in "${controllers[@]}"; do + for i in {0..15}; do + if smartctl -d "$controller,$i" -i /dev/sda >/dev/null 2>&1; then + disks+=("/dev/sda:$controller,$i") + fi + done + done + + echo "${disks[@]}" +} + +# Main function +main() { + print_color $BLUE "Disk Health Check Script v$VERSION for Harvester OS" + print_color $BLUE "====================================================" + echo "" + + local disks=() + + # If specific disk provided, check only that disk + if [[ $# -gt 0 ]]; then + for disk in "$@"; do + if [[ -b "$disk" ]]; then + disks+=("$disk:direct") + else + print_color $RED "Error: $disk is not a valid block device" + fi + done + else + # Auto-detect disks + print_color $CYAN "Auto-detecting disks..." + read -ra disks <<< "$(detect_raid_disks)" + fi + + if [[ ${#disks[@]} -eq 0 ]]; then + print_color $RED "No disks found or accessible" + exit 1 + fi + + print_color $GREEN "Found ${#disks[@]} disk(s) to check" + echo "" + + # Check each disk + for disk_info in "${disks[@]}"; do + IFS=':' read -r disk controller <<< "$disk_info" + check_disk "$disk" "$controller" + done + + print_color $BLUE "Check completed!" 
+} + +# Usage information +usage() { + echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" + echo "" + echo "If no disks specified, auto-detects all available disks and RAID arrays" + echo "" + echo "Examples:" + echo " $SCRIPT_NAME # Check all auto-detected disks" + echo " $SCRIPT_NAME /dev/sda # Check specific disk" + echo " $SCRIPT_NAME /dev/sda /dev/sdb # Check multiple disks" +} + +# Parse command line arguments +case "${1:-}" in + -h|--help) + usage + exit 0 + ;; + -v|--version) + echo "$SCRIPT_NAME version $VERSION" + exit 0 + ;; + *) + main "$@" + ;; +esac diff --git a/harvester-v2.4.sh b/harvester-v2.4.sh new file mode 100755 index 0000000..9e82677 --- /dev/null +++ b/harvester-v2.4.sh @@ -0,0 +1,542 @@ +#!/bin/bash + +# Disk Health Check Script for Harvester OS +# Enhanced with SAS/PERC H730P controller support +# Checks SSD TBW/lifespan and HDD health status + +SCRIPT_NAME=$(basename "$0") +VERSION="2.4" + +# Color codes +RED=$(tput setaf 1) +GREEN=$(tput setaf 2) +YELLOW=$(tput setaf 3) +BLUE=$(tput setaf 4) +CYAN=$(tput setaf 6) +NC=$(tput sgr0) + +# Function to print colored output +print_color() { + local color=$1 + local message=$2 + echo -e "${color}${message}${NC}" +} + +# Check if command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +if ! command_exists smartctl; then + print_color $RED "Error: smartctl is not installed. Please install smartmontools package." + exit 1 +fi + +# Function to test SMART access and get available data +test_smart_access() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + # Test basic SMART access + if ! 
$smart_cmd -i "$disk" &>/dev/null; then + echo "no_access" + return + fi + + # Check if SMART is enabled (don't enable it, just check status) + local smart_info=$($smart_cmd -i "$disk" 2>/dev/null) + local smart_available=$(echo "$smart_info" | grep "SMART support is:" | awk '{print $4}') + local smart_enabled=$(echo "$smart_info" | grep "SMART support is:" | awk '{print $6}') + + if [[ "$smart_available" != "Available" ]]; then + echo "not_available" + return + fi + + if [[ "$smart_enabled" != "Enabled" ]]; then + echo "disabled" + return + fi + + # Test attribute reading + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + if [[ -z "$attributes" ]]; then + echo "no_attributes" + return + fi + + # Check if we have basic attributes + local power_on_hours=$(echo "$attributes" | grep -i "Power_On_Hours\|Power-On" | awk '{print $10}' | head -1) + if [[ -z "$power_on_hours" ]]; then + echo "limited_attributes" + return + fi + + echo "full_access" +} + +# Function to get disk information with enhanced SAS support +get_disk_info() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + local info=$($smart_cmd -i "$disk" 2>/dev/null) + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + local health=$($smart_cmd -H "$disk" 2>/dev/null) + + # Extract information with multiple fallbacks for SAS drives + local model=$(echo "$info" | grep -i "Device Model:\|Product:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + local vendor=$(echo "$info" | grep -i "Vendor:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model" + + local serial=$(echo "$info" | grep -i "Serial Number:\|Serial number:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local capacity=$(echo "$info" | grep -i "User Capacity:\|Total NVM Capacity:" | cut -d: -f2 | sed 's/^[ \t]*//' | cut -d'[' -f1 | head -1) + + local firmware=$(echo "$info" | grep -i "Firmware 
Version:\|Firmware revision:\|Revision:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local health_status=$(echo "$health" | grep -i "result:\|SMART overall-health" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -z "$health_status" ]] && health_status=$(echo "$health" | grep -i "SMART overall-health" | awk -F'[' '{print $2}' | cut -d']' -f1) + + # Get disk type with SAS support + local disk_type="UNKNOWN" + if echo "$info" | grep -qi "Solid State Device"; then + disk_type="SSD" + elif echo "$info" | grep -qi "Rotation Rate"; then + disk_type="HDD" + elif echo "$info" | grep -qi "SCSI\|SAS"; then + # SAS drives often don't specify, check rotation rate + if echo "$info" | grep -qi "15000\|10000\|7200"; then + disk_type="HDD" + else + disk_type="SSD" + fi + fi + + # Extract SMART attributes with multiple field attempts for SAS + local power_on_hours=$(echo "$attributes" | grep -i "Power_On_Hours\|Power-On" | awk '{print $NF}' | head -1) + [[ -z "$power_on_hours" ]] && power_on_hours=$(echo "$attributes" | grep -i "Power_On_Hours_and_Msec" | awk '{print $10}' | head -1) + + local reallocated_sectors=$(echo "$attributes" | grep -i "Reallocated_Sector_Ct\|Reallocated_Event_Count" | awk '{print $NF}' | head -1) + + local pending_sectors=$(echo "$attributes" | grep -i "Current_Pending_Sector" | awk '{print $NF}' | head -1) + + local total_written=$(echo "$attributes" | grep -i "Total_LBAs_Written\|Host_Writes_32MiB\|Lifetime_Writes" | awk '{print $NF}' | head -1) + local host_writes_32mib=$(echo "$attributes" | grep -i "Host_Writes_32MiB" | awk '{print $NF}' | head -1) + + # For SAS drives, try to get media wearout for SSDs + local media_wearout=$(echo "$attributes" | grep -i "Media_Wearout_Indicator\|Wear_Leveling_Count" | awk '{print $NF}' | head -1) + + echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout" +} + +# Function to calculate TBW for SSD 
#######################################
# Convert a SMART write counter into terabytes written (decimal TB).
# Arguments:
#   $1 - Host_Writes_32MiB raw value (count of 32 MiB units); consulted only
#        when $2 is not a positive integer
#   $2 - Total_LBAs_Written raw value, treated as 512-byte sectors
# Outputs:
#   TB written with two decimals on stdout, or "0" when neither argument is a
#   positive integer.
#######################################
calculate_tbw() {
    local raw_value=$1
    local sectors=$2
    local positive_int='^0*[1-9][0-9]*$'

    # awk is used instead of bc so the result does not silently become 0 when
    # bc is missing; validating the digits first keeps junk SMART fields out of
    # the arithmetic.
    if [[ "$sectors" =~ $positive_int ]]; then
        LC_ALL=C awk -v n="$sectors" 'BEGIN { printf "%.2f\n", n * 512 / 1e12 }'
    elif [[ "$raw_value" =~ $positive_int ]]; then
        # One unit is 32 MiB (32 * 1048576 bytes), not 32 MB.
        LC_ALL=C awk -v n="$raw_value" \
            'BEGIN { printf "%.2f\n", n * 32 * 1048576 / 1e12 }'
    else
        echo "0"
    fi
}

#######################################
# Estimate the rated write endurance of an SSD from its model and size.
# The figures are coarse per-class heuristics, not vendor specifications.
# Arguments:
#   $1 - model string (matched case-insensitively)
#   $2 - capacity in GB; a fractional part is dropped, anything non-numeric
#        is treated as 0
# Outputs:
#   Endurance in TB (integer) on stdout.
#######################################
estimate_ssd_endurance() {
    local disk_model=$1
    local capacity_gb=${2%%.*}

    # Anything that is not a plain integer counts as "capacity unknown".
    [[ "$capacity_gb" =~ ^[0-9]+$ ]] || capacity_gb=0
    capacity_gb=$((10#$capacity_gb))   # 10# keeps zero-padded input decimal

    if grep -Eqi 'ST600MP|SEAGATE.*SSD|SAS.*SSD' <<< "$disk_model"; then
        # Enterprise SAS SSDs - very high endurance
        if (( capacity_gb >= 1000 )); then
            echo "10000"   # 10PB for 1TB+ enterprise SAS SSD
        elif (( capacity_gb >= 600 )); then
            echo "6000"    # 6PB for 600GB enterprise SAS SSD
        elif (( capacity_gb >= 400 )); then
            echo "4000"    # 4PB for 400GB enterprise SAS SSD
        else
            echo "2000"    # 2PB for smaller enterprise SAS SSD
        fi
    elif grep -Eqi 'MTFDDAK|MICRON|INTEL|SAMSUNG|KIOXIA' <<< "$disk_model"; then
        # Enterprise SATA SSDs
        if (( capacity_gb >= 1000 )); then
            echo "1200"    # 1.2PB for 1TB enterprise
        elif (( capacity_gb >= 480 )); then
            echo "600"     # 600TB for 480GB enterprise
        elif (( capacity_gb >= 240 )); then
            echo "300"     # 300TB for 240GB enterprise
        else
            echo "150"     # 150TB for smaller enterprise
        fi
    else
        # Consumer SSDs
        if (( capacity_gb >= 1000 )); then
            echo "600"     # 600TB for 1TB consumer
        elif (( capacity_gb >= 480 )); then
            echo "300"     # 300TB for 480GB consumer
        elif (( capacity_gb >= 240 )); then
            echo "150"     # 150TB for 240GB consumer
        elif (( capacity_gb >= 120 )); then
            echo "80"      # 80TB for 120GB consumer
        else
            echo "40"      # 40TB for smaller drives
        fi
    fi
}

#######################################
# Estimate remaining SSD life.
# Globals:
#   RED, YELLOW, GREEN, NC (read) - colour escape sequences
# Arguments:
#   $1 - power-on hours; empty or zero yields the "Unknown" record
#   $2 - TB written so far (non-negative decimal; anything else counts as 0)
#   $3 - model string, forwarded to estimate_ssd_endurance
#   $4 - capacity in GB, forwarded to estimate_ssd_endurance
#   $5 - wear indicator value; used as the remaining-life percentage when it
#        is a positive integer
# Outputs:
#   One '|'-separated record on stdout:
#     <remaining %>|<remaining TBW>|<wear label>|<source>
#   where <source> is media_wearout, tbw or estimated. The "Unknown" record
#   keeps its historical shape "Unknown||Unknown||Unknown".
#######################################
estimate_ssd_lifespan() {
    local power_on_hours=$1
    local tbw_used=$2
    local disk_model=$3
    local capacity_gb=$4
    local media_wearout=$5
    local positive_int='^0*[1-9][0-9]*$'
    local decimal='^[0-9]*\.?[0-9]+$'

    # String tests instead of -eq: a non-numeric hours field must not raise an
    # arithmetic error.
    if [[ -z "$power_on_hours" || "$power_on_hours" =~ ^0+$ ]]; then
        echo "Unknown||Unknown||Unknown"
        return
    fi

    [[ "$tbw_used" =~ $decimal ]] || tbw_used=0

    local estimated_endurance
    estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")

    # Clamp at zero: a drive written past its estimated endurance has nothing
    # left, not a negative amount.
    local tbw_remaining
    tbw_remaining=$(LC_ALL=C awk -v e="$estimated_endurance" -v u="$tbw_used" \
        'BEGIN { r = e - u; if (r < 0) r = 0; printf "%.2f\n", r }')

    # If we have a media wearout indicator, prefer it over the TBW estimate.
    if [[ "$media_wearout" =~ $positive_int ]]; then
        local wear=$((10#$media_wearout))
        local color label
        if (( wear <= 10 )); then
            color=$RED
            label="Critical wear"
        elif (( wear <= 30 )); then
            color=$YELLOW
            label="High wear"
        elif (( wear <= 70 )); then
            color=$YELLOW
            label="Moderate wear"
        else
            color=$GREEN
            label="Healthy"
        fi
        # Report the measured value in every tier (the critical tier used to
        # print a fixed "10%").
        echo "${color}${wear}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${label}${NC}|media_wearout"
        return
    fi

    # tbw_used is a validated non-negative decimal, so it is positive exactly
    # when it contains a non-zero digit.
    if [[ "$tbw_used" =~ [1-9] ]]; then
        local lifespan_remaining tier
        read -r lifespan_remaining tier < <(
            LC_ALL=C awk -v e="$estimated_endurance" -v u="$tbw_used" 'BEGIN {
                used = (u + 0) * 100 / e
                left = 100 - used
                if (left < 0) left = 0
                tier = (used >= 80) ? "high" : ((used >= 50) ? "moderate" : "healthy")
                printf "%.1f %s\n", left, tier
            }'
        )

        case "$tier" in
            high)
                echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}High wear${NC}|tbw"
                ;;
            moderate)
                echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|tbw"
                ;;
            *)
                echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|tbw"
                ;;
        esac
    else
        echo "Unknown|${estimated_endurance} TB|New|estimated"
    fi
}

# Function to estimate HDD lifespan
#######################################
# Give a rough remaining-life verdict for a rotating disk.
# Globals:
#   RED, YELLOW, GREEN, NC (read) - colour escape sequences
# Arguments:
#   $1 - power-on hours (may be empty or non-numeric when unavailable)
#   $2 - reallocated sector count (non-numeric counts as 0)
#   $3 - pending sector count (non-numeric counts as 0)
# Outputs:
#   One line on stdout. Sector-based warnings are reported even when the
#   power-on counter is unavailable; "Unknown" is printed only when there is
#   neither a sector warning nor a usable hour count.
#######################################
estimate_hdd_lifespan() {
    local power_on_hours=$1
    local reallocated_sectors=$2
    local pending_sectors=$3
    local uint='^[0-9]+$'

    [[ "$reallocated_sectors" =~ $uint ]] || reallocated_sectors=0
    [[ "$pending_sectors" =~ $uint ]] || pending_sectors=0
    # 10# keeps zero-padded values from being read as octal.
    reallocated_sectors=$((10#$reallocated_sectors))
    pending_sectors=$((10#$pending_sectors))

    if (( pending_sectors > 0 )); then
        echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)"
    elif (( reallocated_sectors > 100 )); then
        echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)"
    elif (( reallocated_sectors > 10 )); then
        echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)"
    elif [[ ! "$power_on_hours" =~ $uint ]]; then
        echo "Unknown"
    else
        power_on_hours=$((10#$power_on_hours))
        if (( power_on_hours > 40000 )); then
            echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)"
        elif (( power_on_hours > 25000 )); then
            echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)"
        else
            echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)"
        fi
    fi
}

#######################################
# Print the SMART health report for a single disk.
# Globals:
#   RED, YELLOW, CYAN (read) - colour escape sequences
# Arguments:
#   $1 - device node, e.g. /dev/sda
#   $2 - smartctl device type such as megaraid,0; empty or the tag "direct"
#        means the device is queried without -d
# Outputs:
#   Human-readable report on stdout.
#######################################
check_disk() {
    local disk=$1
    local controller=$2

    # main() and detect_raid_disks() tag plain devices with "direct". That tag
    # is not a smartctl device type, so it must never reach `smartctl -d`.
    if [[ "$controller" == "direct" ]]; then
        controller=""
    fi

    print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
    echo "=================================================="

    # Test SMART access level
    local access_level
    access_level=$(test_smart_access "$disk" "$controller")

    case "$access_level" in
        "no_access")
            print_color "$RED" "ERROR: Cannot access disk through controller"
            echo "Possible reasons:"
            echo " - Controller doesn't support SMART passthrough"
            echo " - Disk is part of a hardware RAID array"
            echo " - Insufficient permissions (try running as root)"
            echo " - Controller busy or offline"
            echo ""
            return
            ;;
        "not_available")
            print_color "$YELLOW" "SMART not available on this disk"
            echo "This disk does not support SMART monitoring"
            echo ""
            return
            ;;
        "disabled")
            local enable_hint="smartctl -s on"
            [[ -n "$controller" ]] && enable_hint+=" -d $controller"
            enable_hint+=" $disk"
            print_color "$YELLOW" "SMART is disabled on this disk"
            echo "SMART is available but currently disabled"
            echo "To enable manually: $enable_hint"
            echo ""
            return
            ;;
        "no_attributes")
            print_color "$YELLOW" "WARNING: Cannot read SMART attributes"
            echo "This is common with hardware RAID controllers like PERC H730P"
            echo "Try checking through the RAID management interface"
            echo ""
            return
            ;;
        "limited_attributes")
            print_color "$YELLOW" "NOTE: Limited SMART data available"
            echo "Controller is filtering some SMART attributes"
            ;;
    esac

    # Get disk information. The fields are declared local so that `read` does
    # not leave them behind as globals for the next disk.
    local disk_info
    disk_info=$(get_disk_info "$disk" "$controller")
    local model serial capacity firmware health_status disk_type
    local power_on_hours reallocated_sectors pending_sectors
    local total_written host_writes_32mib media_wearout
    IFS='|' read -r model serial capacity firmware health_status disk_type \
        power_on_hours reallocated_sectors pending_sectors \
        total_written host_writes_32mib media_wearout <<< "$disk_info"

    # Display basic information
    echo "Model: ${model:-Unknown}"
    echo "Serial: ${serial:-Unknown}"
    echo "Type: $disk_type"
    echo "Capacity: ${capacity:-Unknown}"
    echo "Firmware: ${firmware:-Unknown}"
    echo "Health: ${health_status:-Unknown}"

    # Only show power on hours if available
    if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
        echo "Power On Hours: $power_on_hours"
    else
        echo "Power On Hours: Unknown"
    fi

    # Disk type specific analysis
    if [[ "$disk_type" == "SSD" ]]; then
        local tbw_used=0
        if [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" \
            && "$total_written" == "$host_writes_32mib" ]]; then
            # get_disk_info also matches Host_Writes_32MiB when filling
            # total_written; identical values mean the counter is in 32 MiB
            # units, not sectors.
            tbw_used=$(calculate_tbw "$host_writes_32mib" "")
        elif [[ -n "$total_written" && "$total_written" != "0" ]]; then
            tbw_used=$(calculate_tbw "" "$total_written")
        elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
            tbw_used=$(calculate_tbw "$host_writes_32mib" "")
        fi

        # Positive means: a non-negative decimal containing a non-zero digit.
        # Pure string tests, so no bc is required.
        local decimal='^[0-9]*\.?[0-9]+$'
        local tbw_positive=0
        if [[ "$tbw_used" =~ $decimal && "$tbw_used" =~ [1-9] ]]; then
            tbw_positive=1
        fi

        if (( tbw_positive )); then
            echo "TBW Used: ${tbw_used} TB"
        fi

        # Estimate capacity for endurance calculation.
        # get_disk_info cuts the "[480 GB]" suffix off, so normally only the
        # byte count is left (presumably like "480,103,981,056 bytes" - confirm
        # against smartctl output on the target hosts). A literal GB/TB figure
        # is still honoured if one is present.
        local capacity_gb=0
        local gb_re='([0-9]+)(\.[0-9]+)?[[:space:]]*[Gg][Bb]'
        local tb_re='([0-9]+(\.[0-9]+)?)[[:space:]]*[Tt][Bb]'
        if [[ "$capacity" =~ $gb_re ]]; then
            capacity_gb=${BASH_REMATCH[1]}
        elif [[ "$capacity" =~ $tb_re ]]; then
            capacity_gb=$(LC_ALL=C awk -v t="${BASH_REMATCH[1]}" \
                'BEGIN { printf "%d\n", t * 1000 }')
        else
            local capacity_bytes=${capacity%%bytes*}
            capacity_bytes=${capacity_bytes//[^0-9]/}
            # Length guard keeps the value inside 64-bit shell arithmetic.
            if [[ -n "$capacity_bytes" && ${#capacity_bytes} -le 18 ]]; then
                capacity_gb=$(( 10#$capacity_bytes / 1000000000 ))
            fi
        fi

        local lifespan_info
        lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" \
            "$model" "$capacity_gb" "$media_wearout")
        local lifespan_percent tbw_remaining wear_status wear_source _
        IFS='|' read -r lifespan_percent tbw_remaining wear_status wear_source _ \
            <<< "$lifespan_info"

        if [[ "$wear_source" != "media_wearout" ]] && (( tbw_positive )); then
            echo "TBW Remaining: $tbw_remaining"
        fi

        echo "Lifespan: $lifespan_percent ($wear_status)"

        # Show wear source if available
        case "$wear_source" in
            media_wearout) echo "Wear Source: Media Wearout Indicator" ;;
            tbw)           echo "Wear Source: TBW Calculation" ;;
            estimated)     echo "Wear Source: Estimated Endurance" ;;
        esac

    elif [[ "$disk_type" == "HDD" ]]; then
        if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
            echo "Realloc Sectors: $reallocated_sectors"
        fi
        if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
            echo "Pending Sectors: $pending_sectors"
        fi

        local lifespan
        lifespan=$(estimate_hdd_lifespan "$power_on_hours" \
            "${reallocated_sectors:-0}" "${pending_sectors:-0}")
        echo "Lifespan: $lifespan"
    else
        print_color "$YELLOW" "Limited information available for this disk type"
        echo "This is normal for hardware RAID configurations like PERC H730P"
        echo "For detailed SAS drive information, use controller management tools"
    fi

    echo ""
}

#######################################
# Discover disks and RAID pass-through targets.
# Globals:
#   BLUE, GREEN (read) - colour escape sequences
# Outputs:
#   stdout: ONE line of space-separated "<device>:<controller>" entries, where
#           <controller> is "direct" or a smartctl device type such as
#           megaraid,3. main() parses this line, so nothing else may be
#           written to stdout.
#   stderr: progress messages.
#######################################
detect_raid_disks() {
    local controllers=("megaraid" "cciss" "areca" "3ware" "hpt" "auto")
    local disks=()
    local disk controller i base_disk

    # Check for direct disks first - only main devices, no partitions
    for disk in /dev/sd[a-z]; do
        if [[ -b "$disk" ]]; then
            disks+=("$disk:direct")
        fi
    done

    # Check for NVMe disks - only main devices, no partitions
    for disk in /dev/nvme[0-9]n[0-9]; do
        if [[ -b "$disk" ]]; then
            disks+=("$disk:direct")
        fi
    done

    # Check for SAS disks directly via SCSI generic. sg nodes are character
    # devices, so a block-only test would never match them.
    for disk in /dev/sg[0-9]; do
        if [[ -b "$disk" || -c "$disk" ]]; then
            disks+=("$disk:direct")
        fi
    done

    # Check for RAID controllers with enhanced detection
    for controller in "${controllers[@]}"; do
        print_color "$BLUE" "Scanning for $controller controllers..." >&2
        for i in {0..31}; do
            # Try different disk devices for each controller
            for base_disk in "/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1"; do
                if [[ -b "$base_disk" || -c "$base_disk" ]]; then
                    if smartctl -d "$controller,$i" -i "$base_disk" &>/dev/null; then
                        disks+=("$base_disk:$controller,$i")
                        print_color "$GREEN" " Found $controller,$i on $base_disk" >&2
                        break
                    fi
                fi
            done
        done
    done

    # Special detection for PERC H730P. "perc-h730p" is not a smartctl device
    # type, so no pseudo entry is queued for it; only the notice is kept.
    print_color "$BLUE" "Scanning for PERC H730P controllers..." >&2
    if command_exists storcli; then
        print_color "$GREEN" " storcli detected - checking PERC H730P" >&2
    fi

    echo "${disks[@]}"
}

#######################################
# Entry point: check the disks named on the command line, or auto-detect.
# Globals:
#   VERSION, EUID, colour variables (read)
# Arguments:
#   $@ - optional device nodes (block devices or character nodes such as
#        /dev/sg0)
# Returns:
#   Exits 1 when smartctl is missing or no disk can be checked.
#######################################
main() {
    print_color "$BLUE" "Harvester OS Disk Health Check Script v$VERSION"
    print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
    print_color "$BLUE" "============================================"
    echo ""

    if ! command_exists smartctl; then
        print_color "$RED" "Error: smartctl is not installed. Please install smartmontools package."
        exit 1
    fi

    local disks=()
    local disk disk_info

    # If specific disk provided, check only that disk
    if [[ $# -gt 0 ]]; then
        for disk in "$@"; do
            # Character nodes are accepted as well: usage() advertises /dev/sg0.
            if [[ -b "$disk" || -c "$disk" ]]; then
                disks+=("$disk:direct")
            else
                print_color "$RED" "Error: $disk is not a valid block device"
            fi
        done
    else
        # Auto-detect disks
        print_color "$CYAN" "Auto-detecting disks and RAID controllers..."
        read -ra disks <<< "$(detect_raid_disks)"
    fi

    if [[ ${#disks[@]} -eq 0 ]]; then
        print_color "$RED" "No disks found or accessible"
        echo "Try running as root: sudo $0"
        exit 1
    fi

    print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
    echo ""

    # Check if running as root
    if [[ $EUID -ne 0 ]]; then
        print_color "$YELLOW" "Warning: Not running as root."
        print_color "$YELLOW" "Some disks/controllers may show limited information."
        echo ""
    fi

    # Check each disk. Split on the LAST colon: device paths may contain
    # colons themselves, controller tags never do.
    for disk_info in "${disks[@]}"; do
        check_disk "${disk_info%:*}" "${disk_info##*:}"
    done

    print_color "$BLUE" "Check completed!"
    echo ""
    print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
    print_color "$CYAN" " - Use 'storcli /c0 show all' for detailed information"
    print_color "$CYAN" " - Use 'storcli /c0/eall/sall show' for physical disk status"
    print_color "$CYAN" " - Hardware RAID controllers often limit SMART data access"
}

# Usage information
usage() {
    echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]"
    echo ""
    echo "If no disks specified, auto-detects all available disks and RAID arrays"
    echo ""
    echo "Examples:"
    echo " sudo $SCRIPT_NAME # Check all disks (recommended)"
    echo " $SCRIPT_NAME /dev/sda # Check specific disk"
    echo " $SCRIPT_NAME /dev/sg0 # Check SAS disk directly"
    echo " $SCRIPT_NAME /dev/sda /dev/sdb # Check multiple disks"
}

# Parse command line arguments
case "${1:-}" in
    -h|--help)
        usage
        exit 0
        ;;
    -v|--version)
        echo "$SCRIPT_NAME version $VERSION"
        exit 0
        ;;
    *)
        main "$@"
        ;;
esac
diff --git a/ubuntu-v2.5.sh b/ubuntu-v2.5.sh new file mode 100755 index 0000000..bb64ff9 --- /dev/null +++ b/ubuntu-v2.5.sh @@ -0,0 +1,615 @@ +#!/bin/bash + +# Disk Health Check Script for Ubuntu 24.04 +# Enhanced with SAS/PERC H730P controller support +# Checks SSD TBW/lifespan and HDD health status + +SCRIPT_NAME=$(basename "$0") +VERSION="2.5" + +# Color codes +RED=$(tput setaf 1)
+GREEN=$(tput setaf 2) +YELLOW=$(tput setaf 3) +BLUE=$(tput setaf 4) +CYAN=$(tput setaf 6) +NC=$(tput sgr0) + +# Function to print colored output +print_color() { + local color=$1 + local message=$2 + echo -e "${color}${message}${NC}" +} + +# Check if command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Check dependencies +check_dependencies() { + local missing=() + + if ! command_exists smartctl; then + missing+=("smartmontools") + fi + + if ! command_exists bc; then + missing+=("bc") + fi + + if [[ ${#missing[@]} -gt 0 ]]; then + print_color $RED "Error: Missing required packages: ${missing[*]}" + echo "Install with: sudo apt update && sudo apt install ${missing[*]}" + exit 1 + fi +} + +# Function to test SMART access and get available data - FIXED VERSION +test_smart_access() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + # Test basic SMART access + if ! $smart_cmd -i "$disk" &>/dev/null; then + echo "no_access" + return + fi + + # Get SMART information + local smart_info=$($smart_cmd -i "$disk" 2>/dev/null) + + # Check if SMART is available - FIXED PARSING + if ! echo "$smart_info" | grep -q "SMART support is:"; then + echo "not_available" + return + fi + + # Extract SMART status - FIXED LOGIC + local smart_support_line=$(echo "$smart_info" | grep "SMART support is:") + local smart_available=$(echo "$smart_support_line" | grep -q "Available" && echo "Available" || echo "") + local smart_enabled=$(echo "$smart_support_line" | grep -q "Enabled" && echo "Enabled" || echo "") + + if [[ -z "$smart_available" ]]; then + echo "not_available" + return + fi + + if [[ -z "$smart_enabled" ]]; then + echo "disabled" + return + fi + + # Test attribute reading + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + if [[ -z "$attributes" ]] || ! 
echo "$attributes" | grep -q "ATTRIBUTE_NAME"; then + echo "no_attributes" + return + fi + + echo "full_access" +} + +# Function to get disk information with enhanced SAS support +get_disk_info() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + local info=$($smart_cmd -i "$disk" 2>/dev/null) + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + local health=$($smart_cmd -H "$disk" 2>/dev/null) + + # Extract information with multiple fallbacks for SAS drives + local model=$(echo "$info" | grep -i "Device Model:\|Product:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + local vendor=$(echo "$info" | grep -i "Vendor:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model" + + local serial=$(echo "$info" | grep -i "Serial Number:\|Serial number:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local capacity=$(echo "$info" | grep -i "User Capacity:\|Total NVM Capacity:" | cut -d: -f2 | sed 's/^[ \t]*//' | cut -d'[' -f1 | head -1) + + local firmware=$(echo "$info" | grep -i "Firmware Version:\|Firmware revision:\|Revision:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local health_status=$(echo "$health" | grep -i "result:\|SMART overall-health" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -z "$health_status" ]] && health_status=$(echo "$health" | grep -i "SMART overall-health" | awk -F'[' '{print $2}' | cut -d']' -f1) + + # Get disk type with SAS support + local disk_type="UNKNOWN" + if echo "$info" | grep -qi "Solid State Device"; then + disk_type="SSD" + elif echo "$info" | grep -qi "Rotation Rate"; then + disk_type="HDD" + elif echo "$info" | grep -qi "SCSI\|SAS"; then + # SAS drives often don't specify, check rotation rate + if echo "$info" | grep -qi "15000\|10000\|7200"; then + disk_type="HDD" + else + disk_type="SSD" + fi + fi + + # Extract SMART attributes with multiple field attempts + local power_on_hours=$(echo 
"$attributes" | grep -i "Power_On_Hours" | awk '{print $10}' | head -1) + + local reallocated_sectors=$(echo "$attributes" | grep -i "Reallocated_Sector_Ct" | awk '{print $10}' | head -1) + + local pending_sectors=$(echo "$attributes" | grep -i "Current_Pending_Sector" | awk '{print $10}' | head -1) + + # For Kingston and other SSDs with different attribute names + local total_written=$(echo "$attributes" | grep -i "Total_LBAs_Written\|Lifetime_Writes_GiB\|Host_Writes_32MiB\|Flash_Writes_GiB" | awk '{print $10}' | head -1) + local host_writes_32mib=$(echo "$attributes" | grep -i "Host_Writes_32MiB" | awk '{print $10}' | head -1) + + # For wear leveling indicators + local media_wearout=$(echo "$attributes" | grep -i "Media_Wearout_Indicator\|Wear_Leveling_Count\|SSD_Life_Left" | awk '{print $10}' | head -1) + + echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout" +} + +# Function to calculate TBW for SSD - ENHANCED FOR KINGSTON +calculate_tbw() { + local raw_value=$1 + local sectors=$2 + local disk_model=$3 + + # Kingston SSDs use Lifetime_Writes_GiB and Flash_Writes_GiB + if echo "$disk_model" | grep -qi "KINGSTON"; then + if [[ -n "$raw_value" && "$raw_value" != "0" ]]; then + # Convert from GiB to TB + local tbw=$(echo "scale=2; $raw_value / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + return + fi + fi + + if [[ -n "$sectors" && "$sectors" != "0" ]]; then + local bytes=$((sectors * 512)) + local tbw=$(echo "scale=2; $bytes / 1000 / 1000 / 1000 / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + elif [[ -n "$raw_value" && "$raw_value" != "0" ]]; then + local tbw=$(echo "scale=2; $raw_value * 32 / 1000 / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + else + echo "0" + fi +} + +# Function to estimate SSD endurance based on model and capacity +estimate_ssd_endurance() { + local disk_model=$1 + local capacity_gb=$2 + + # Kingston consumer 
SSDs + if echo "$disk_model" | grep -qi "KINGSTON.*SA400"; then + if [[ $capacity_gb -ge 960 ]]; then + echo "300" # 300TB for 960GB Kingston SA400 + elif [[ $capacity_gb -ge 480 ]]; then + echo "150" # 150TB for 480GB Kingston + else + echo "80" # 80TB for smaller Kingston + fi + # SAS SSDs typically have very high endurance + elif echo "$disk_model" | grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD"; then + # Enterprise SAS SSDs - very high endurance + if [[ $capacity_gb -ge 1000 ]]; then + echo "10000" # 10PB for 1TB+ enterprise SAS SSD + elif [[ $capacity_gb -ge 600 ]]; then + echo "6000" # 6PB for 600GB enterprise SAS SSD + elif [[ $capacity_gb -ge 400 ]]; then + echo "4000" # 4PB for 400GB enterprise SAS SSD + else + echo "2000" # 2PB for smaller enterprise SAS SSD + fi + elif echo "$disk_model" | grep -qi "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA\|WDC\|WESTERN DIGITAL"; then + # Enterprise SATA/NVMe SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "1200" # 1.2PB for 1TB enterprise + elif [[ $capacity_gb -ge 480 ]]; then + echo "600" # 600TB for 480GB enterprise + elif [[ $capacity_gb -ge 240 ]]; then + echo "300" # 300TB for 240GB enterprise + else + echo "150" # 150TB for smaller enterprise + fi + else + # Consumer SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "600" # 600TB for 1TB consumer + elif [[ $capacity_gb -ge 480 ]]; then + echo "300" # 300TB for 480GB consumer + elif [[ $capacity_gb -ge 240 ]]; then + echo "150" # 150TB for 240GB consumer + elif [[ $capacity_gb -ge 120 ]]; then + echo "80" # 80TB for 120GB consumer + else + echo "40" # 40TB for smaller drives + fi + fi +} + +# Function to estimate SSD lifespan with TBW remaining +estimate_ssd_lifespan() { + local power_on_hours=$1 + local tbw_used=$2 + local disk_model=$3 + local capacity_gb=$4 + local media_wearout=$5 + + if [[ -z "$power_on_hours" || "$power_on_hours" -eq 0 ]]; then + echo "Unknown||Unknown||Unknown" + return + fi + + local estimated_endurance=$(estimate_ssd_endurance 
"$disk_model" "$capacity_gb") + local tbw_remaining=$(echo "scale=2; $estimated_endurance - $tbw_used" | bc 2>/dev/null || echo "0") + + # If we have media wearout indicator, use it for more accurate estimation + if [[ -n "$media_wearout" && "$media_wearout" != "0" ]]; then + # For Kingston, SSD_Life_Left is already a percentage + if echo "$disk_model" | grep -qi "KINGSTON"; then + if [[ $media_wearout -le 10 ]]; then + echo "${RED}${media_wearout}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}Critical wear${NC}|media_wearout" + elif [[ $media_wearout -le 30 ]]; then + echo "${YELLOW}${media_wearout}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}High wear${NC}|media_wearout" + elif [[ $media_wearout -le 70 ]]; then + echo "${YELLOW}${media_wearout}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|media_wearout" + else + echo "${GREEN}${media_wearout}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|media_wearout" + fi + else + # For other drives, media_wearout might be countdown from 100 + local wear_percent=$media_wearout + if [[ $media_wearout -le 10 ]]; then + echo "${RED}${wear_percent}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}Critical wear${NC}|media_wearout" + elif [[ $media_wearout -le 30 ]]; then + echo "${YELLOW}${wear_percent}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}High wear${NC}|media_wearout" + elif [[ $media_wearout -le 70 ]]; then + echo "${YELLOW}${wear_percent}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|media_wearout" + else + echo "${GREEN}${wear_percent}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|media_wearout" + fi + fi + return + fi + + if [[ $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then + local lifespan_used=$(echo "scale=1; $tbw_used * 100 / $estimated_endurance" | bc 2>/dev/null || echo "0") + local lifespan_remaining=$(echo "scale=1; 100 - $lifespan_used" | bc 2>/dev/null || echo "100") + + if [[ $(echo "$lifespan_used >= 80" | bc 2>/dev/null) -eq 1 
]]; then + echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}High wear${NC}|tbw" + elif [[ $(echo "$lifespan_used >= 50" | bc 2>/dev/null) -eq 1 ]]; then + echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|tbw" + else + echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|tbw" + fi + else + echo "Unknown|${estimated_endurance} TB|New|estimated" + fi +} + +# Function to estimate HDD lifespan +estimate_hdd_lifespan() { + local power_on_hours=$1 + local reallocated_sectors=$2 + local pending_sectors=$3 + + if [[ -z "$power_on_hours" ]]; then + echo "Unknown" + return + fi + + power_on_hours=${power_on_hours:-0} + reallocated_sectors=${reallocated_sectors:-0} + pending_sectors=${pending_sectors:-0} + + if [[ "$pending_sectors" -gt 0 ]]; then + echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)" + elif [[ "$reallocated_sectors" -gt 100 ]]; then + echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)" + elif [[ "$reallocated_sectors" -gt 10 ]]; then + echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)" + elif [[ "$power_on_hours" -gt 40000 ]]; then + echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)" + elif [[ "$power_on_hours" -gt 25000 ]]; then + echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)" + else + echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)" + fi +} + +# Function to check a single disk with enhanced error handling +check_disk() { + local disk=$1 + local controller=$2 + + print_color $CYAN "Checking disk: $disk (Controller: ${controller:-direct})" + echo "==================================================" + + # Test SMART access level + local access_level=$(test_smart_access "$disk" "$controller") + + case $access_level in + "no_access") + print_color $RED "ERROR: Cannot access disk through controller" + echo "Possible 
reasons:" + echo " - Controller doesn't support SMART passthrough" + echo " - Disk is part of a hardware RAID array" + echo " - Insufficient permissions (try running as root)" + echo " - Controller busy or offline" + echo "" + return + ;; + "not_available") + print_color $YELLOW "SMART not available on this disk" + echo "This disk does not support SMART monitoring" + echo "" + return + ;; + "disabled") + print_color $YELLOW "SMART is disabled on this disk" + echo "SMART is available but currently disabled" + echo "To enable manually: smartctl -s on ${controller:+-d $controller} $disk" + echo "" + return + ;; + "no_attributes") + print_color $YELLOW "WARNING: Cannot read SMART attributes" + echo "This is common with hardware RAID controllers like PERC H730P" + echo "Try checking through the RAID management interface" + echo "" + return + ;; + "limited_attributes") + print_color $YELLOW "NOTE: Limited SMART data available" + echo "Controller is filtering some SMART attributes" + ;; + esac + + # Get disk information + local disk_info=$(get_disk_info "$disk" "$controller") + IFS='|' read -r model serial capacity firmware health_status disk_type power_on_hours reallocated_sectors pending_sectors total_written host_writes_32mib media_wearout <<< "$disk_info" + + # Display basic information + echo "Model: ${model:-Unknown}" + echo "Serial: ${serial:-Unknown}" + echo "Type: $disk_type" + echo "Capacity: ${capacity:-Unknown}" + echo "Firmware: ${firmware:-Unknown}" + echo "Health: ${health_status:-Unknown}" + + # Only show power on hours if available + if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then + echo "Power On Hours: $power_on_hours" + else + echo "Power On Hours: Unknown" + fi + + # Disk type specific analysis + if [[ "$disk_type" == "SSD" ]]; then + local tbw_used=0 + if [[ -n "$total_written" && "$total_written" != "0" ]]; then + tbw_used=$(calculate_tbw "" "$total_written" "$model") + elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" 
]]; then + tbw_used=$(calculate_tbw "$host_writes_32mib" "" "$model") + fi + + if [[ $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then + echo "TBW Used: ${tbw_used} TB" + fi + + # Estimate capacity for endurance calculation + local capacity_gb=0 + if echo "$capacity" | grep -qi "GB"; then + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | cut -d. -f1) + elif echo "$capacity" | grep -qi "TB"; then + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | awk '{print $1 * 1000}' | bc 2>/dev/null | cut -d. -f1) + fi + + local lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$model" "$capacity_gb" "$media_wearout") + local lifespan_percent=$(echo "$lifespan_info" | cut -d'|' -f1) + local tbw_remaining=$(echo "$lifespan_info" | cut -d'|' -f2) + local wear_status=$(echo "$lifespan_info" | cut -d'|' -f3) + local wear_source=$(echo "$lifespan_info" | cut -d'|' -f4) + + if [[ "$wear_source" != "media_wearout" && $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then + echo "TBW Remaining: $tbw_remaining" + fi + + echo "Lifespan: $lifespan_percent ($wear_status)" + + # Show wear source if available + if [[ "$wear_source" == "media_wearout" ]]; then + echo "Wear Source: Media Wearout Indicator" + elif [[ "$wear_source" == "tbw" ]]; then + echo "Wear Source: TBW Calculation" + elif [[ "$wear_source" == "estimated" ]]; then + echo "Wear Source: Estimated Endurance" + fi + + elif [[ "$disk_type" == "HDD" ]]; then + if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then + echo "Realloc Sectors: $reallocated_sectors" + fi + if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then + echo "Pending Sectors: $pending_sectors" + fi + + local lifespan=$(estimate_hdd_lifespan "$power_on_hours" "${reallocated_sectors:-0}" "${pending_sectors:-0}") + echo "Lifespan: $lifespan" + else + print_color $YELLOW "Limited information available for this disk type" + echo "This is normal for hardware RAID configurations like 
PERC H730P" + echo "For detailed SAS drive information, use controller management tools" + fi + + echo "" +} + +# Function to detect all disks with enhanced SAS support (no partitions) - FIXED +detect_disks() { + local disks=() + + # Check for SATA/SAS disks - only main devices, no partitions + for disk in /dev/sd[a-z]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + # Check for NVMe disks - only main devices, no partitions + for disk in /dev/nvme[0-9]n[0-9]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + # Check for SAS disks via SCSI generic - only main devices + for disk in /dev/sg[0-9]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + # Check for other disk types - only main devices + for disk in /dev/vd[a-z] /dev/xvd[a-z]; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + fi + done + + echo "${disks[@]}" +} + +# Function to detect RAID controllers (Ubuntu specific) - FIXED +detect_raid_controllers() { + local controllers=("megaraid" "cciss" "areca" "3ware" "hpt") + local raid_disks=() + + # Check for RAID controllers + for controller in "${controllers[@]}"; do + for i in {0..31}; do + # Try different disk devices for each controller + for base_disk in "/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1"; do + if [[ -b "$base_disk" ]]; then + if smartctl -d "$controller,$i" -i "$base_disk" &>/dev/null; then + raid_disks+=("$base_disk:$controller,$i") + break + fi + fi + done + done + done + + echo "${raid_disks[@]}" +} + +# Main function - FIXED +main() { + print_color $BLUE "Ubuntu 24.04 Disk Health Check Script v$VERSION" + print_color $BLUE "Enhanced with PERC H730P and SAS Support" + print_color $BLUE "============================================" + echo "" + + check_dependencies + + local disks=() + + # If specific disk provided, check only that disk + if [[ $# -gt 0 ]]; then + for disk in "$@"; do + if [[ -b "$disk" ]]; then + disks+=("$disk") + else + print_color $RED "Error: $disk is not a valid 
block device" + fi + done + else + # Auto-detect disks - FIXED: don't mix output with disk detection + print_color $CYAN "Auto-detecting disks (excluding partitions)..." + local direct_disks=() + read -ra direct_disks <<< "$(detect_disks)" + + print_color $CYAN "Scanning for RAID controllers..." + local raid_disks=() + read -ra raid_disks <<< "$(detect_raid_controllers)" + + # Combine both lists + disks=("${direct_disks[@]}" "${raid_disks[@]}") + fi + + if [[ ${#disks[@]} -eq 0 ]]; then + print_color $RED "No disks found or accessible" + echo "Try running as root or specifying disk paths manually" + exit 1 + fi + + print_color $GREEN "Found ${#disks[@]} disk(s) to check" + echo "" + + # Check if running as root, warn if not + if [[ $EUID -ne 0 ]]; then + print_color $YELLOW "Warning: Not running as root." + print_color $YELLOW "Some disks/controllers may show limited information." + echo "For complete results, run as: sudo $0" + echo "" + fi + + # Check each disk + for disk_info in "${disks[@]}"; do + # Check if this is a RAID disk (has controller specified) + if [[ "$disk_info" == *":"* ]]; then + IFS=':' read -r disk controller <<< "$disk_info" + check_disk "$disk" "$controller" + else + check_disk "$disk_info" + fi + done + + print_color $BLUE "Check completed!" 
+ echo "" + print_color $CYAN "Note: For PERC H730P controllers with SAS drives:" + print_color $CYAN " - Install 'storcli' for detailed controller information" + print_color $CYAN " - Use 'smartctl -d sat /dev/sgX' to try direct access" + print_color $CYAN " - Hardware RAID controllers often limit SMART data access" + echo "" + print_color $CYAN "Ubuntu-specific tips:" + print_color $CYAN " - Use 'lsblk' to see all available block devices" + print_color $CYAN " - Use 'lshw -class disk' for detailed disk information" +} + +# Usage information +usage() { + echo "Usage: $SCRIPT_NAME [DISK1 DISK2 ...]" + echo "" + echo "If no disks specified, auto-detects all available disks" + echo "" + echo "Examples:" + echo " $SCRIPT_NAME # Check all auto-detected disks" + echo " sudo $SCRIPT_NAME # Check all disks (as root)" + echo " $SCRIPT_NAME /dev/sda # Check specific disk" + echo " $SCRIPT_NAME /dev/nvme0n1 # Check NVMe disk" + echo " $SCRIPT_NAME /dev/sg0 # Check SAS disk directly" + echo " $SCRIPT_NAME /dev/sda /dev/nvme0n1 # Check multiple disks" +} + +# Parse command line arguments +case "${1:-}" in + -h|--help) + usage + exit 0 + ;; + -v|--version) + echo "$SCRIPT_NAME version $VERSION" + exit 0 + ;; + *) + main "$@" + ;; +esac diff --git a/ubuntu-v2.6.sh b/ubuntu-v2.6.sh new file mode 100755 index 0000000..44ad14b --- /dev/null +++ b/ubuntu-v2.6.sh @@ -0,0 +1,650 @@ +#!/bin/bash + +# Disk Health Check Script for Ubuntu 24.04 +# Enhanced with SAS/PERC H730P controller support +# Checks SSD TBW/lifespan and HDD health status + +SCRIPT_NAME=$(basename "$0") +VERSION="2.6" + +# Color codes +RED=$(tput setaf 1) +GREEN=$(tput setaf 2) +YELLOW=$(tput setaf 3) +BLUE=$(tput setaf 4) +CYAN=$(tput setaf 6) +NC=$(tput sgr0) + +# Function to print colored output +print_color() { + local color=$1 + local message=$2 + echo -e "${color}${message}${NC}" +} + +# Check if command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Check dependencies +check_dependencies() 
{ + local missing=() + + if ! command_exists smartctl; then + missing+=("smartmontools") + fi + + if ! command_exists bc; then + missing+=("bc") + fi + + if [[ ${#missing[@]} -gt 0 ]]; then + print_color $RED "Error: Missing required packages: ${missing[*]}" + echo "Install with: sudo apt update && sudo apt install ${missing[*]}" + exit 1 + fi +} + +# Function to test SMART access and get available data - ENHANCED FOR NVMe +test_smart_access() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + # Test basic SMART access + if ! $smart_cmd -i "$disk" &>/dev/null; then + echo "no_access" + return + fi + + # Get SMART information + local smart_info=$($smart_cmd -i "$disk" 2>/dev/null) + + # Check if this is an NVMe drive + if echo "$smart_info" | grep -qi "NVMe"; then + # NVMe drives have different SMART implementation + if $smart_cmd -H "$disk" &>/dev/null; then + echo "full_access" + else + echo "no_attributes" + fi + return + fi + + # Check if SMART is available for SATA/SAS + if ! echo "$smart_info" | grep -q "SMART support is:"; then + echo "not_available" + return + fi + + # Extract SMART status + local smart_support_line=$(echo "$smart_info" | grep "SMART support is:") + local smart_available=$(echo "$smart_support_line" | grep -q "Available" && echo "Available" || echo "") + local smart_enabled=$(echo "$smart_support_line" | grep -q "Enabled" && echo "Enabled" || echo "") + + if [[ -z "$smart_available" ]]; then + echo "not_available" + return + fi + + if [[ -z "$smart_enabled" ]]; then + echo "disabled" + return + fi + + # Test attribute reading + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + if [[ -z "$attributes" ]] || ! 
echo "$attributes" | grep -q "ATTRIBUTE_NAME"; then + echo "no_attributes" + return + fi + + echo "full_access" +} + +# Function to get disk information with enhanced SAS and NVMe support +get_disk_info() { + local disk=$1 + local controller=$2 + + local smart_cmd="smartctl" + [[ -n "$controller" ]] && smart_cmd+=" -d $controller" + + local info=$($smart_cmd -i "$disk" 2>/dev/null) + local attributes=$($smart_cmd -A "$disk" 2>/dev/null) + local health=$($smart_cmd -H "$disk" 2>/dev/null) + + # Extract information with multiple fallbacks + local model=$(echo "$info" | grep -i "Device Model:\|Product:\|Model Number:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + local vendor=$(echo "$info" | grep -i "Vendor:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model" + + local serial=$(echo "$info" | grep -i "Serial Number:\|Serial number:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local capacity=$(echo "$info" | grep -i "User Capacity:\|Total NVM Capacity:\|Namespace 1 Size/Capacity:" | cut -d: -f2 | sed 's/^[ \t]*//' | cut -d'[' -f1 | head -1) + + local firmware=$(echo "$info" | grep -i "Firmware Version:\|Firmware revision:\|Revision:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + + local health_status=$(echo "$health" | grep -i "result:\|SMART overall-health\|Health Status:" | cut -d: -f2 | sed 's/^[ \t]*//' | head -1) + [[ -z "$health_status" ]] && health_status=$(echo "$health" | grep -i "SMART overall-health" | awk -F'[' '{print $2}' | cut -d']' -f1) + + # Get disk type + local disk_type="UNKNOWN" + if echo "$info" | grep -qi "Solid State Device\|NVMe"; then + disk_type="SSD" + elif echo "$info" | grep -qi "Rotation Rate"; then + disk_type="HDD" + elif echo "$info" | grep -qi "SCSI\|SAS"; then + if echo "$info" | grep -qi "15000\|10000\|7200"; then + disk_type="HDD" + else + disk_type="SSD" + fi + fi + + # Extract SMART attributes with multiple field attempts + local power_on_hours=$(echo "$attributes" | 
grep -i "Power_On_Hours" | awk '{print $10}' | head -1 | sed 's/[^0-9]//g') + + local reallocated_sectors=$(echo "$attributes" | grep -i "Reallocated_Sector_Ct" | awk '{print $10}' | head -1) + + local pending_sectors=$(echo "$attributes" | grep -i "Current_Pending_Sector" | awk '{print $10}' | head -1) + + # For Kingston and other SSDs with different attribute names + local total_written=$(echo "$attributes" | grep -i "Total_LBAs_Written\|Lifetime_Writes_GiB\|Host_Writes_32MiB\|Flash_Writes_GiB\|Data Units Written" | awk '{print $10}' | head -1) + local host_writes_32mib=$(echo "$attributes" | grep -i "Host_Writes_32MiB" | awk '{print $10}' | head -1) + + # For wear leveling indicators + local media_wearout=$(echo "$attributes" | grep -i "Media_Wearout_Indicator\|Wear_Leveling_Count\|SSD_Life_Left\|Percentage Used\|Available Spare" | awk '{print $10}' | head -1) + + echo "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout" +} + +# Function to calculate TBW for SSD - ENHANCED FOR KINGSTON AND NVMe +calculate_tbw() { + local raw_value=$1 + local sectors=$2 + local disk_model=$3 + local attribute_name=$4 + + # Kingston SSDs use Lifetime_Writes_GiB and Flash_Writes_GiB + if echo "$disk_model" | grep -qi "KINGSTON"; then + if [[ -n "$raw_value" && "$raw_value" != "0" ]]; then + # Convert from GiB to TB + local tbw=$(echo "scale=2; $raw_value / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + return + fi + fi + + # NVMe drives use Data Units Written (1 unit = 1,000,000 bytes for NVMe 1.0+, 512,000 bytes for older) + if echo "$attribute_name" | grep -qi "Data Units Written"; then + if [[ -n "$raw_value" && "$raw_value" != "0" ]]; then + # Convert from data units to TB (assuming 1,000,000 bytes per unit) + local bytes=$(echo "$raw_value * 1000000" | bc 2>/dev/null) + local tbw=$(echo "scale=2; $bytes / 1000 / 1000 / 1000 / 1000" | bc 2>/dev/null || echo "0") 
+ echo "$tbw" + return + fi + fi + + if [[ -n "$sectors" && "$sectors" != "0" ]]; then + local bytes=$((sectors * 512)) + local tbw=$(echo "scale=2; $bytes / 1000 / 1000 / 1000 / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + elif [[ -n "$raw_value" && "$raw_value" != "0" ]]; then + local tbw=$(echo "scale=2; $raw_value * 32 / 1000 / 1000" | bc 2>/dev/null || echo "0") + echo "$tbw" + else + echo "0" + fi +} + +# Function to estimate SSD endurance based on model and capacity +estimate_ssd_endurance() { + local disk_model=$1 + local capacity_gb=$2 + + # Kingston consumer SSDs + if echo "$disk_model" | grep -qi "KINGSTON.*SA400"; then + if [[ $capacity_gb -ge 960 ]]; then + echo "300" # 300TB for 960GB Kingston SA400 + elif [[ $capacity_gb -ge 480 ]]; then + echo "150" # 150TB for 480GB Kingston + else + echo "80" # 80TB for smaller Kingston + fi + # NVMe SSDs typically have higher endurance + elif echo "$disk_model" | grep -qi "NVMe"; then + if [[ $capacity_gb -ge 2000 ]]; then + echo "1200" # 1.2PB for 2TB+ NVMe + elif [[ $capacity_gb -ge 1000 ]]; then + echo "600" # 600TB for 1TB NVMe + elif [[ $capacity_gb -ge 500 ]]; then + echo "300" # 300TB for 500GB NVMe + else + echo "150" # 150TB for smaller NVMe + fi + # SAS SSDs typically have very high endurance + elif echo "$disk_model" | grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD"; then + if [[ $capacity_gb -ge 1000 ]]; then + echo "10000" # 10PB for 1TB+ enterprise SAS SSD + elif [[ $capacity_gb -ge 600 ]]; then + echo "6000" # 6PB for 600GB enterprise SAS SSD + elif [[ $capacity_gb -ge 400 ]]; then + echo "4000" # 4PB for 400GB enterprise SAS SSD + else + echo "2000" # 2PB for smaller enterprise SAS SSD + fi + elif echo "$disk_model" | grep -qi "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA\|WDC\|WESTERN DIGITAL"; then + # Enterprise SATA/NVMe SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "1200" # 1.2PB for 1TB enterprise + elif [[ $capacity_gb -ge 480 ]]; then + echo "600" # 600TB for 480GB enterprise + elif [[ 
$capacity_gb -ge 240 ]]; then + echo "300" # 300TB for 240GB enterprise + else + echo "150" # 150TB for smaller enterprise + fi + else + # Consumer SSDs + if [[ $capacity_gb -ge 1000 ]]; then + echo "600" # 600TB for 1TB consumer + elif [[ $capacity_gb -ge 480 ]]; then + echo "300" # 300TB for 480GB consumer + elif [[ $capacity_gb -ge 240 ]]; then + echo "150" # 150TB for 240GB consumer + elif [[ $capacity_gb -ge 120 ]]; then + echo "80" # 80TB for 120GB consumer + else + echo "40" # 40TB for smaller drives + fi + fi +} + +# Function to estimate SSD lifespan with TBW remaining - ENHANCED +estimate_ssd_lifespan() { + local power_on_hours=$1 + local tbw_used=$2 + local disk_model=$3 + local capacity_gb=$4 + local media_wearout=$5 + + if [[ -z "$power_on_hours" || "$power_on_hours" -eq 0 ]]; then + echo "Unknown||Unknown||Unknown" + return + fi + + local estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb") + local tbw_remaining=$(echo "scale=2; $estimated_endurance - $tbw_used" | bc 2>/dev/null || echo "0") + + # If we have media wearout indicator, use it for more accurate estimation + if [[ -n "$media_wearout" && "$media_wearout" != "0" ]]; then + # For Kingston, SSD_Life_Left is already a percentage + if echo "$disk_model" | grep -qi "KINGSTON"; then + if [[ $media_wearout -le 10 ]]; then + echo "${RED}${media_wearout}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}Critical wear${NC}|media_wearout" + elif [[ $media_wearout -le 30 ]]; then + echo "${YELLOW}${media_wearout}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}High wear${NC}|media_wearout" + elif [[ $media_wearout -le 70 ]]; then + echo "${YELLOW}${media_wearout}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|media_wearout" + else + echo "${GREEN}${media_wearout}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|media_wearout" + fi + else + # For other drives, media_wearout might be countdown from 100 + local wear_percent=$media_wearout + if [[ 
$media_wearout -le 10 ]]; then + echo "${RED}${wear_percent}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}Critical wear${NC}|media_wearout" + elif [[ $media_wearout -le 30 ]]; then + echo "${YELLOW}${wear_percent}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}High wear${NC}|media_wearout" + elif [[ $media_wearout -le 70 ]]; then + echo "${YELLOW}${wear_percent}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|media_wearout" + else + echo "${GREEN}${wear_percent}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|media_wearout" + fi + fi + return + fi + + if [[ $(echo "$tbw_used > 0" | bc 2>/dev/null) -eq 1 ]]; then + local lifespan_used=$(echo "scale=1; $tbw_used * 100 / $estimated_endurance" | bc 2>/dev/null || echo "0") + local lifespan_remaining=$(echo "scale=1; 100 - $lifespan_used" | bc 2>/dev/null || echo "100") + + if [[ $(echo "$lifespan_used >= 80" | bc 2>/dev/null) -eq 1 ]]; then + echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}High wear${NC}|tbw" + elif [[ $(echo "$lifespan_used >= 50" | bc 2>/dev/null) -eq 1 ]]; then + echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|tbw" + else + echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|tbw" + fi + else + echo "Unknown|${estimated_endurance} TB|New|estimated" + fi +} + +# Function to estimate HDD lifespan - FIXED POWER_ON_HOURS PARSING +estimate_hdd_lifespan() { + local power_on_hours=$1 + local reallocated_sectors=$2 + local pending_sectors=$3 + + # Clean power_on_hours to extract just the numeric part + local clean_hours=$(echo "$power_on_hours" | sed 's/[^0-9].*//') + clean_hours=${clean_hours:-0} + + if [[ -z "$clean_hours" || "$clean_hours" -eq 0 ]]; then + echo "Unknown" + return + fi + + reallocated_sectors=${reallocated_sectors:-0} + pending_sectors=${pending_sectors:-0} + + if [[ "$pending_sectors" -gt 0 ]]; then + echo "${RED}CRITICAL${NC} 
(Pending sectors: $pending_sectors)" + elif [[ "$reallocated_sectors" -gt 100 ]]; then + echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)" + elif [[ "$reallocated_sectors" -gt 10 ]]; then + echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)" + elif [[ "$clean_hours" -gt 40000 ]]; then + echo "${YELLOW}1-2 years${NC} (High usage: $clean_hours hours)" + elif [[ "$clean_hours" -gt 25000 ]]; then + echo "${GREEN}2-3 years${NC} (Moderate usage: $clean_hours hours)" + else + echo "${GREEN}> 3 years${NC} (Low usage: $clean_hours hours)" + fi +} + +# Function to check a single disk with enhanced error handling +check_disk() { + local disk=$1 + local controller=$2 + + print_color $CYAN "Checking disk: $disk (Controller: ${controller:-direct})" + echo "==================================================" + + # Test SMART access level + local access_level=$(test_smart_access "$disk" "$controller") + + case $access_level in + "no_access") + print_color $RED "ERROR: Cannot access disk through controller" + echo "Possible reasons:" + echo " - Controller doesn't support SMART passthrough" + echo " - Disk is part of a hardware RAID array" + echo " - Insufficient permissions (try running as root)" + echo " - Controller busy or offline" + echo "" + return + ;; + "not_available") + print_color $YELLOW "SMART not available on this disk" + echo "This disk does not support SMART monitoring" + echo "" + return + ;; + "disabled") + print_color $YELLOW "SMART is disabled on this disk" + echo "SMART is available but currently disabled" + echo "To enable manually: smartctl -s on ${controller:+-d $controller} $disk" + echo "" + return + ;; + "no_attributes") + print_color $YELLOW "WARNING: Cannot read SMART attributes" + echo "This is common with hardware RAID controllers like PERC H730P" + echo "Try checking through the RAID management interface" + echo "" + return + ;; + "limited_attributes") + print_color $YELLOW "NOTE: Limited SMART data 
available" + echo "Controller is filtering some SMART attributes" + ;; + esac + + # Get disk information + local disk_info=$(get_disk_info "$disk" "$controller") + IFS='|' read -r model serial capacity firmware health_status disk_type power_on_hours reallocated_sectors pending_sectors total_written host_writes_32mib media_wearout <<< "$disk_info" + + # Display basic information + echo "Model: ${model:-Unknown}" + echo "Serial: ${serial:-Unknown}" + echo "Type: $disk_type" + echo "Capacity: ${capacity:-Unknown}" + echo "Firmware: ${firmware:-Unknown}" + echo "Health: ${health_status:-Unknown}" + + # Only show power on hours if available + if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then + echo "Power On Hours: $power_on_hours" + else + echo "Power On Hours: Unknown" + fi + + # Disk type specific analysis + if [[ "$disk_type" == "SSD" ]]; then + # Get the actual attribute name for TBW calculation + local attributes=$(smartctl -A "$disk" 2>/dev/null) + local tbw_attribute_name=$(echo "$attributes" | grep -i "Lifetime_Writes_GiB\|Flash_Writes_GiB\|Data Units Written" | head -1 | awk '{print $2}') + + local tbw_used=0 + if [[ -n "$total_written" && "$total_written" != "0" ]]; then + tbw_used=$(calculate_tbw "" "$total_written" "$model" "$tbw_attribute_name") + elif [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then + tbw_used=$(calculate_tbw "$host_writes_32mib" "" "$model" "$tbw_attribute_name") + fi + + # Always show TBW information for SSDs + echo "TBW Used: ${tbw_used} TB" + + # Estimate capacity for endurance calculation + local capacity_gb=0 + if echo "$capacity" | grep -qi "GB"; then + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | cut -d. -f1) + elif echo "$capacity" | grep -qi "TB"; then + capacity_gb=$(echo "$capacity" | grep -o '[0-9.]*' | head -1 | awk '{print $1 * 1000}' | bc 2>/dev/null | cut -d. 
# Function to detect all disks with enhanced SAS support (no partitions)
#
# Scans /dev for whole-disk nodes and reports them in this order: SATA/SAS
# disks (sdX), NVMe namespaces (nvmeXnY), SCSI generic nodes (sgN) that have
# no block-device twin, then virtio/Xen disks (vdX, xvdX).
#
# Arguments: none
# Outputs:   one line on stdout with the space-separated device paths
#            (an empty line when nothing was found)
detect_disks() {
    local disks=()
    local disk sysfs_dev

    # The globs are intentionally broad; the anchored regexes do the real
    # filtering. This keeps partitions (sda1, nvme0n1p1) out while still
    # matching multi-letter / multi-digit names such as sdaa or nvme10n1,
    # which single-character globs like sd[a-z] silently miss.
    local sd_re='^/dev/sd[a-z]+$'
    local nvme_re='^/dev/nvme[0-9]+n[0-9]+$'
    local sg_re='^/dev/sg[0-9]+$'
    local virt_re='^/dev/x?vd[a-z]+$'

    # SATA/SAS disks - only main devices, no partitions
    for disk in /dev/sd*; do
        if [[ "$disk" =~ $sd_re && -b "$disk" ]]; then
            disks+=("$disk")
        fi
    done

    # NVMe disks - only namespaces, no partitions or controller nodes
    for disk in /dev/nvme*; do
        if [[ "$disk" =~ $nvme_re && -b "$disk" ]]; then
            disks+=("$disk")
        fi
    done

    # SAS disks via SCSI generic. sg nodes are CHARACTER devices, so a -b
    # test can never match them. Most sg nodes merely alias a disk that is
    # already listed as /dev/sdX, so only keep the ones that
    #   * have no block device attached in sysfs, and
    #   * report SCSI peripheral type 0 (direct-access disk), which leaves
    #     out enclosures, tape drives and similar non-disk devices.
    for disk in /dev/sg*; do
        [[ "$disk" =~ $sg_re && -c "$disk" ]] || continue
        sysfs_dev="/sys/class/scsi_generic/${disk##*/}/device"
        [[ -d "$sysfs_dev/block" ]] && continue
        [[ -r "$sysfs_dev/type" && "$(<"$sysfs_dev/type")" == "0" ]] || continue
        disks+=("$disk")
    done

    # Other disk types (virtio, Xen) - only main devices
    for disk in /dev/vd* /dev/xvd*; do
        if [[ "$disk" =~ $virt_re && -b "$disk" ]]; then
            disks+=("$disk")
        fi
    done

    echo "${disks[@]}"
}

# Function to detect disks that sit behind a hardware RAID controller
#
# Probes every supported smartctl controller type with slot numbers 0-31
# against a small set of pass-through device nodes. For each
# (controller, slot) pair the first node that answers is recorded.
#
# Arguments: optional list of device nodes to probe through; defaults to
#            /dev/sda /dev/sdb /dev/sdc /dev/sg0 /dev/sg1
# Outputs:   one line on stdout with space-separated "DEVICE:TYPE,SLOT"
#            entries (an empty line when nothing answered)
detect_raid_controllers() {
    # NOTE(review): smartctl documents the HighPoint type as "hpt,L/M/N", so
    # the bare "hpt,N" probe presumably never matches - confirm before
    # relying on it.
    local controllers=("megaraid" "cciss" "areca" "3ware" "hpt")
    local raid_disks=()
    local candidates=("$@")
    local usable=()
    local controller i base_disk

    if [[ ${#candidates[@]} -eq 0 ]]; then
        candidates=("/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1")
    fi

    # Filter the candidate nodes once instead of re-testing them in every
    # iteration. Character devices are accepted as well because /dev/sgN
    # nodes are character devices; a block-only test would drop them.
    for base_disk in "${candidates[@]}"; do
        if [[ -b "$base_disk" || -c "$base_disk" ]]; then
            usable+=("$base_disk")
        fi
    done

    # Check for RAID controllers
    for controller in "${controllers[@]}"; do
        for i in {0..31}; do
            for base_disk in "${usable[@]}"; do
                if smartctl -d "$controller,$i" -i "$base_disk" &>/dev/null; then
                    raid_disks+=("$base_disk:$controller,$i")
                    # One hit per (controller, slot) is enough.
                    break
                fi
            done
        done
    done

    echo "${raid_disks[@]}"
}
# Main function
#
# Checks the devices given on the command line or, when none are given,
# every auto-detected disk plus every disk found behind a RAID controller.
#
# Arguments: optional device paths (block devices, or character devices
#            such as /dev/sgN)
# Outputs:   human-readable report on stdout
# Exits:     1 when no usable device was found
main() {
    local disks=()
    local disk disk_info controller

    print_color "$BLUE" "Alma Linux 9 Disk Health Check Script v$VERSION"
    print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
    print_color "$BLUE" "============================================"
    echo ""

    check_dependencies

    # If specific disk provided, check only that disk
    if [[ $# -gt 0 ]]; then
        for disk in "$@"; do
            # Accept character devices too: usage advertises /dev/sg0, and
            # SCSI generic nodes are character devices, not block devices.
            if [[ -b "$disk" || -c "$disk" ]]; then
                disks+=("$disk")
            else
                print_color "$RED" "Error: $disk is not a valid block or character device"
            fi
        done
    else
        # Auto-detect disks
        print_color "$CYAN" "Auto-detecting disks (excluding partitions)..."
        local direct_disks=()
        read -ra direct_disks <<< "$(detect_disks)"

        print_color "$CYAN" "Scanning for RAID controllers..."
        local raid_disks=()
        read -ra raid_disks <<< "$(detect_raid_controllers)"

        # Combine both lists
        disks=("${direct_disks[@]}" "${raid_disks[@]}")
    fi

    if [[ ${#disks[@]} -eq 0 ]]; then
        print_color "$RED" "No disks found or accessible"
        echo "Try running as root or specifying disk paths manually"
        exit 1
    fi

    print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
    echo ""

    # Check if running as root, warn if not
    if [[ $EUID -ne 0 ]]; then
        print_color "$YELLOW" "Warning: Not running as root."
        print_color "$YELLOW" "Some disks/controllers may show limited information."
        echo "For complete results, run as: sudo $0"
        echo ""
    fi

    # Check each disk
    for disk_info in "${disks[@]}"; do
        # RAID entries have the form DEVICE:TYPE,SLOT; plain entries are
        # just the device path.
        if [[ "$disk_info" == *":"* ]]; then
            IFS=':' read -r disk controller <<< "$disk_info"
            check_disk "$disk" "$controller"
        else
            check_disk "$disk_info"
        fi
    done

    print_color "$BLUE" "Check completed!"
    echo ""
    print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
    print_color "$CYAN" "  - Install 'storcli' for detailed controller information"
    print_color "$CYAN" "  - Use 'smartctl -d sat /dev/sgX' to try direct access"
    print_color "$CYAN" "  - Hardware RAID controllers often limit SMART data access"
    echo ""
    print_color "$CYAN" "Additional tips:"
    print_color "$CYAN" "  - Use 'lsblk' to see all available block devices"
    print_color "$CYAN" "  - Use 'lshw -class disk' for detailed disk information"
}

# Usage information
#
# Prints the command-line help, including the options handled by the
# dispatcher at the bottom of the file, to stdout.
usage() {
    cat <<EOF
Usage: $SCRIPT_NAME [DISK1 DISK2 ...]

If no disks specified, auto-detects all available disks

Options:
  -h, --help       Show this help and exit
  -v, --version    Show the script version and exit

Examples:
  $SCRIPT_NAME                        # Check all auto-detected disks
  sudo $SCRIPT_NAME                   # Check all disks (as root)
  $SCRIPT_NAME /dev/sda               # Check specific disk
  $SCRIPT_NAME /dev/nvme0n1           # Check NVMe disk
  $SCRIPT_NAME /dev/sg0               # Check SAS disk directly
  $SCRIPT_NAME /dev/sda /dev/nvme0n1  # Check multiple disks
EOF
}

# Parse command line arguments
# Only the first argument is inspected for an option; everything else is
# handed to main unchanged as the list of devices to check.
case "${1:-}" in
    -h|--help)
        usage
        exit 0
        ;;
    -v|--version)
        echo "$SCRIPT_NAME version $VERSION"
        exit 0
        ;;
    *)
        main "$@"
        ;;
esac