disk-health/old/obsolete/harvester-v2.4.sh

#!/bin/bash

# Disk Health Check Script for Harvester OS
# Enhanced with SAS/PERC H730P controller support
# Checks SSD TBW/lifespan and HDD health status

SCRIPT_NAME=$(basename "$0")
VERSION="2.4"

# Color codes.
# tput exits non-zero and writes an error to stderr when TERM is unset or
# unknown (cron jobs, minimal containers). Silence that and fall back to empty
# strings so the script still runs, just without colors.
RED=$(tput setaf 1 2>/dev/null || true)
GREEN=$(tput setaf 2 2>/dev/null || true)
YELLOW=$(tput setaf 3 2>/dev/null || true)
BLUE=$(tput setaf 4 2>/dev/null || true)
CYAN=$(tput setaf 6 2>/dev/null || true)
NC=$(tput sgr0 2>/dev/null || true)

# Print a message wrapped in a color sequence, followed by the reset code.
#   $1 - color escape sequence (may be empty)
#   $2 - message text; backslash escapes in it are interpreted
# Reads the global NC as the reset sequence.
print_color() {
    local color=$1 message=$2
    printf '%b\n' "${color}${message}${NC}"
}

# Succeed (status 0) when the shell can resolve $1 as a command, else fail
# with status 1. Prints nothing.
command_exists() {
    local tool=$1
    if command -v "$tool" &>/dev/null; then
        return 0
    fi
    return 1
}

# Stop right away when smartctl is missing; the disk checks below invoke it.
command_exists smartctl || {
    print_color "$RED" "Error: smartctl is not installed. Please install smartmontools package."
    exit 1
}

# Probe how much SMART data smartctl can read from a disk.
# Arguments:
#   $1 - device path (e.g. /dev/sda)
#   $2 - optional smartctl device type, passed through -d (e.g. megaraid,0)
# Outputs exactly one word on stdout:
#   no_access          - "smartctl -i" failed
#   not_available      - no "SMART support is: Available" line was reported
#   disabled           - no "SMART support is: Enabled" line was reported
#   no_attributes      - "smartctl -A" printed nothing
#   limited_attributes - no power-on-hours attribute was found
#   full_access        - every probe succeeded
# SMART is never switched on here; its state is only reported.
test_smart_access() {
    local disk=$1
    local controller=$2

    # Build the command as an array so each argument stays a separate word.
    local -a smart_cmd=(smartctl)
    [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

    # One call serves both as the access test and as the source of the info text.
    local smart_info
    if ! smart_info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null); then
        echo "no_access"
        return
    fi

    # smartctl reports availability and enablement on two separate
    # "SMART support is:" lines. Each line is matched on its own; picking a
    # fixed awk column across both lines yields multi-line values that never
    # equal "Available" or "Enabled".
    if ! grep -q "SMART support is:[[:space:]]*Available" <<< "$smart_info"; then
        echo "not_available"
        return
    fi

    if ! grep -q "SMART support is:[[:space:]]*Enabled" <<< "$smart_info"; then
        echo "disabled"
        return
    fi

    # Test attribute reading
    local attributes
    attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
    if [[ -z "$attributes" ]]; then
        echo "no_attributes"
        return
    fi

    # A readable power-on-hours raw value (column 10) is used as the marker
    # for a complete attribute table.
    local power_on_hours
    power_on_hours=$(grep -i "Power_On_Hours\|Power-On" <<< "$attributes" | awk '{print $10}' | head -1)
    if [[ -z "$power_on_hours" ]]; then
        echo "limited_attributes"
        return
    fi

    echo "full_access"
}

# Collect identity, health and SMART attribute data for one disk.
# Arguments:
#   $1 - device path
#   $2 - optional smartctl device type, passed through -d
# Outputs a single '|'-separated record on stdout:
#   model|serial|capacity|firmware|health_status|disk_type|power_on_hours|
#   reallocated_sectors|pending_sectors|total_written|host_writes_32mib|
#   media_wearout
# Fields that cannot be determined are left empty (disk_type is "UNKNOWN").
get_disk_info() {
    local disk=$1
    local controller=$2

    # Build the command as an array so each argument stays a separate word.
    local -a smart_cmd=(smartctl)
    [[ -n "$controller" ]] && smart_cmd+=(-d "$controller")

    local info attributes health
    info=$("${smart_cmd[@]}" -i "$disk" 2>/dev/null)
    attributes=$("${smart_cmd[@]}" -A "$disk" 2>/dev/null)
    health=$("${smart_cmd[@]}" -H "$disk" 2>/dev/null)

    # Extract information with multiple fallbacks for SAS drives
    local model vendor serial capacity firmware health_status
    model=$(grep -i "Device Model:\|Product:" <<< "$info" | cut -d: -f2 | sed 's/^[[:space:]]*//' | head -1)
    vendor=$(grep -i "Vendor:" <<< "$info" | cut -d: -f2 | sed 's/^[[:space:]]*//' | head -1)
    [[ -n "$vendor" && -n "$model" ]] && model="$vendor $model"

    serial=$(grep -i "Serial Number:\|Serial number:" <<< "$info" | cut -d: -f2 | sed 's/^[[:space:]]*//' | head -1)

    # Keep only the part before "[...]" and trim both ends.
    capacity=$(grep -i "User Capacity:\|Total NVM Capacity:" <<< "$info" | cut -d: -f2 | cut -d'[' -f1 \
        | sed 's/^[[:space:]]*//; s/[[:space:]]*$//' | head -1)

    firmware=$(grep -i "Firmware Version:\|Firmware revision:\|Revision:" <<< "$info" | cut -d: -f2 | sed 's/^[[:space:]]*//' | head -1)

    # ATA drives report "... test result: PASSED"; SCSI/SAS drives report
    # "SMART Health Status: OK". Accept both.
    health_status=$(grep -i "result:\|SMART overall-health\|SMART Health Status:" <<< "$health" | cut -d: -f2 | sed 's/^[[:space:]]*//' | head -1)
    [[ -z "$health_status" ]] && health_status=$(grep -i "SMART overall-health" <<< "$health" | awk -F'[' '{print $2}' | cut -d']' -f1)

    # Get disk type with SAS support
    local disk_type="UNKNOWN"
    if grep -qi "Solid State Device" <<< "$info"; then
        disk_type="SSD"
    elif grep -qi "Rotation Rate" <<< "$info"; then
        disk_type="HDD"
    elif grep -qi "SCSI\|SAS" <<< "$info"; then
        # SAS drives often don't specify, check rotation rate
        if grep -qi "15000\|10000\|7200" <<< "$info"; then
            disk_type="HDD"
        else
            disk_type="SSD"
        fi
    fi

    # awk program that prints the leading integer of an attribute's raw value.
    # In the attribute table the raw value starts at column 10 and may carry a
    # suffix ("3456h+12m+01.000s", "32 (Min/Max 20/45)"), so the last field is
    # not reliable. Shorter lines fall back to the last field.
    local raw_awk='{ v = (NF >= 10) ? $10 : $NF; gsub(/,/, "", v); if (match(v, /^[0-9]+/)) print substr(v, RSTART, RLENGTH) }'

    local power_on_hours reallocated_sectors pending_sectors
    power_on_hours=$(grep -i "Power_On_Hours\|Power-On" <<< "$attributes" | awk "$raw_awk" | head -1)
    reallocated_sectors=$(grep -i "Reallocated_Sector_Ct\|Reallocated_Event_Count" <<< "$attributes" | awk "$raw_awk" | head -1)
    pending_sectors=$(grep -i "Current_Pending_Sector" <<< "$attributes" | awk "$raw_awk" | head -1)

    # Host_Writes_32MiB counts 32 MiB units and has its own field below, so it
    # is kept out of total_written, which callers convert as 512-byte sectors.
    # NOTE(review): Lifetime_Writes* attributes may also use a non-sector unit
    # (e.g. GiB) - confirm against the drives in use.
    local total_written host_writes_32mib
    total_written=$(grep -i "Total_LBAs_Written\|Lifetime_Writes" <<< "$attributes" | awk "$raw_awk" | head -1)
    host_writes_32mib=$(grep -i "Host_Writes_32MiB" <<< "$attributes" | awk "$raw_awk" | head -1)

    # Wear indicators: report the normalized VALUE column (column 4), which
    # counts down as the drive wears, with its zero padding removed. The raw
    # column holds a vendor-specific counter, not a percentage.
    local media_wearout
    media_wearout=$(grep -i "Media_Wearout_Indicator\|Wear_Leveling_Count" <<< "$attributes" \
        | awk 'NF >= 10 && $4 ~ /^[0-9]+$/ { print $4 + 0; exit }')

    printf '%s\n' "$model|$serial|$capacity|$firmware|$health_status|$disk_type|$power_on_hours|$reallocated_sectors|$pending_sectors|$total_written|$host_writes_32mib|$media_wearout"
}

# Convert a write counter into terabytes written (decimal TB, 2 decimals).
# Arguments:
#   $1 - counter in 32 MiB units (Host_Writes_32MiB raw value), or empty
#   $2 - counter in 512-byte sectors, or empty; takes precedence over $1
# Outputs the TB figure on stdout, or "0" when no usable counter is given
# or bc is unavailable.
calculate_tbw() {
    local raw_value=$1
    local sectors=$2
    local uint_re='^[0-9]+$'
    local zero_re='^0+$'
    local tbw

    # Only plain non-zero integers are used. Anything else (empty, "12h+3m",
    # text) would break the arithmetic and is treated as "no data".
    if [[ "$sectors" =~ $uint_re && ! "$sectors" =~ $zero_re ]]; then
        # The multiplication is left to bc so very large counters cannot
        # overflow shell integers.
        tbw=$(echo "scale=2; $sectors * 512 / 1000 / 1000 / 1000 / 1000" | bc -l 2>/dev/null || echo "0")
        echo "$tbw"
    elif [[ "$raw_value" =~ $uint_re && ! "$raw_value" =~ $zero_re ]]; then
        # One unit is 32 MiB = 33554432 bytes; divide down to decimal TB so
        # both branches report the same unit.
        tbw=$(echo "scale=2; $raw_value * 33554432 / 1000 / 1000 / 1000 / 1000" | bc -l 2>/dev/null || echo "0")
        echo "$tbw"
    else
        echo "0"
    fi
}

# Guess the rated write endurance of an SSD, in TB, from model and size.
# Arguments:
#   $1 - model string (matched case-insensitively)
#   $2 - capacity in GB
# Outputs the estimated endurance in TB on stdout.
estimate_ssd_endurance() {
    local disk_model=$1
    local capacity_gb=$2

    # Each tier is "minimum_gb:endurance_tb", largest first; "fallback" is the
    # figure for drives below the smallest tier.
    local -a tiers
    local fallback

    if grep -qi "ST600MP\|SEAGATE.*SSD\|SAS.*SSD" <<< "$disk_model"; then
        # Enterprise SAS SSDs: 10PB / 6PB / 4PB, else 2PB
        tiers=("1000:10000" "600:6000" "400:4000")
        fallback="2000"
    elif grep -qi "MTFDDAK\|MICRON\|INTEL\|SAMSUNG\|KIOXIA" <<< "$disk_model"; then
        # Enterprise SATA SSDs: 1.2PB / 600TB / 300TB, else 150TB
        tiers=("1000:1200" "480:600" "240:300")
        fallback="150"
    else
        # Consumer SSDs: 600TB / 300TB / 150TB / 80TB, else 40TB
        tiers=("1000:600" "480:300" "240:150" "120:80")
        fallback="40"
    fi

    local tier
    for tier in "${tiers[@]}"; do
        if [[ $capacity_gb -ge ${tier%%:*} ]]; then
            echo "${tier#*:}"
            return
        fi
    done

    echo "$fallback"
}

# Estimate the remaining life of an SSD.
# Arguments:
#   $1 - power-on hours (empty or zero means "no usable SMART data")
#   $2 - terabytes written so far
#   $3 - model string (forwarded to estimate_ssd_endurance)
#   $4 - capacity in GB (forwarded to estimate_ssd_endurance)
#   $5 - wear indicator as percent of life remaining (1-100), or empty/0
# Outputs one '|'-separated record on stdout:
#   lifespan_percent|tbw_remaining|wear_status|wear_source
# where wear_source is "media_wearout", "tbw", "estimated" or empty. The first
# three fields may be wrapped in the global color sequences.
estimate_ssd_lifespan() {
    local power_on_hours=$1
    local tbw_used=$2
    local disk_model=$3
    local capacity_gb=$4
    local media_wearout=$5

    local zero_re='^0+$'
    local uint_re='^[0-9]+$'
    local decimal_re='^[0-9]*\.?[0-9]+$'

    # A pattern match is used instead of -eq so that values such as
    # "3456h+12m" do not raise an arithmetic error.
    if [[ -z "$power_on_hours" || "$power_on_hours" =~ $zero_re ]]; then
        # Same 4-field layout as every other branch, with an empty source.
        echo "Unknown|Unknown|Unknown|"
        return
    fi

    # bc needs a number on both sides; treat anything else as "nothing written".
    [[ "$tbw_used" =~ $decimal_re ]] || tbw_used=0

    local estimated_endurance tbw_remaining
    estimated_endurance=$(estimate_ssd_endurance "$disk_model" "$capacity_gb")
    # Clamp at zero: a drive can outlive the estimated endurance.
    tbw_remaining=$(echo "scale=2; r = $estimated_endurance - $tbw_used; if (r < 0) r = 0; r" | bc -l 2>/dev/null || echo "0")

    # If we have a media wearout indicator, use it for more accurate estimation.
    # Only 1-100 is accepted: 0 means "no data" and larger numbers cannot be a
    # percentage. 10# strips zero padding ("097") that would read as octal.
    if [[ "$media_wearout" =~ $uint_re ]]; then
        local wear=$((10#$media_wearout))
        if ((wear >= 1 && wear <= 100)); then
            local color status
            if ((wear <= 10)); then
                color=$RED
                status="Critical wear"
            elif ((wear <= 30)); then
                color=$YELLOW
                status="High wear"
            elif ((wear <= 70)); then
                color=$YELLOW
                status="Moderate wear"
            else
                color=$GREEN
                status="Healthy"
            fi
            echo "${color}${wear}%${NC}|${color}${tbw_remaining} TB${NC}|${color}${status}${NC}|media_wearout"
            return
        fi
    fi

    if [[ $(echo "$tbw_used > 0" | bc -l 2>/dev/null) -eq 1 ]]; then
        local lifespan_used lifespan_remaining
        lifespan_used=$(echo "scale=1; $tbw_used * 100 / $estimated_endurance" | bc -l 2>/dev/null || echo "0")
        lifespan_remaining=$(echo "scale=1; r = 100 - $lifespan_used; if (r < 0) r = 0; r" | bc -l 2>/dev/null || echo "100")

        if [[ $(echo "$lifespan_used >= 80" | bc -l) -eq 1 ]]; then
            echo "${RED}${lifespan_remaining}%${NC}|${RED}${tbw_remaining} TB${NC}|${RED}High wear${NC}|tbw"
        elif [[ $(echo "$lifespan_used >= 50" | bc -l) -eq 1 ]]; then
            echo "${YELLOW}${lifespan_remaining}%${NC}|${YELLOW}${tbw_remaining} TB${NC}|${YELLOW}Moderate wear${NC}|tbw"
        else
            echo "${GREEN}${lifespan_remaining}%${NC}|${GREEN}${tbw_remaining} TB${NC}|${GREEN}Healthy${NC}|tbw"
        fi
    else
        echo "Unknown|${estimated_endurance} TB|New|estimated"
    fi
}

# Estimate the remaining life of a hard disk from SMART counters.
# Arguments:
#   $1 - power-on hours (may be empty)
#   $2 - reallocated sector count (empty counts as 0)
#   $3 - pending sector count (empty counts as 0)
# Outputs a one-line verdict on stdout, possibly wrapped in the global color
# sequences. Sector problems are reported even when power-on hours are
# unknown; "Unknown" is printed only when no sector problem is found and the
# hours are missing.
estimate_hdd_lifespan() {
    local power_on_hours=$1
    local reallocated_sectors=$2
    local pending_sectors=$3

    # Keep only the leading integer of each input so that empty or annotated
    # values cannot break the numeric comparisons; 10# avoids octal parsing
    # of zero-padded numbers.
    local digits_re='^[0-9]+'
    local hours_known=1

    if [[ "$power_on_hours" =~ $digits_re ]]; then
        power_on_hours=$((10#${BASH_REMATCH[0]}))
    else
        hours_known=0
    fi

    if [[ "$reallocated_sectors" =~ $digits_re ]]; then
        reallocated_sectors=$((10#${BASH_REMATCH[0]}))
    else
        reallocated_sectors=0
    fi

    if [[ "$pending_sectors" =~ $digits_re ]]; then
        pending_sectors=$((10#${BASH_REMATCH[0]}))
    else
        pending_sectors=0
    fi

    if ((pending_sectors > 0)); then
        echo "${RED}CRITICAL${NC} (Pending sectors: $pending_sectors)"
    elif ((reallocated_sectors > 100)); then
        echo "${RED}< 6 months${NC} (High reallocated sectors: $reallocated_sectors)"
    elif ((reallocated_sectors > 10)); then
        echo "${YELLOW}6-12 months${NC} (Reallocated sectors: $reallocated_sectors)"
    elif ((!hours_known)); then
        # No sector warnings and no usage figure: nothing to base an estimate on.
        echo "Unknown"
    elif ((power_on_hours > 40000)); then
        echo "${YELLOW}1-2 years${NC} (High usage: $power_on_hours hours)"
    elif ((power_on_hours > 25000)); then
        echo "${GREEN}2-3 years${NC} (Moderate usage: $power_on_hours hours)"
    else
        echo "${GREEN}> 3 years${NC} (Low usage: $power_on_hours hours)"
    fi
}

# Print the full health report for one disk to stdout.
# Arguments:
#   $1 - device path
#   $2 - smartctl device type for -d (e.g. megaraid,0); empty or the label
#        "direct" means the disk is addressed without -d
check_disk() {
    local disk=$1
    local controller=$2

    # Callers tag plain disks as "<dev>:direct". "direct" is only a label, not
    # a smartctl device type, so it must never reach "smartctl -d".
    if [[ "$controller" == "direct" ]]; then
        controller=""
    fi

    print_color "$CYAN" "Checking disk: $disk (Controller: ${controller:-direct})"
    echo "=================================================="

    # Test SMART access level
    local access_level
    access_level=$(test_smart_access "$disk" "$controller")

    case "$access_level" in
        "no_access")
            print_color "$RED" "ERROR: Cannot access disk through controller"
            echo "Possible reasons:"
            echo "  - Controller doesn't support SMART passthrough"
            echo "  - Disk is part of a hardware RAID array"
            echo "  - Insufficient permissions (try running as root)"
            echo "  - Controller busy or offline"
            echo ""
            return
            ;;
        "not_available")
            print_color "$YELLOW" "SMART not available on this disk"
            echo "This disk does not support SMART monitoring"
            echo ""
            return
            ;;
        "disabled")
            print_color "$YELLOW" "SMART is disabled on this disk"
            echo "SMART is available but currently disabled"
            echo "To enable manually: smartctl -s on ${controller:+-d $controller} $disk"
            echo ""
            return
            ;;
        "no_attributes")
            print_color "$YELLOW" "WARNING: Cannot read SMART attributes"
            echo "This is common with hardware RAID controllers like PERC H730P"
            echo "Try checking through the RAID management interface"
            echo ""
            return
            ;;
        "limited_attributes")
            # Not fatal: report what is available.
            print_color "$YELLOW" "NOTE: Limited SMART data available"
            echo "Controller is filtering some SMART attributes"
            ;;
    esac

    # Get disk information. The record fields are declared local so they do
    # not leak into (or pick up stale values from) the global scope.
    local model serial capacity firmware health_status disk_type
    local power_on_hours reallocated_sectors pending_sectors
    local total_written host_writes_32mib media_wearout
    local disk_info
    disk_info=$(get_disk_info "$disk" "$controller")
    IFS='|' read -r model serial capacity firmware health_status disk_type power_on_hours reallocated_sectors pending_sectors total_written host_writes_32mib media_wearout <<< "$disk_info"

    # Display basic information
    echo "Model:          ${model:-Unknown}"
    echo "Serial:         ${serial:-Unknown}"
    echo "Type:           $disk_type"
    echo "Capacity:       ${capacity:-Unknown}"
    echo "Firmware:       ${firmware:-Unknown}"
    echo "Health:         ${health_status:-Unknown}"

    # Only show power on hours if available
    if [[ -n "$power_on_hours" && "$power_on_hours" != "0" ]]; then
        echo "Power On Hours: $power_on_hours"
    else
        echo "Power On Hours: Unknown"
    fi

    # Disk type specific analysis
    if [[ "$disk_type" == "SSD" ]]; then
        # Prefer the 32 MiB counter when present: total_written may have been
        # filled from that same attribute, and converting it as 512-byte
        # sectors would understate the writes.
        local tbw_used=0
        if [[ -n "$host_writes_32mib" && "$host_writes_32mib" != "0" ]]; then
            tbw_used=$(calculate_tbw "$host_writes_32mib" "")
        elif [[ -n "$total_written" && "$total_written" != "0" ]]; then
            tbw_used=$(calculate_tbw "" "$total_written")
        fi

        local has_tbw=0
        if [[ $(echo "$tbw_used > 0" | bc -l 2>/dev/null) -eq 1 ]]; then
            has_tbw=1
            echo "TBW Used:       ${tbw_used} TB"
        fi

        # Estimate capacity for endurance calculation. get_disk_info drops the
        # bracketed "[500 GB]" part, so the usual value is a byte count such
        # as "500,107,862,016 bytes"; the GB/TB forms are kept as fallbacks.
        local capacity_gb=0
        local capacity_plain=${capacity//,/}
        local bytes_re='^[[:space:]]*([0-9]+)[[:space:]]*(bytes)?[[:space:]]*$'
        if [[ "$capacity_plain" =~ $bytes_re ]]; then
            capacity_gb=$((10#${BASH_REMATCH[1]} / 1000000000))
        elif grep -qi "GB" <<< "$capacity"; then
            capacity_gb=$(grep -o '[0-9.]*' <<< "$capacity" | head -1 | cut -d. -f1)
        elif grep -qi "TB" <<< "$capacity"; then
            capacity_gb=$(grep -o '[0-9.]*' <<< "$capacity" | head -1 | awk '{printf "%d", $1 * 1000}')
        fi
        capacity_gb=${capacity_gb:-0}

        # Record layout: percent|tbw_remaining|status|source. Extra fields, if
        # any, land in "rest" and are ignored.
        local lifespan_info lifespan_percent tbw_remaining wear_status wear_source rest
        lifespan_info=$(estimate_ssd_lifespan "$power_on_hours" "$tbw_used" "$model" "$capacity_gb" "$media_wearout")
        IFS='|' read -r lifespan_percent tbw_remaining wear_status wear_source rest <<< "$lifespan_info"

        if [[ "$wear_source" != "media_wearout" && "$has_tbw" -eq 1 ]]; then
            echo "TBW Remaining:  $tbw_remaining"
        fi

        echo "Lifespan:       $lifespan_percent ($wear_status)"

        # Show wear source if available
        case "$wear_source" in
            media_wearout) echo "Wear Source:    Media Wearout Indicator" ;;
            tbw) echo "Wear Source:    TBW Calculation" ;;
            estimated) echo "Wear Source:    Estimated Endurance" ;;
        esac

    elif [[ "$disk_type" == "HDD" ]]; then
        if [[ -n "$reallocated_sectors" && "$reallocated_sectors" != "0" ]]; then
            echo "Realloc Sectors: $reallocated_sectors"
        fi
        if [[ -n "$pending_sectors" && "$pending_sectors" != "0" ]]; then
            echo "Pending Sectors: $pending_sectors"
        fi

        local lifespan
        lifespan=$(estimate_hdd_lifespan "$power_on_hours" "${reallocated_sectors:-0}" "${pending_sectors:-0}")
        echo "Lifespan:       $lifespan"
    else
        print_color "$YELLOW" "Limited information available for this disk type"
        echo "This is normal for hardware RAID configurations like PERC H730P"
        echo "For detailed SAS drive information, use controller management tools"
    fi

    echo ""
}

# Discover disks that can be queried with smartctl.
# Outputs on stdout a single line of space-separated "<device>:<type>"
# entries, where <type> is "direct" or a smartctl -d argument such as
# "megaraid,3". Progress messages go to stderr so that callers capturing
# stdout receive only the entry list.
detect_raid_disks() {
    local controllers=("megaraid" "cciss" "areca" "3ware" "hpt" "auto")
    local disks=()
    local disk controller i base_disk

    # Check for direct disks first - only main devices, no partitions
    for disk in /dev/sd[a-z]; do
        if [[ -b "$disk" ]]; then
            disks+=("$disk:direct")
        fi
    done

    # Check for NVMe disks - only main devices, no partitions
    for disk in /dev/nvme[0-9]n[0-9]; do
        if [[ -b "$disk" ]]; then
            disks+=("$disk:direct")
        fi
    done

    # Check for SAS disks directly via SCSI generic. sg nodes are character
    # devices, so -c is the matching test. An sg node whose SCSI device also
    # has a block device listed in sysfs is skipped, because that disk is
    # already covered by its /dev/sd* entry above.
    # NOTE(review): relies on /sys/class/scsi_generic/<sg>/device/block
    # being present for disk-backed sg nodes - confirm on the target kernel.
    for disk in /dev/sg[0-9]; do
        [[ -c "$disk" ]] || continue
        [[ -d "/sys/class/scsi_generic/${disk##*/}/device/block" ]] && continue
        disks+=("$disk:direct")
    done

    # Check for RAID controllers with enhanced detection
    for controller in "${controllers[@]}"; do
        print_color "$BLUE" "Scanning for $controller controllers..." >&2
        for i in {0..31}; do
            # Try different disk devices for each controller; the first
            # device that answers for this slot wins.
            for base_disk in "/dev/sda" "/dev/sdb" "/dev/sdc" "/dev/sg0" "/dev/sg1"; do
                if [[ -b "$base_disk" || -c "$base_disk" ]]; then
                    if smartctl -d "$controller,$i" -i "$base_disk" &>/dev/null; then
                        disks+=("$base_disk:$controller,$i")
                        print_color "$GREEN" "  Found $controller,$i on $base_disk" >&2
                        break
                    fi
                fi
            done
        done
    done

    # Special detection for PERC H730P. Only a notice is printed: there is no
    # smartctl device type named after the controller, so adding a
    # "perc-h730p" entry could only ever produce an access error.
    # NOTE(review): drives behind a PERC are expected to show up through the
    # megaraid,N probe above - confirm on real hardware.
    print_color "$BLUE" "Scanning for PERC H730P controllers..." >&2
    if command_exists storcli; then
        print_color "$GREEN" "  storcli detected - checking PERC H730P" >&2
    fi

    echo "${disks[@]}"
}

# Entry point: build the list of disks and report on each of them.
# Arguments: zero or more device paths. With none, disks are auto-detected
# through detect_raid_disks. Exits 1 when smartctl is missing or no disk is
# left to check.
main() {
    print_color "$BLUE" "Harvester OS Disk Health Check Script v$VERSION"
    print_color "$BLUE" "Enhanced with PERC H730P and SAS Support"
    print_color "$BLUE" "============================================"
    echo ""

    if ! command_exists smartctl; then
        print_color "$RED" "Error: smartctl is not installed. Please install smartmontools package."
        exit 1
    fi

    local disks=()
    local disk controller disk_info line token
    local -a tokens

    # If specific disk provided, check only that disk
    if [[ $# -gt 0 ]]; then
        for disk in "$@"; do
            # Character devices are accepted too: SCSI generic nodes such as
            # /dev/sg0 (listed in the usage text) are not block devices.
            if [[ -b "$disk" || -c "$disk" ]]; then
                disks+=("$disk:direct")
            else
                print_color "$RED" "Error: $disk is not a valid block or character device"
            fi
        done
    else
        # Auto-detect disks
        print_color "$CYAN" "Auto-detecting disks and RAID controllers..."
        # Read every output line and keep only "<device>:<type>" tokens, so
        # any status text mixed into the output cannot end up in the list.
        while IFS= read -r line; do
            read -ra tokens <<< "$line"
            for token in "${tokens[@]}"; do
                if [[ "$token" == /dev/*:* ]]; then
                    disks+=("$token")
                fi
            done
        done < <(detect_raid_disks)
    fi

    if [[ ${#disks[@]} -eq 0 ]]; then
        print_color "$RED" "No disks found or accessible"
        echo "Try running as root: sudo $0"
        exit 1
    fi

    print_color "$GREEN" "Found ${#disks[@]} disk(s) to check"
    echo ""

    # Check if running as root
    if [[ $EUID -ne 0 ]]; then
        print_color "$YELLOW" "Warning: Not running as root."
        print_color "$YELLOW" "Some disks/controllers may show limited information."
        echo ""
    fi

    # Check each disk; entries have the form "<device>:<type>"
    for disk_info in "${disks[@]}"; do
        IFS=':' read -r disk controller <<< "$disk_info"
        check_disk "$disk" "$controller"
    done

    print_color "$BLUE" "Check completed!"
    echo ""
    print_color "$CYAN" "Note: For PERC H730P controllers with SAS drives:"
    print_color "$CYAN" "  - Use 'storcli /c0 show all' for detailed information"
    print_color "$CYAN" "  - Use 'storcli /c0/eall/sall show' for physical disk status"
    print_color "$CYAN" "  - Hardware RAID controllers often limit SMART data access"
}

# Print the command-line help text to stdout. Reads the global SCRIPT_NAME.
usage() {
    cat <<EOF
Usage: $SCRIPT_NAME [DISK1 DISK2 ...]

If no disks specified, auto-detects all available disks and RAID arrays

Examples:
  sudo $SCRIPT_NAME              # Check all disks (recommended)
  $SCRIPT_NAME /dev/sda          # Check specific disk
  $SCRIPT_NAME /dev/sg0          # Check SAS disk directly
  $SCRIPT_NAME /dev/sda /dev/sdb # Check multiple disks
EOF
}

# Command-line dispatch: the help and version flags are answered here and
# end the script; any other invocation is forwarded unchanged to main.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
    usage
    exit 0
elif [[ "${1:-}" == "-v" || "${1:-}" == "--version" ]]; then
    echo "$SCRIPT_NAME version $VERSION"
    exit 0
else
    main "$@"
fi