Linux-Server-Management-Too…/modules/performance/hardware-health-check.sh

#!/bin/bash

# Hardware Health Check
# Comprehensive hardware diagnostics including SMART, memory, CPU, and sensors

# Get the script's directory
# SCRIPT_DIR is the absolute directory containing this file; TOOLKIT_ROOT is
# the absolute path two directory levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TOOLKIT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Source required libraries
# NOTE(review): detect_system, show_banner and press_enter are called by this
# script but not defined in it - presumably they come from these libraries;
# confirm. A missing library is not checked for here.
source "$TOOLKIT_ROOT/lib/common-functions.sh"
source "$TOOLKIT_ROOT/lib/system-detect.sh"
source "$TOOLKIT_ROOT/lib/reference-db.sh"

# Initialize system detection
detect_system

# Load system info from reference database
# $TOOLKIT_ROOT/.sysref is read as '|'-delimited records. For lines starting
# with "SYS|HOSTNAME|", "SYS|CONTROL_PANEL|" or "SYS|OS|", field 3 is stored
# as the value/name and (for panel and OS) field 4 as the version. The grep
# patterns are basic regexes, so '|' is a literal pipe. When the file does not
# exist none of the SYS_* variables are assigned here; generate_report later
# prints them in the report header.
if [ -f "$TOOLKIT_ROOT/.sysref" ]; then
    SYS_HOSTNAME=$(grep "^SYS|HOSTNAME|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
    SYS_PANEL=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
    SYS_PANEL_VER=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4)
    SYS_OS=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
    SYS_OS_VER=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4)
fi

# Color definitions
# Stored as literal backslash sequences; they turn into terminal escapes only
# when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Report file
# Created up front with mktemp so the name is not predictable: the report is
# later written with a plain '>' redirect, and a guessable path in
# world-writable /tmp could be pre-planted as a symlink by another user.
# The timestamp prefix and .txt suffix are kept, so existing
# /tmp/hardware_health_report_*.txt globs still match. mktemp creates the
# file (empty, owner-only permissions) immediately.
REPORT_FILE="$(mktemp "/tmp/hardware_health_report_$(date +%Y%m%d_%H%M%S)_XXXXXX.txt" 2>/dev/null)"
if [ -z "$REPORT_FILE" ]; then
    # mktemp unavailable or failed: fall back to the legacy timestamped name
    REPORT_FILE="/tmp/hardware_health_report_$(date +%Y%m%d_%H%M%S).txt"
fi

# Analysis results storage
# Each element is one record produced by add_finding.
declare -a FINDINGS=()

# Record one diagnostic finding for the final report.
#   $1 - severity label (the script uses INFO, WARNING and CRITICAL)
#   $2 - short title
#   $3 - detail text (may span several lines)
#   $4 - recommendation text (may be empty)
# Appends a single record of the form
#   [severity] title@@@SEP@@@details@@@SEP@@@recommendation
# to the global FINDINGS array. @@@SEP@@@ is used as the field separator
# because it is unlikely to occur in the content itself.
add_finding() {
    local record

    # Exactly four arguments are passed to printf, so the format is applied once
    printf -v record '[%s] %s@@@SEP@@@%s@@@SEP@@@%s' "$1" "$2" "$3" "$4"
    FINDINGS+=("$record")
}

# Succeed (status 0) when "$1" names something the shell can run, as resolved
# by `command -v`; fail otherwise. Produces no output.
command_exists() {
    command -v "$1" >/dev/null 2>&1
}

# Helper: print the value of the first "<label>: value" line in captured
# `smartctl -i` output.
#   $1 - smartctl output
#   $2 - field label without the colon; matched case-insensitively at the
#        start of a line, because capitalisation differs between device
#        types (e.g. "Serial Number" vs "Serial number")
# Prints nothing when no such line exists.
_smart_info_field() {
    awk -v label="$2" '
        BEGIN { label = tolower(label) ":" }
        index(tolower($0), label) == 1 {
            value = substr($0, length(label) + 1)
            gsub(/^[ \t]+|[ \t]+$/, "", value)
            print value
            exit
        }' <<< "$1"
}

# Helper: print the leading integer of the raw value (10th column) of the
# first attribute in captured `smartctl -A` output whose name starts with $2.
#   $1 - smartctl output
#   $2 - attribute name prefix (e.g. Reallocated_Sector)
# Prints nothing when the attribute is missing or its raw value does not start
# with digits, so the result is always safe for integer comparison.
_smart_attr_raw() {
    awk -v name="$2" '
        index($2, name) == 1 && match($10, /^[0-9]+/) {
            print substr($10, RSTART, RLENGTH)
            exit
        }' <<< "$1"
}

# Function to check SMART status
# Inspects every whole-disk device reported by lsblk and records one finding
# per disk plus a summary:
#   INFO     - health reported as PASSED (ATA/NVMe) or OK (SCSI/SAS), with no
#              reallocated or pending sectors
#   WARNING  - health passed but reallocated/pending sectors are present, or
#              smartctl -H printed no health status at all
#   CRITICAL - smartctl reported any other health status
# Disks for which `smartctl -i` fails are skipped and counted separately.
# smartctl is invoked once per mode (-i, -H, -A) per disk and the captured
# output is parsed locally.
# Uses: command_exists, add_finding. Takes no arguments.
check_disk_smart() {
    echo -e "${CYAN}[INFO]${NC} Checking disk SMART status..."

    if ! command_exists smartctl; then
        add_finding "INFO" "SMART Tools Not Installed" \
            "smartmontools is not installed - cannot check disk health" \
            "Install SMART tools: yum install smartmontools
After installing, run: systemctl enable smartd && systemctl start smartd"
        return
    fi

    # Find all disks (lsblk TYPE == "disk"; partitions etc. are excluded)
    local disks
    disks=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1}')

    if [ -z "$disks" ]; then
        add_finding "WARNING" "No Disks Found" \
            "Could not detect any disk devices" \
            "Check system configuration: lsblk -a"
        return
    fi

    local disk_count=0 skipped_count=0
    local healthy_count=0 warning_count=0 failed_count=0
    local disk kname info_out health_out attr_out
    local health model serial temp_display
    local reallocated pending uncorrectable temp power_on

    # Device paths contain no whitespace, so word-splitting the list is safe
    for disk in $disks; do
        disk_count=$((disk_count + 1))
        kname="${disk##*/}"   # kernel log lines name the device as e.g. "sda"

        # Skip devices smartctl cannot query at all
        if ! info_out=$(smartctl -i "$disk" 2>/dev/null); then
            skipped_count=$((skipped_count + 1))
            continue
        fi
        health_out=$(smartctl -H "$disk" 2>/dev/null)
        attr_out=$(smartctl -A "$disk" 2>/dev/null)

        # Health verdict: text after the colon on the ATA/NVMe
        # "SMART overall-health ..." line or the SCSI "SMART Health Status" line
        health=$(awk '
            { line = tolower($0) }
            line ~ /smart overall-health/ || line ~ /smart health status/ {
                sub(/^[^:]*:[ \t]*/, "")
                print
                exit
            }' <<< "$health_out")

        # Model: ATA "Device Model", SCSI "Product", NVMe "Model Number"
        model=$(_smart_info_field "$info_out" "Device Model")
        [ -z "$model" ] && model=$(_smart_info_field "$info_out" "Product")
        [ -z "$model" ] && model=$(_smart_info_field "$info_out" "Model Number")
        serial=$(_smart_info_field "$info_out" "Serial Number")

        # Key SMART attributes (empty when the device has no such attribute)
        reallocated=$(_smart_attr_raw "$attr_out" "Reallocated_Sector")
        pending=$(_smart_attr_raw "$attr_out" "Current_Pending_Sector")
        uncorrectable=$(_smart_attr_raw "$attr_out" "Offline_Uncorrectable")
        temp=$(_smart_attr_raw "$attr_out" "Temperature_Celsius")
        power_on=$(_smart_attr_raw "$attr_out" "Power_On_Hours")

        temp_display="N/A"
        [ -n "$temp" ] && temp_display="${temp}°C"

        case "$health" in
            *PASSED* | OK)
                # Check for warning signs even if passed
                if [ "${reallocated:-0}" -gt 0 ]; then
                    warning_count=$((warning_count + 1))
                    add_finding "WARNING" "Disk $disk: Reallocated Sectors Detected" \
                        "Device: $disk
Model: $model
Serial: $serial
Health: $health
Reallocated Sectors: $reallocated
Pending Sectors: ${pending:-0}
Temperature: $temp_display
Power On Hours: ${power_on:-N/A}" \
                        "Disk has reallocated sectors - sign of potential failure
  • Monitor closely: smartctl -A $disk
  • Plan for replacement
  • Ensure backups are current"
                elif [ "${pending:-0}" -gt 0 ]; then
                    warning_count=$((warning_count + 1))
                    add_finding "WARNING" "Disk $disk: Pending Sectors Detected" \
                        "Device: $disk
Model: $model
Serial: $serial
Health: $health
Pending Sectors: $pending
Temperature: $temp_display
Power On Hours: ${power_on:-N/A}" \
                        "Disk has pending sectors - potential read/write issues
  • Monitor closely: smartctl -A $disk
  • Check system logs: grep -i '$kname' /var/log/messages
  • Consider replacement if increasing"
                else
                    healthy_count=$((healthy_count + 1))
                    add_finding "INFO" "Disk $disk: Healthy" \
                        "Device: $disk
Model: $model
Serial: $serial
Health: $health
Reallocated Sectors: ${reallocated:-0}
Pending Sectors: ${pending:-0}
Temperature: $temp_display
Power On Hours: ${power_on:-N/A}" \
                        "Disk is healthy - continue regular monitoring"
                fi
                ;;
            "")
                # No verdict at all is not proof of failure (SMART disabled,
                # RAID/virtual device, USB bridge) - warn instead of alarming
                warning_count=$((warning_count + 1))
                add_finding "WARNING" "Disk $disk: SMART Status Unknown" \
                    "Device: $disk
Model: $model
Serial: $serial
Health: UNKNOWN (smartctl -H reported no health status)
Temperature: $temp_display" \
                    "SMART health could not be determined for this device:
  • SMART may be disabled or unsupported (virtual disk, RAID controller, USB bridge)
  • Check manually: smartctl -a $disk
  • Try enabling SMART: smartctl -s on $disk"
                ;;
            *)
                failed_count=$((failed_count + 1))
                add_finding "CRITICAL" "Disk $disk: SMART FAILURE" \
                    "Device: $disk
Model: $model
Serial: $serial
Health: ${health:-UNKNOWN}
Reallocated Sectors: ${reallocated:-N/A}
Pending Sectors: ${pending:-N/A}
Uncorrectable Sectors: ${uncorrectable:-N/A}
Temperature: $temp_display" \
                    "IMMEDIATE ACTION REQUIRED - Disk failing:
  • Backup all data immediately
  • Replace disk as soon as possible
  • Review SMART details: smartctl -a $disk
  • Check system logs: grep -i '$kname' /var/log/messages"
                ;;
        esac
    done

    # Summary finding
    add_finding "INFO" "Disk Health Summary" \
        "Total disks detected: $disk_count
SMART unavailable (skipped): $skipped_count
Healthy: $healthy_count
Warning: $warning_count
Failed: $failed_count" \
        "Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
}

# Function to check memory health
# Collects DIMM inventory from `dmidecode -t memory` (run once and parsed
# locally), total memory from `free -h`, and memory/ECC error lines from
# dmesg and the system log. Records a single finding:
#   CRITICAL "Memory Errors Detected" when either source has matching lines,
#   INFO     "Memory Health Status"   otherwise.
# When dmidecode is not installed an INFO finding is recorded and nothing
# else is checked.
# System log path: $HW_SYSLOG_FILE if set; otherwise /var/log/messages,
# falling back to /var/log/syslog when the former is not readable.
# Uses: command_exists, add_finding. Takes no arguments.
check_memory_health() {
    echo -e "${CYAN}[INFO]${NC} Checking memory health..."

    if ! command_exists dmidecode; then
        add_finding "INFO" "dmidecode Not Available" \
            "dmidecode is not installed - cannot check memory details" \
            "Install dmidecode: yum install dmidecode"
        return
    fi

    local syslog_file="${HW_SYSLOG_FILE:-/var/log/messages}"
    if [ -z "${HW_SYSLOG_FILE:-}" ] && [ ! -r "$syslog_file" ] && [ -r /var/log/syslog ]; then
        syslog_file="/var/log/syslog"
    fi

    local dmesg_pattern='memory error|ecc error|mcelog'
    local syslog_pattern='memory.*error|ecc.*error'

    # Get memory information
    local dmi_memory
    dmi_memory=$(dmidecode -t memory 2>/dev/null)

    local total_slots
    total_slots=$(grep -c "Memory Device$" <<< "$dmi_memory")

    # Count only the plain "Size:" field of each Memory Device record. Newer
    # dmidecode also prints "Volatile Size:", "Cache Size:" and
    # "Logical Size:", which must not be counted as extra modules.
    local populated_slots
    populated_slots=$(awk '
        /^Memory Device$/ { in_device = 1; next }
        /^[^ \t]/         { in_device = 0 }
        in_device && /^[ \t]+Size:/ && !/No Module Installed/ { count++ }
        END { print count + 0 }' <<< "$dmi_memory")

    # Get total memory
    local total_mem
    total_mem=$(free -h | awk '/Mem:/ {print $2}')

    # ECC is reported as present when the first "Error Correction Type" line
    # exists and is not "None"
    local ecc_label="No"
    if grep -m1 "Error Correction Type" <<< "$dmi_memory" | grep -qv "None"; then
        ecc_label="Yes"
    fi

    # Count matching lines in dmesg and in the system log; default to 0 when
    # a source cannot be read
    local mem_errors hw_mem_errors
    mem_errors=$(dmesg 2>/dev/null | grep -ciE "$dmesg_pattern")
    hw_mem_errors=$(grep -ciE "$syslog_pattern" "$syslog_file" 2>/dev/null)
    mem_errors=${mem_errors:-0}
    hw_mem_errors=${hw_mem_errors:-0}

    if [ "$mem_errors" -gt 0 ] || [ "$hw_mem_errors" -gt 0 ]; then
        # Recent samples: prefer the system log, fall back to dmesg so the
        # section is not empty when only dmesg matched
        local recent_errors
        recent_errors=$(grep -iE "$syslog_pattern" "$syslog_file" 2>/dev/null | tail -5 | sed 's/^/  /')
        if [ -z "$recent_errors" ]; then
            recent_errors=$(dmesg 2>/dev/null | grep -iE "$dmesg_pattern" | tail -5 | sed 's/^/  /')
        fi

        add_finding "CRITICAL" "Memory Errors Detected" \
            "Total Memory: $total_mem
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Memory errors in dmesg: $mem_errors
Hardware errors in logs: $hw_mem_errors

Recent errors:
$recent_errors" \
            "Memory errors detected - investigate immediately:
  • Run memory test: Install and run memtest86+ (reboot required)
  • Check details: dmidecode -t memory
  • Review all errors: grep -i 'memory.*error' $syslog_file
  • If ECC, check: dmidecode -t memory | grep -A 5 'Error Information'
  • Contact hosting provider if virtual machine
  • Replace faulty memory modules"
    else
        # Per-module details, taken only from Memory Device records and
        # re-indented to two spaces
        local mem_modules
        mem_modules=$(awk '
            /^Memory Device$/ { in_device = 1; next }
            /^[^ \t]/         { in_device = 0 }
            in_device && /(Size|Speed|Type|Manufacturer|Part Number):/ {
                sub(/^[ \t]*/, "  ")
                print
            }' <<< "$dmi_memory")

        add_finding "INFO" "Memory Health Status" \
            "Total Memory: $total_mem
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Memory errors: None detected

Installed Modules:
$mem_modules" \
            "Memory appears healthy - no errors detected"
    fi
}

# Function to check CPU health
# Gathers CPU model, physical core count, logical CPU count, current
# frequency, temperature and load average, then looks for machine-check / CPU
# error lines in dmesg and the system log. Records:
#   CRITICAL "CPU Errors Detected" when either source has matching lines,
#   INFO     "CPU Health Status"   otherwise,
# plus an INFO finding when the `sensors` command is not installed.
# Error pattern: a bare "mce" is deliberately NOT matched, because routine
# boot messages such as "mce: CPU supports N MCE banks" contain it; only
# "mce: ... [Hardware Error]", "machine check" and "cpu ... error" lines count.
# System log path: $HW_SYSLOG_FILE if set; otherwise /var/log/messages,
# falling back to /var/log/syslog when the former is not readable.
# Uses: command_exists, add_finding. Takes no arguments.
check_cpu_health() {
    echo -e "${CYAN}[INFO]${NC} Checking CPU health..."

    local syslog_file="${HW_SYSLOG_FILE:-/var/log/messages}"
    if [ -z "${HW_SYSLOG_FILE:-}" ] && [ ! -r "$syslog_file" ] && [ -r /var/log/syslog ]; then
        syslog_file="/var/log/syslog"
    fi

    local error_pattern='mce:.*\[Hardware Error\]|machine check|cpu.*error'

    # Get CPU info
    local cpu_model cpu_cores cpu_threads
    cpu_model=$(awk -F': *' '/^model name/ {print $2; exit}' /proc/cpuinfo 2>/dev/null)

    # Physical cores = distinct (physical id, core id) pairs; when the kernel
    # does not expose those fields, fall back to the number of processors
    cpu_cores=$(awk -F': *' '
        /^physical id/ { pkg = $2 }
        /^core id/     { seen[pkg ":" $2] = 1 }
        /^processor/   { logical++ }
        END {
            n = 0
            for (k in seen) n++
            print (n > 0 ? n : logical + 0)
        }' /proc/cpuinfo 2>/dev/null)

    cpu_threads=$(nproc 2>/dev/null)
    [ -n "$cpu_threads" ] || cpu_threads=$(grep -c "^processor" /proc/cpuinfo 2>/dev/null)

    # Count error lines; default to 0 when a source cannot be read
    local cpu_errors hw_cpu_errors
    cpu_errors=$(dmesg 2>/dev/null | grep -ciE "$error_pattern")
    hw_cpu_errors=$(grep -ciE "$error_pattern" "$syslog_file" 2>/dev/null)
    cpu_errors=${cpu_errors:-0}
    hw_cpu_errors=${hw_cpu_errors:-0}

    # Get current CPU frequency (sysfs value is in kHz); awk is used for the
    # division so `bc` is not required and values below 1 keep a leading zero
    local cpu_freq="" freq_khz
    local freq_file="/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"
    if [ -r "$freq_file" ]; then
        freq_khz=$(cat "$freq_file" 2>/dev/null)
        if [[ "$freq_khz" =~ ^[0-9]+$ ]]; then
            cpu_freq=$(LC_ALL=C awk -v khz="$freq_khz" 'BEGIN { printf "%.2f GHz", khz / 1000000 }')
        fi
    fi

    # Check CPU temperature if sensors available; stays "N/A" when no
    # "Core 0" / "temp1" reading can be parsed
    local cpu_temp="N/A" temp_reading
    if command_exists sensors; then
        temp_reading=$(sensors 2>/dev/null | awk '
            /Core 0|temp1/ && match($0, /\+[0-9.]+/) {
                print substr($0, RSTART + 1, RLENGTH - 1)
                exit
            }')
        [ -n "$temp_reading" ] && cpu_temp="${temp_reading}°C"
    fi

    # Check load average
    local load_avg
    load_avg=$(uptime | sed -n 's/.*load average:[ \t]*//p')

    if [ "$cpu_errors" -gt 0 ] || [ "$hw_cpu_errors" -gt 0 ]; then
        # Recent samples: prefer the system log, fall back to dmesg
        local recent_errors
        recent_errors=$(grep -iE "$error_pattern" "$syslog_file" 2>/dev/null | tail -5 | sed 's/^/  /')
        if [ -z "$recent_errors" ]; then
            recent_errors=$(dmesg 2>/dev/null | grep -iE "$error_pattern" | tail -5 | sed 's/^/  /')
        fi

        add_finding "CRITICAL" "CPU Errors Detected" \
            "CPU Model: $cpu_model
Cores: $cpu_cores
Threads: $cpu_threads
Current Frequency: ${cpu_freq:-N/A}
Temperature: $cpu_temp
Load Average: $load_avg
MCE/CPU errors in dmesg: $cpu_errors
Hardware errors in logs: $hw_cpu_errors

Recent errors:
$recent_errors" \
            "CPU errors detected - critical hardware issue:
  • Check full details: dmesg | grep -i mce
  • Review MCE logs: grep -i 'machine check' $syslog_file
  • Check temperature: sensors (install: yum install lm_sensors)
  • Contact hosting provider/hardware vendor immediately
  • May indicate failing CPU or motherboard"
    else
        add_finding "INFO" "CPU Health Status" \
            "CPU Model: $cpu_model
Cores: $cpu_cores
Threads: $cpu_threads
Current Frequency: ${cpu_freq:-N/A}
Temperature: $cpu_temp
Load Average: $load_avg
Hardware errors: None detected" \
            "CPU appears healthy - no errors detected"
    fi

    # Check if sensors are available for monitoring
    if ! command_exists sensors; then
        add_finding "INFO" "Temperature Monitoring Not Available" \
            "lm_sensors is not installed - cannot monitor CPU/hardware temperatures" \
            "Install sensors for temperature monitoring:
  • yum install lm_sensors
  • sensors-detect (answer YES to all)
  • sensors (view temperatures)"
    fi
}

# Function to check system hardware errors
# Scans the system log once for hardware-related error lines and, when any
# are found, records a WARNING finding with the total count and the last ten
# matching lines. Records nothing when there are no matches or the log cannot
# be read.
# Matched (case-insensitively): "hardware error", "i/o error", "scsi ... error"
# and libata port messages of the form "ataN[.NN]: ... error". The ATA pattern
# is anchored to a port prefix because a bare "ata.*error" also matches any
# line where the word "data" is followed by "error".
# System log path: $HW_SYSLOG_FILE if set; otherwise /var/log/messages,
# falling back to /var/log/syslog when the former is not readable.
# Uses: add_finding. Takes no arguments.
check_hardware_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking system hardware error logs..."

    local syslog_file="${HW_SYSLOG_FILE:-/var/log/messages}"
    if [ -z "${HW_SYSLOG_FILE:-}" ] && [ ! -r "$syslog_file" ] && [ -r /var/log/syslog ]; then
        syslog_file="/var/log/syslog"
    fi

    local pattern='hardware error|i/o error|(^|[^[:alnum:]])ata[0-9]+(\.[0-9]+)?:.*error|scsi.*error'

    # Single pass over the log; count and samples are derived from the capture
    local matches
    matches=$(grep -iE "$pattern" "$syslog_file" 2>/dev/null)
    [ -n "$matches" ] || return 0

    local hw_errors recent_errors
    hw_errors=$(grep -c '' <<< "$matches")
    recent_errors=$(tail -n 10 <<< "$matches" | sed 's/^/  /')

    add_finding "WARNING" "Hardware Errors in System Log" \
        "Total hardware-related errors: $hw_errors

Recent errors (last 10):
$recent_errors" \
        "Hardware errors detected in system logs:
  • Review full log: grep -iE 'hardware error|i/o error' $syslog_file
  • Check dmesg: dmesg | grep -i error | tail -20
  • Identify failing component (disk, memory, CPU, etc.)
  • Run component-specific diagnostics
  • Contact hosting provider if persistent"
}

# Function to check RAID status
# Software RAID: when /proc/mdstat lists an active array, records CRITICAL
# "Software RAID Degraded" if any member-status marker such as [U_] contains
# an underscore (missing member), otherwise INFO "Software RAID Status".
# Hardware RAID: when a MegaCli binary (megacli, MegaCli or MegaCli64) is on
# PATH, records CRITICAL "MegaRAID Degraded" if any virtual drive "State" line
# is not "Optimal", otherwise INFO "MegaRAID Status".
# When neither kind is found, records INFO "No RAID Detected".
# Uses: command_exists, add_finding. Takes no arguments.
check_raid_status() {
    echo -e "${CYAN}[INFO]${NC} Checking RAID status..."

    local raid_found=false

    # Check for software RAID (mdadm)
    if [ -f /proc/mdstat ] && grep -q "active" /proc/mdstat 2>/dev/null; then
        raid_found=true
        local raid_status degraded
        raid_status=$(cat /proc/mdstat)
        # Match only U/_ status markers, not any bracket pair with an underscore
        degraded=$(grep -cE '\[[U_]*_[U_]*\]' <<< "$raid_status")

        if [ "$degraded" -gt 0 ]; then
            add_finding "CRITICAL" "Software RAID Degraded" \
                "RAID array is degraded:

$raid_status" \
                "RAID array degraded - immediate action required:
  • Check details: cat /proc/mdstat
  • Identify failed drive: mdadm --detail /dev/md*
  • Replace failed drive and rebuild array
  • Ensure backups are current"
        else
            add_finding "INFO" "Software RAID Status" \
                "$raid_status" \
                "Software RAID is healthy"
        fi
    fi

    # Check for hardware RAID (common controllers); the MegaCli binary is
    # installed under different names depending on the package
    local megacli_bin="" candidate
    for candidate in megacli MegaCli MegaCli64; do
        if command_exists "$candidate"; then
            megacli_bin="$candidate"
            break
        fi
    done

    if [ -n "$megacli_bin" ]; then
        raid_found=true
        local raid_info bad_states
        raid_info=$("$megacli_bin" -LDInfo -Lall -aALL 2>/dev/null | grep -E "State|Virtual Drive")
        # Number of "State : <value>" lines whose value is not Optimal
        bad_states=$(grep -E '^State[[:space:]]*:' <<< "$raid_info" \
            | grep -vciE ':[[:space:]]*Optimal[[:space:]]*$')

        if [ "${bad_states:-0}" -gt 0 ]; then
            add_finding "CRITICAL" "MegaRAID Degraded" \
                "One or more virtual drives are not in Optimal state:

$raid_info" \
                "Hardware RAID needs attention - immediate action required:
  • Check details: $megacli_bin -LDInfo -Lall -aALL
  • Identify failed drive: $megacli_bin -PDList -aALL
  • Replace failed drive and verify rebuild progress
  • Ensure backups are current"
        else
            add_finding "INFO" "MegaRAID Status" \
                "$raid_info" \
                "Check details: $megacli_bin -LDInfo -Lall -aALL"
        fi
    fi

    if ! $raid_found; then
        add_finding "INFO" "No RAID Detected" \
            "No software or hardware RAID arrays detected" \
            "System appears to use non-RAID storage"
    fi
}

# Function to check disk I/O errors
# Counts kernel ring buffer (dmesg) lines that mention an I/O error,
# blk_update_request or a buffer I/O error (case-insensitive). When at least
# one line matches, records a WARNING finding containing the total and the
# last ten matching lines indented by two spaces; otherwise records nothing.
# Uses: add_finding. Takes no arguments.
check_disk_io_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking disk I/O errors..."

    local error_regex="i/o error|blk_update_request|Buffer I/O error"

    # Check for I/O errors in dmesg
    local match_total
    match_total=$(dmesg | grep -ciE "$error_regex")

    # Nothing matched: leave without recording a finding
    if ! [ "$match_total" -gt 0 ]; then
        return 0
    fi

    local latest_matches
    latest_matches=$(dmesg | grep -iE "$error_regex" | tail -10 | sed 's/^/  /')

    local summary="Total I/O errors in dmesg: $match_total

Recent I/O errors (last 10):
$latest_matches"

    local advice="Disk I/O errors detected - indicates hardware or connection issues:
  • Check SMART status (see above)
  • Review dmesg: dmesg | grep -i 'i/o error'
  • Check cables and connections (if physical server)
  • Check for disk controller issues
  • May indicate failing disk or controller"

    add_finding "WARNING" "Disk I/O Errors Detected" "$summary" "$advice"
}

# Function to generate report
# Builds the plain-text report from the global FINDINGS array, prints it to
# stdout and writes the same text to $REPORT_FILE.
# Layout: header (timestamp, SYS_HOSTNAME, SYS_PANEL/SYS_PANEL_VER,
# SYS_OS/SYS_OS_VER), then one section per non-empty category in the fixed
# order DISK, MEMORY, CPU, RAID, OTHER, then a static "NEXT STEPS" footer.
# A finding's category is chosen from keywords in its title; the first
# matching rule wins and anything unmatched goes to OTHER.
# Takes no arguments.
generate_report() {
    local nl=$'\n'
    local heavy_rule="=============================================================================="
    local light_rule="------------------------------------------------------------------------------"

    # Header
    local report=""
    report+="${heavy_rule}${nl}"
    report+="HARDWARE HEALTH CHECK - $(date '+%Y-%m-%d %H:%M:%S')${nl}"
    report+="${heavy_rule}${nl}${nl}"
    report+="System: $SYS_HOSTNAME${nl}"
    report+="Control Panel: $SYS_PANEL ${SYS_PANEL_VER:-unknown}${nl}"
    report+="OS: $SYS_OS ${SYS_OS_VER:-unknown}${nl}${nl}"

    # Group findings by category
    local -A bucket=([DISK]="" [MEMORY]="" [CPU]="" [RAID]="" [OTHER]="")
    local record head rest body advice level name group block

    for record in "${FINDINGS[@]}"; do
        # Record layout: "[severity] title@@@SEP@@@details@@@SEP@@@recommendation"
        head="${record%%@@@SEP@@@*}"
        rest="${record#*@@@SEP@@@}"
        body="${rest%%@@@SEP@@@*}"
        advice="${rest#*@@@SEP@@@}"

        # Split "[SEVERITY] Title" into its two parts
        level=$(sed -n 's/^\[\([^]]*\)\].*/\1/p' <<< "$head")
        name=$(sed 's/^\[[^]]*\] //' <<< "$head")

        case "$name" in
            *Disk* | *SMART* | *I/O*) group="DISK" ;;
            *Memory* | *ECC*)         group="MEMORY" ;;
            *CPU* | *MCE*)            group="CPU" ;;
            *RAID*)                   group="RAID" ;;
            *)                        group="OTHER" ;;
        esac

        block="[$level] $name${nl}${body}${nl}"
        if [ -n "$advice" ]; then
            block+="Recommendation:${nl}${advice}${nl}"
        fi
        block+="${nl}${light_rule}${nl}${nl}"

        bucket[$group]+="$block"
    done

    # Output sections: "KEY|Heading" pairs in display order; empty ones are skipped
    local section
    for section in \
        "DISK|DISK HEALTH & SMART STATUS" \
        "MEMORY|MEMORY HEALTH" \
        "CPU|CPU HEALTH" \
        "RAID|RAID STATUS" \
        "OTHER|OTHER HARDWARE FINDINGS"; do
        group="${section%%|*}"
        [ -n "${bucket[$group]}" ] || continue
        report+="${heavy_rule}${nl}${section#*|}${nl}${heavy_rule}${nl}${nl}"
        report+="${bucket[$group]}"
    done

    # Footer
    report+="${heavy_rule}${nl}NEXT STEPS${nl}${heavy_rule}${nl}${nl}"
    report+="Priority Actions:${nl}"
    report+="  1. Address any CRITICAL issues immediately${nl}"
    report+="  2. Monitor WARNING issues closely${nl}"
    report+="  3. Schedule regular hardware health checks${nl}${nl}"
    report+="Additional Analysis Available:${nl}"
    report+="  • System Health Check (Main Menu) for overall server health${nl}"
    report+="  • Disk I/O Analyzer (Main Menu → Performance) for disk performance${nl}${nl}"
    report+="Report saved to: $REPORT_FILE${nl}${nl}"

    echo "$report"
    echo "$report" > "$REPORT_FILE"
}

# Main execution
# Prints the banner and title, runs every diagnostic in a fixed order, then
# generates and displays the report and waits via press_enter.
# show_banner and press_enter are not defined in this file.
main() {
    # Diagnostics in the order they are run
    local -a diagnostics=(
        check_disk_smart
        check_memory_health
        check_cpu_health
        check_hardware_errors
        check_raid_status
        check_disk_io_errors
    )
    local step

    show_banner
    echo -e "${MAGENTA}${BOLD}🔧 Hardware Health Check${NC}"
    printf '\n\n'

    echo -e "${CYAN}[INFO]${NC} Starting comprehensive hardware diagnostics..."
    printf '\n'

    # Run diagnostics
    for step in "${diagnostics[@]}"; do
        "$step"
    done

    printf '\n'
    echo -e "${GREEN}[OK]${NC} Hardware diagnostics complete!"
    printf '\n'

    # Generate and display report
    generate_report

    printf '\n'
    echo -e "${GREEN}[INFO]${NC} Full report saved to: ${CYAN}$REPORT_FILE${NC}"
    printf '\n\n'

    press_enter
}

# Run main function
# Script entry point: main is invoked without arguments once every function
# above has been defined.
main