#!/bin/bash
#
# Hardware Health Check
# Comprehensive hardware diagnostics including SMART, memory, CPU, and sensors
#
# Runs a series of independent checks (disk SMART data, memory, CPU, system
# hardware error logs, RAID, disk I/O errors). Each check records "findings"
# in the global FINDINGS array; generate_report groups them by category,
# prints the report and writes a copy to REPORT_FILE.
#
# NOTE(review): detect_system, show_banner and press_enter are not defined in
# this file; they are presumably provided by the toolkit libraries sourced
# below. They are only called when they actually exist.

# Get the script's directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TOOLKIT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Source required libraries. Done at top level (not inside a function) so
# that anything the libraries declare stays global. A missing library is
# reported once instead of producing "command not found" errors later.
for _hw_lib in common-functions.sh system-detect.sh reference-db.sh; do
  if [ -r "$TOOLKIT_ROOT/lib/$_hw_lib" ]; then
    # shellcheck source=/dev/null
    source "$TOOLKIT_ROOT/lib/$_hw_lib"
  else
    echo "[WARN] Toolkit library not found: $TOOLKIT_ROOT/lib/$_hw_lib" >&2
  fi
done
unset _hw_lib

# Initialize system detection
if command -v detect_system >/dev/null 2>&1; then
  detect_system
fi

# Load system info from reference database.
# Records look like: SYS|<KEY>|<field 3>|<field 4>
SYSREF_FILE="$TOOLKIT_ROOT/.sysref"

# Print column $2 of every "SYS|$1|..." record in SYSREF_FILE.
hw_sysref_field() {
  awk -F'|' -v key="$1" -v col="$2" \
    '$1 == "SYS" && $2 == key { print $col }' "$SYSREF_FILE" 2>/dev/null
}

if [ -f "$SYSREF_FILE" ]; then
  SYS_HOSTNAME=$(hw_sysref_field HOSTNAME 3)
  SYS_PANEL=$(hw_sysref_field CONTROL_PANEL 3)
  SYS_PANEL_VER=$(hw_sysref_field CONTROL_PANEL 4)
  SYS_OS=$(hw_sysref_field OS 3)
  SYS_OS_VER=$(hw_sysref_field OS 4)
fi

# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Report file. mktemp creates the file atomically under an unpredictable
# name, so a pre-planted file or symlink in /tmp cannot be overwritten.
# Falls back to the plain timestamped name if mktemp is unavailable.
_hw_stamp="$(date +%Y%m%d_%H%M%S)"
REPORT_FILE="$(mktemp "/tmp/hardware_health_report_${_hw_stamp}_XXXXXX.txt" 2>/dev/null)" \
  || REPORT_FILE="/tmp/hardware_health_report_${_hw_stamp}.txt"
unset _hw_stamp

# System log scanned for hardware errors.
SYSLOG_FILE="/var/log/messages"

# Log patterns (extended regex, matched case-insensitively).
MEM_DMESG_PATTERN='memory error|ecc error|mcelog'
MEM_SYSLOG_PATTERN='memory.*error|ecc.*error'
CPU_ERROR_PATTERN='mce|machine check|cpu.*error'
# Boot banners such as "mce: CPU supports 9 MCE banks" match "mce" but are
# not errors; they are filtered out so healthy hosts are not flagged.
CPU_BENIGN_PATTERN='MCE banks'
HW_ERROR_PATTERN='hardware error|i/o error|ata.*error|scsi.*error'
IO_ERROR_PATTERN='i/o error|blk_update_request|Buffer I/O error'

# Analysis results storage
declare -a FINDINGS=()

#######################################
# Record a finding for the final report.
# Globals:   FINDINGS (appended)
# Arguments: $1 severity (e.g. INFO, WARNING, CRITICAL)
#            $2 title (also used to pick the report category)
#            $3 details
#            $4 recommendation
#######################################
add_finding() {
  local severity="$1"
  local title="$2"
  local details="$3"
  local recommendation="$4"

  # Use @@@SEP@@@ as separator to avoid conflicts with content
  FINDINGS+=("[${severity}] ${title}@@@SEP@@@${details}@@@SEP@@@${recommendation}")
}

# Return 0 if $1 is a command, function or builtin available in this shell.
command_exists() {
  command -v "$1" &>/dev/null
}

# Return 0 if $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Print the kernel ring buffer; silent if dmesg is missing or not permitted.
kernel_log() {
  dmesg 2>/dev/null
}

# Print SYSLOG_FILE; prints nothing if it is missing or unreadable.
syslog_lines() {
  [ -r "$SYSLOG_FILE" ] && cat -- "$SYSLOG_FILE"
}

# Count stdin lines matching extended regex $1 (case-insensitive).
# Always prints a number, 0 included.
count_matching() {
  grep -ciE -- "$1"
}

# Filter stdin down to CPU/MCE error lines, dropping known-benign banners.
filter_cpu_error_lines() {
  grep -iE -- "$CPU_ERROR_PATTERN" | grep -ivE -- "$CPU_BENIGN_PATTERN"
}

# Read `smartctl -H` output on stdin and print the health verdict: the text
# after "SMART overall-health self-assessment test result:" or after
# "SMART Health Status:". Prints nothing if neither line is present.
parse_smart_health() {
  awk '
    tolower($0) ~ /smart overall-health|smart health status/ {
      sub(/^[^:]*:[ \t]*/, "")
      print
      exit
    }
  '
}

# Map a health verdict to ok / failed / unknown.
# "PASSED" (anywhere in the string) and a bare "OK" count as healthy; an
# empty verdict means no SMART health was reported, which is not a failure.
classify_smart_health() {
  case "$1" in
    "") echo "unknown" ;;
    *PASSED* | OK) echo "ok" ;;
    *) echo "failed" ;;
  esac
}

# Print the 10th column (raw value in the attribute table) of the first line
# of `smartctl -A` output ($1) that contains the attribute name ($2).
smart_attr() {
  awk -v name="$2" 'index($0, name) { print $10; exit }' <<<"$1"
}

# Print the value of the first label ($2...) found in `smartctl -i` output
# ($1). Labels are compared case-insensitively because the capitalisation
# differs between device types (e.g. "Serial Number" vs "Serial number").
smart_info_field() {
  local text="$1"
  local label value
  shift
  for label in "$@"; do
    value=$(awk -F':' -v want="$label" '
      tolower($1) == tolower(want) {
        sub(/^[^:]*:[ \t]*/, "")
        print
        exit
      }
    ' <<<"$text")
    if [ -n "$value" ]; then
      printf '%s\n' "$value"
      return 0
    fi
  done
  return 1
}

# Count lines of /proc/mdstat text ($1) whose member map (e.g. "[U_]")
# contains "_", i.e. has a missing member.
mdstat_degraded_count() {
  grep -cE '\[[U_]*_[U_]*\]' <<<"$1"
}

#######################################
# Check SMART status of every disk reported by lsblk.
# Adds one finding per disk plus a summary finding.
# Globals: FINDINGS (appended via add_finding)
#######################################
check_disk_smart() {
  echo -e "${CYAN}[INFO]${NC} Checking disk SMART status..."

  if ! command_exists smartctl; then
    add_finding "INFO" "SMART Tools Not Installed" \
      "smartmontools is not installed - cannot check disk health" \
      "Install SMART tools: yum install smartmontools
After installing, run: systemctl enable smartd && systemctl start smartd"
    return
  fi

  # Find all disks
  local -a disk_list=()
  mapfile -t disk_list < <(
    lsblk -nd -o NAME,TYPE 2>/dev/null | awk '$2 == "disk" { print "/dev/" $1 }'
  )

  if [ "${#disk_list[@]}" -eq 0 ]; then
    add_finding "WARNING" "No Disks Found" \
      "Could not detect any disk devices" \
      "Check system configuration: lsblk -a"
    return
  fi

  local disk_count=0
  local healthy_count=0
  local warning_count=0
  local failed_count=0
  local unknown_count=0
  local disk info rc attrs health state model serial
  local reallocated pending uncorrectable temp power_on

  for disk in "${disk_list[@]}"; do
    disk_count=$((disk_count + 1))

    # smartctl's exit status is a bit mask. Bits 0 and 1 mean the command
    # line could not be parsed or the device could not be opened; only then
    # is there nothing to inspect. Other bits may be set on a failing disk,
    # which must not be skipped.
    info=$(smartctl -i "$disk" 2>/dev/null)
    rc=$?
    if ((rc & 3)); then
      unknown_count=$((unknown_count + 1))
      continue
    fi

    # One smartctl call per data set instead of one per field
    health=$(smartctl -H "$disk" 2>/dev/null | parse_smart_health)
    attrs=$(smartctl -A "$disk" 2>/dev/null)
    state=$(classify_smart_health "$health")

    # Get disk model and serial
    model=$(smart_info_field "$info" "Device Model" "Product" "Model Number")
    serial=$(smart_info_field "$info" "Serial Number")

    # Get key SMART attributes
    reallocated=$(smart_attr "$attrs" "Reallocated_Sector")
    pending=$(smart_attr "$attrs" "Current_Pending_Sector")
    uncorrectable=$(smart_attr "$attrs" "Offline_Uncorrectable")
    temp=$(smart_attr "$attrs" "Temperature_Celsius")
    power_on=$(smart_attr "$attrs" "Power_On_Hours")

    # Determine severity
    if [ "$state" = "failed" ]; then
      failed_count=$((failed_count + 1))
      add_finding "CRITICAL" "Disk $disk: SMART FAILURE" \
        "Device: $disk
Model: $model
Serial: $serial
Health: ${health:-UNKNOWN}
Reallocated Sectors: ${reallocated:-N/A}
Pending Sectors: ${pending:-N/A}
Uncorrectable Sectors: ${uncorrectable:-N/A}
Temperature: ${temp:-N/A}°C" \
        "IMMEDIATE ACTION REQUIRED - Disk failing:
  • Backup all data immediately
  • Replace disk as soon as possible
  • Review SMART details: smartctl -a $disk
  • Check system logs: grep -i '$disk' /var/log/messages"
    elif is_uint "$reallocated" && [ "$reallocated" -gt 0 ]; then
      # Warning signs count even when the overall verdict is not a failure
      warning_count=$((warning_count + 1))
      add_finding "WARNING" "Disk $disk: Reallocated Sectors Detected" \
        "Device: $disk
Model: $model
Serial: $serial
Health: ${health:-UNKNOWN}
Reallocated Sectors: $reallocated
Pending Sectors: ${pending:-0}
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}" \
        "Disk has reallocated sectors - sign of potential failure
  • Monitor closely: smartctl -A $disk
  • Plan for replacement
  • Ensure backups are current"
    elif is_uint "$pending" && [ "$pending" -gt 0 ]; then
      warning_count=$((warning_count + 1))
      add_finding "WARNING" "Disk $disk: Pending Sectors Detected" \
        "Device: $disk
Model: $model
Serial: $serial
Health: ${health:-UNKNOWN}
Pending Sectors: $pending
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}" \
        "Disk has pending sectors - potential read/write issues
  • Monitor closely: smartctl -A $disk
  • Check system logs: grep -i '$disk' /var/log/messages
  • Consider replacement if increasing"
    elif [ "$state" = "ok" ]; then
      healthy_count=$((healthy_count + 1))
      add_finding "INFO" "Disk $disk: Healthy" \
        "Device: $disk
Model: $model
Serial: $serial
Health: $health
Reallocated Sectors: ${reallocated:-0}
Pending Sectors: ${pending:-0}
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}" \
        "Disk is healthy - continue regular monitoring"
    else
      # No health verdict at all (e.g. SMART unsupported): not a failure
      unknown_count=$((unknown_count + 1))
      add_finding "INFO" "Disk $disk: SMART Health Unavailable" \
        "Device: $disk
Model: $model
Serial: $serial
Health: UNKNOWN
Reallocated Sectors: ${reallocated:-N/A}
Pending Sectors: ${pending:-N/A}" \
        "Device did not report a SMART health status
  • Review SMART details: smartctl -a $disk
  • Virtual or RAID-backed disks may not expose SMART data"
    fi
  done

  # Summary finding
  add_finding "INFO" "Disk Health Summary" \
    "Total disks checked: $disk_count
Healthy: $healthy_count
Warning: $warning_count
Failed: $failed_count
SMART unavailable: $unknown_count" \
    "Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
}

#######################################
# Check memory configuration (dmidecode) and memory errors in the kernel
# ring buffer and SYSLOG_FILE.
# Globals: FINDINGS (appended via add_finding)
#######################################
check_memory_health() {
  echo -e "${CYAN}[INFO]${NC} Checking memory health..."

  if ! command_exists dmidecode; then
    add_finding "INFO" "dmidecode Not Available" \
      "dmidecode is not installed - cannot check memory details" \
      "Install dmidecode: yum install dmidecode"
    return
  fi

  # Query dmidecode once and reuse the output
  local dmi
  dmi=$(dmidecode -t memory 2>/dev/null)

  # Get memory information
  local total_slots populated_slots
  total_slots=$(grep -c "Memory Device$" <<<"$dmi")
  populated_slots=$(grep -A 20 "Memory Device" <<<"$dmi" \
    | grep "Size:" \
    | grep -cv "No Module Installed")

  # Get total memory
  local total_mem
  total_mem=$(free -h 2>/dev/null | awk '/^Mem:/ { print $2 }')

  # Check for ECC: the first "Error Correction Type" line is not "None"
  local ecc_support ecc_label="No"
  ecc_support=$(grep -m1 "Error Correction Type" <<<"$dmi" | grep -cv "None")
  if [ "${ecc_support:-0}" -gt 0 ]; then
    ecc_label="Yes"
  fi

  # Check for memory errors in dmesg
  local mem_errors
  mem_errors=$(kernel_log | count_matching "$MEM_DMESG_PATTERN")

  # Check hardware errors in system log
  local hw_mem_errors
  hw_mem_errors=$(syslog_lines | count_matching "$MEM_SYSLOG_PATTERN")

  if [ "$mem_errors" -gt 0 ] || [ "$hw_mem_errors" -gt 0 ]; then
    # Get recent error samples
    local recent_errors
    recent_errors=$(syslog_lines \
      | grep -iE -- "$MEM_SYSLOG_PATTERN" \
      | tail -5 \
      | sed 's/^/  /')

    add_finding "CRITICAL" "Memory Errors Detected" \
      "Total Memory: $total_mem
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Memory errors in dmesg: $mem_errors
Hardware errors in logs: $hw_mem_errors

Recent errors:
$recent_errors" \
      "Memory errors detected - investigate immediately:
  • Run memory test: Install and run memtest86+ (reboot required)
  • Check details: dmidecode -t memory
  • Review all errors: grep -i 'memory.*error' /var/log/messages
  • If ECC, check: dmidecode -t memory | grep -A 5 'Error Information'
  • Contact hosting provider if virtual machine
  • Replace faulty memory modules"
  else
    # Build memory details
    local mem_modules
    mem_modules=$(grep -A 20 "Memory Device" <<<"$dmi" \
      | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:" \
      | sed -E 's/^[[:space:]]*/  /')

    add_finding "INFO" "Memory Health Status" \
      "Total Memory: $total_mem
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Memory errors: None detected

Installed Modules:
$mem_modules" \
      "Memory appears healthy - no errors detected"
  fi
}

#######################################
# Check CPU details and CPU/MCE errors in the kernel ring buffer and
# SYSLOG_FILE.
# Globals: FINDINGS (appended via add_finding)
#######################################
check_cpu_health() {
  echo -e "${CYAN}[INFO]${NC} Checking CPU health..."

  # Get CPU info
  local cpu_model cpu_cores cpu_threads
  cpu_model=$(grep -m1 "model name" /proc/cpuinfo 2>/dev/null \
    | cut -d':' -f2 \
    | sed -E 's/^[[:space:]]*//')
  cpu_threads=$(nproc 2>/dev/null)

  # Physical cores = unique (physical id, core id) pairs. Counting
  # "processor" entries would report logical CPUs; that count is only used
  # as a fallback when /proc/cpuinfo carries no core ids.
  cpu_cores=$(awk -F': *' '
    /^physical id/ { pkg = $2 }
    /^core id/     { seen[pkg "/" $2] = 1 }
    END            { n = 0; for (k in seen) n++; print n }
  ' /proc/cpuinfo 2>/dev/null)
  if ! is_uint "$cpu_cores" || [ "$cpu_cores" -eq 0 ]; then
    cpu_cores=$(grep -c "^processor" /proc/cpuinfo 2>/dev/null)
  fi

  # Check for CPU errors in dmesg
  local cpu_errors
  cpu_errors=$(kernel_log | filter_cpu_error_lines | grep -c '')

  # Check system log
  local hw_cpu_errors
  hw_cpu_errors=$(syslog_lines | filter_cpu_error_lines | grep -c '')

  # Get current CPU frequency
  local cpu_freq=""
  local freq_file="/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"
  local freq_khz
  if [ -r "$freq_file" ]; then
    freq_khz=$(<"$freq_file")
    if is_uint "$freq_khz"; then
      cpu_freq="$(awk -v khz="$freq_khz" 'BEGIN { printf "%.2f", khz / 1000000 }') GHz"
    fi
  fi

  # Check CPU temperature if sensors available; stays N/A when nothing parses
  local cpu_temp="N/A"
  local temp_reading
  if command_exists sensors; then
    temp_reading=$(sensors 2>/dev/null \
      | grep -E "Core 0|temp1" \
      | head -1 \
      | grep -oP '\+\K[0-9.]+' \
      | head -1)
    if [ -n "$temp_reading" ]; then
      cpu_temp="${temp_reading}°C"
    fi
  fi

  # Check load average
  local load_avg
  load_avg=$(uptime | awk -F'load average:' '{ print $2 }' | sed -E 's/^[[:space:]]*//')

  if [ "$cpu_errors" -gt 0 ] || [ "$hw_cpu_errors" -gt 0 ]; then
    local recent_errors
    recent_errors=$(syslog_lines | filter_cpu_error_lines | tail -5 | sed 's/^/  /')

    add_finding "CRITICAL" "CPU Errors Detected" \
      "CPU Model: $cpu_model
Cores: $cpu_cores
Threads: $cpu_threads
Current Frequency: ${cpu_freq:-N/A}
Temperature: $cpu_temp
Load Average: $load_avg
MCE/CPU errors in dmesg: $cpu_errors
Hardware errors in logs: $hw_cpu_errors

Recent errors:
$recent_errors" \
      "CPU errors detected - critical hardware issue:
  • Check full details: dmesg | grep -i mce
  • Review MCE logs: grep -i 'machine check' /var/log/messages
  • Check temperature: sensors (install: yum install lm_sensors)
  • Contact hosting provider/hardware vendor immediately
  • May indicate failing CPU or motherboard"
  else
    add_finding "INFO" "CPU Health Status" \
      "CPU Model: $cpu_model
Cores: $cpu_cores
Threads: $cpu_threads
Current Frequency: ${cpu_freq:-N/A}
Temperature: $cpu_temp
Load Average: $load_avg
Hardware errors: None detected" \
      "CPU appears healthy - no errors detected"
  fi

  # Check if sensors are available for monitoring
  if ! command_exists sensors; then
    add_finding "INFO" "Temperature Monitoring Not Available" \
      "lm_sensors is not installed - cannot monitor CPU/hardware temperatures" \
      "Install sensors for temperature monitoring:
  • yum install lm_sensors
  • sensors-detect (answer YES to all)
  • sensors (view temperatures)"
  fi
}

#######################################
# Check SYSLOG_FILE for general hardware errors.
# Globals: FINDINGS (appended via add_finding)
#######################################
check_hardware_errors() {
  echo -e "${CYAN}[INFO]${NC} Checking system hardware error logs..."

  # Check for general hardware errors
  local hw_errors
  hw_errors=$(syslog_lines | count_matching "$HW_ERROR_PATTERN")

  if [ "$hw_errors" -gt 0 ]; then
    local recent_errors
    recent_errors=$(syslog_lines \
      | grep -iE -- "$HW_ERROR_PATTERN" \
      | tail -10 \
      | sed 's/^/  /')

    add_finding "WARNING" "Hardware Errors in System Log" \
      "Total hardware-related errors: $hw_errors

Recent errors (last 10):
$recent_errors" \
      "Hardware errors detected in system logs:
  • Review full log: grep -iE 'hardware error|i/o error' /var/log/messages
  • Check dmesg: dmesg | grep -i error | tail -20
  • Identify failing component (disk, memory, CPU, etc.)
  • Run component-specific diagnostics
  • Contact hosting provider if persistent"
  fi
}

#######################################
# Check software RAID (/proc/mdstat) and MegaRAID (megacli) status.
# Globals: FINDINGS (appended via add_finding)
#######################################
check_raid_status() {
  echo -e "${CYAN}[INFO]${NC} Checking RAID status..."

  local raid_found=false

  # Check for software RAID (mdadm)
  if [ -f /proc/mdstat ] && grep -q "active" /proc/mdstat 2>/dev/null; then
    raid_found=true
    local raid_status degraded
    raid_status=$(cat /proc/mdstat)
    degraded=$(mdstat_degraded_count "$raid_status")

    if [ "$degraded" -gt 0 ]; then
      add_finding "CRITICAL" "Software RAID Degraded" \
        "RAID array is degraded:
$raid_status" \
        "RAID array degraded - immediate action required:
  • Check details: cat /proc/mdstat
  • Identify failed drive: mdadm --detail /dev/md*
  • Replace failed drive and rebuild array
  • Ensure backups are current"
    else
      add_finding "INFO" "Software RAID Status" \
        "$raid_status" \
        "Software RAID is healthy"
    fi
  fi

  # Check for hardware RAID (common controllers)
  if command_exists megacli; then
    raid_found=true
    local raid_info
    raid_info=$(megacli -LDInfo -Lall -aALL 2>/dev/null | grep -E "State|Virtual Drive")
    add_finding "INFO" "MegaRAID Status" \
      "$raid_info" \
      "Check details: megacli -LDInfo -Lall -aALL"
  fi

  if [ "$raid_found" = false ]; then
    add_finding "INFO" "No RAID Detected" \
      "No software or hardware RAID arrays detected" \
      "System appears to use non-RAID storage"
  fi
}

#######################################
# Check the kernel ring buffer for disk I/O errors.
# Globals: FINDINGS (appended via add_finding)
#######################################
check_disk_io_errors() {
  echo -e "${CYAN}[INFO]${NC} Checking disk I/O errors..."

  # Check for I/O errors in dmesg
  local io_errors
  io_errors=$(kernel_log | count_matching "$IO_ERROR_PATTERN")

  if [ "$io_errors" -gt 0 ]; then
    local recent_io_errors
    recent_io_errors=$(kernel_log \
      | grep -iE -- "$IO_ERROR_PATTERN" \
      | tail -10 \
      | sed 's/^/  /')

    add_finding "WARNING" "Disk I/O Errors Detected" \
      "Total I/O errors in dmesg: $io_errors

Recent I/O errors (last 10):
$recent_io_errors" \
      "Disk I/O errors detected - indicates hardware or connection issues:
  • Check SMART status (see above)
  • Review dmesg: dmesg | grep -i 'i/o error'
  • Check cables and connections (if physical server)
  • Check for disk controller issues
  • May indicate failing disk or controller"
  fi
}

#######################################
# Build the report from FINDINGS, print it and save it to REPORT_FILE.
# Findings are grouped by keywords in their title (DISK, MEMORY, CPU, RAID,
# otherwise OTHER); empty groups are omitted.
# Globals: FINDINGS, REPORT_FILE, SYS_* (read)
# Outputs: the report on stdout
# Returns: 0 on success, 1 if the report file could not be written
#######################################
generate_report() {
  local rule_heavy="=============================================================================="
  local rule_light="------------------------------------------------------------------------------"
  local nl=$'\n'
  local report_content=""

  report_content+="${rule_heavy}${nl}"
  report_content+="HARDWARE HEALTH CHECK - $(date '+%Y-%m-%d %H:%M:%S')${nl}"
  report_content+="${rule_heavy}${nl}"
  report_content+="${nl}"
  report_content+="System: $SYS_HOSTNAME${nl}"
  report_content+="Control Panel: $SYS_PANEL ${SYS_PANEL_VER:-unknown}${nl}"
  report_content+="OS: $SYS_OS ${SYS_OS_VER:-unknown}${nl}"
  report_content+="${nl}"

  # Group findings by category; section_order fixes the output order
  local -a section_order=(DISK MEMORY CPU RAID OTHER)
  local -A section_title=(
    [DISK]="DISK HEALTH & SMART STATUS"
    [MEMORY]="MEMORY HEALTH"
    [CPU]="CPU HEALTH"
    [RAID]="RAID STATUS"
    [OTHER]="OTHER HARDWARE FINDINGS"
  )
  local -A categories=(
    [DISK]=""
    [MEMORY]=""
    [CPU]=""
    [RAID]=""
    [OTHER]=""
  )

  local finding header rest details recommendation severity title category entry
  for finding in "${FINDINGS[@]}"; do
    # Split "[SEVERITY] Title@@@SEP@@@details@@@SEP@@@recommendation"
    header="${finding%%@@@SEP@@@*}"
    rest="${finding#*@@@SEP@@@}"
    details="${rest%%@@@SEP@@@*}"
    recommendation="${rest#*@@@SEP@@@}"

    # Extract severity and title from the "[SEVERITY] Title" header
    severity="${header#\[}"
    severity="${severity%%\]*}"
    title="${header#*\] }"

    # First matching keyword group wins
    case "$title" in
      *Disk* | *SMART* | *I/O*) category="DISK" ;;
      *Memory* | *ECC*) category="MEMORY" ;;
      *CPU* | *MCE*) category="CPU" ;;
      *RAID*) category="RAID" ;;
      *) category="OTHER" ;;
    esac

    entry="[$severity] $title${nl}"
    entry+="$details${nl}"
    if [ -n "$recommendation" ]; then
      entry+="Recommendation:${nl}"
      entry+="$recommendation${nl}"
    fi
    entry+="${nl}"
    entry+="${rule_light}${nl}"
    entry+="${nl}"

    categories[$category]+="$entry"
  done

  # Output sections
  local key
  for key in "${section_order[@]}"; do
    [ -n "${categories[$key]}" ] || continue
    report_content+="${rule_heavy}${nl}"
    report_content+="${section_title[$key]}${nl}"
    report_content+="${rule_heavy}${nl}"
    report_content+="${nl}"
    report_content+="${categories[$key]}"
  done

  report_content+="${rule_heavy}${nl}"
  report_content+="NEXT STEPS${nl}"
  report_content+="${rule_heavy}${nl}"
  report_content+="${nl}"
  report_content+="Priority Actions:${nl}"
  report_content+="  1. Address any CRITICAL issues immediately${nl}"
  report_content+="  2. Monitor WARNING issues closely${nl}"
  report_content+="  3. Schedule regular hardware health checks${nl}"
  report_content+="${nl}"
  report_content+="Additional Analysis Available:${nl}"
  report_content+="  • System Health Check (Main Menu) for overall server health${nl}"
  report_content+="  • Disk I/O Analyzer (Main Menu → Performance) for disk performance${nl}"
  report_content+="${nl}"
  report_content+="Report saved to: $REPORT_FILE${nl}"
  report_content+="${nl}"

  # Print first so the report is visible even if saving fails
  printf '%s\n' "$report_content"
  if ! printf '%s\n' "$report_content" > "$REPORT_FILE"; then
    echo -e "${RED}[ERROR]${NC} Could not write report to $REPORT_FILE" >&2
    return 1
  fi
}

#######################################
# Main execution: run every check, then generate and display the report.
#######################################
main() {
  if command_exists show_banner; then
    show_banner
  fi

  echo -e "${MAGENTA}${BOLD}🔧 Hardware Health Check${NC}"
  echo ""
  echo ""

  echo -e "${CYAN}[INFO]${NC} Starting comprehensive hardware diagnostics..."
  echo ""

  # Run diagnostics
  check_disk_smart
  check_memory_health
  check_cpu_health
  check_hardware_errors
  check_raid_status
  check_disk_io_errors

  echo ""
  echo -e "${GREEN}[OK]${NC} Hardware diagnostics complete!"
  echo ""

  # Generate and display report
  if generate_report; then
    echo ""
    echo -e "${GREEN}[INFO]${NC} Full report saved to: ${CYAN}$REPORT_FILE${NC}"
  fi
  echo ""
  echo ""

  if command_exists press_enter; then
    press_enter
  fi
}

# Run main function
main "$@"