Files
Linux-Server-Management-Too…/modules/performance/hardware-health-check.sh
T
cschantz a51d968185 Initial commit: Server Management Toolkit v2.0
- Complete security menu restructure (3-mode: Analysis/Actions/Live)
- Intelligent cPHulk enablement with CSF whitelist import
- Live network security monitoring dashboard
- Multi-source threat detection and classification
- 50+ organized security tools across 4-level menu hierarchy
- System health diagnostics with cPanel/WHM integration
- Reference database for cross-module intelligence sharing
2025-11-03 18:21:40 -05:00

567 lines
21 KiB
Bash
Executable File

#!/bin/bash
# Hardware Health Check
# Comprehensive hardware diagnostics including SMART, memory, CPU, and sensors
# Locate this module and the toolkit root (two directory levels above it)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TOOLKIT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Load the shared toolkit libraries
. "${TOOLKIT_ROOT}/lib/common-functions.sh"
. "${TOOLKIT_ROOT}/lib/system-detect.sh"
. "${TOOLKIT_ROOT}/lib/reference-db.sh"

# Run system detection (detect_system is not defined in this file; presumably
# it is provided by one of the libraries sourced above)
detect_system

# Read cached system facts from the reference database when it exists.
# Matching records start with "SYS|<KEY>|"; field 3 is used as the value and
# field 4 as the version.
if [[ -f "${TOOLKIT_ROOT}/.sysref" ]]; then
    SYS_HOSTNAME=$(grep '^SYS|HOSTNAME|' "${TOOLKIT_ROOT}/.sysref" 2>/dev/null | cut -d '|' -f 3)
    SYS_PANEL=$(grep '^SYS|CONTROL_PANEL|' "${TOOLKIT_ROOT}/.sysref" 2>/dev/null | cut -d '|' -f 3)
    SYS_PANEL_VER=$(grep '^SYS|CONTROL_PANEL|' "${TOOLKIT_ROOT}/.sysref" 2>/dev/null | cut -d '|' -f 4)
    SYS_OS=$(grep '^SYS|OS|' "${TOOLKIT_ROOT}/.sysref" 2>/dev/null | cut -d '|' -f 3)
    SYS_OS_VER=$(grep '^SYS|OS|' "${TOOLKIT_ROOT}/.sysref" 2>/dev/null | cut -d '|' -f 4)
fi
# Color definitions
# ANSI escape sequences are kept as literal backslash text; they are
# interpreted at print time by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Report file
# The file is created up front with mktemp so its name is not predictable and
# it cannot be a pre-planted symlink in the world-writable /tmp directory (the
# report is later written with a plain redirect). The name keeps the
# "hardware_health_report_<timestamp>" prefix and ".txt" suffix. If mktemp is
# unavailable or rejects the template, fall back to the legacy fixed name.
_hhc_stamp="$(date +%Y%m%d_%H%M%S)"
if ! REPORT_FILE="$(mktemp "/tmp/hardware_health_report_${_hhc_stamp}_XXXXXX.txt" 2>/dev/null)"; then
    REPORT_FILE="/tmp/hardware_health_report_${_hhc_stamp}.txt"
fi
unset _hhc_stamp

# Analysis results storage: one string per finding, appended by add_finding
declare -a FINDINGS=()
# Record one diagnostic finding in the global FINDINGS array.
# Arguments:
#   $1 - severity label (callers in this file use INFO, WARNING, CRITICAL)
#   $2 - short title
#   $3 - detail text (may span several lines)
#   $4 - recommendation text (may span several lines)
# Each finding is stored as a single string of the form
#   "[severity] title@@@SEP@@@details@@@SEP@@@recommendation"
# The @@@SEP@@@ marker is used because it will not occur in normal content.
add_finding() {
    local sep='@@@SEP@@@'
    local heading="[$1] $2"
    local body="$3"
    local advice="$4"

    FINDINGS+=("${heading}${sep}${body}${sep}${advice}")
}
# Report whether a command name can be resolved by the current shell.
# Arguments: $1 - command name to look up
# Returns:   the exit status of `command -v` (0 when the name resolves)
# Outputs:   nothing; both stdout and stderr of the lookup are discarded
command_exists() {
    command -v "$1" > /dev/null 2>&1
}
# Print the value part of the first line of $1 that begins with label $2.
# Arguments:
#   $1 - smartctl output text
#   $2 - leading label text to look for (compared case-insensitively, because
#        ATA output uses "Serial Number" while SCSI output uses "Serial number")
# Outputs: the text after the first colon on that line, with surrounding
#          whitespace trimmed; nothing when no line matches.
_smart_info_field() {
    awk -v label="$2" '
        BEGIN { want = tolower(label) }
        index(tolower($0), want) == 1 {
            sub(/^[^:]*:[ \t]*/, "")
            sub(/[ \t\r]+$/, "")
            print
            exit
        }' <<< "$1"
}

# Print the leading integer of the RAW_VALUE column (field 10) of the first
# attribute row in `smartctl -A` output whose name (field 2) starts with $2.
# Arguments:
#   $1 - `smartctl -A` output text
#   $2 - attribute name prefix, e.g. Reallocated_Sector
# Outputs: digits only, so the result is always safe in arithmetic; nothing
#          when the attribute is absent or its raw value is not numeric.
_smart_attr_raw() {
    awk -v name="$2" '
        index($2, name) == 1 {
            if (match($10, /^[0-9]+/)) {
                print substr($10, RSTART, RLENGTH)
            }
            exit
        }' <<< "$1"
}

# Check SMART health of every whole-disk block device and record findings.
# Globals:   CYAN, NC (read); FINDINGS (appended to through add_finding)
# Arguments: none
# Outputs:   a progress line on stdout
#
# Per disk, the overall health result is classified as:
#   PASSED (ATA/NVMe wording) or OK (SCSI/SAS wording) -> healthy, or WARNING
#       when reallocated or pending sectors are non-zero
#   no health line at all -> WARNING "SMART Status Unknown" (the device did
#       not report a result, so it is not presented as a confirmed failure)
#   anything else -> CRITICAL "SMART FAILURE"
# Disks that `smartctl -i` cannot query are skipped and counted separately.
check_disk_smart() {
    echo -e "${CYAN}[INFO]${NC} Checking disk SMART status..."

    if ! command_exists smartctl; then
        add_finding "INFO" "SMART Tools Not Installed" \
            "smartmontools is not installed - cannot check disk health" \
            "Install SMART tools: yum install smartmontools
After installing, run: systemctl enable smartd && systemctl start smartd"
        return
    fi

    # Whole disks only: lsblk -d omits partitions, awk keeps TYPE == disk
    local disks
    disks=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1}')
    if [ -z "$disks" ]; then
        add_finding "WARNING" "No Disks Found" \
            "Could not detect any disk devices" \
            "Check system configuration: lsblk -a"
        return
    fi

    local disk info_out health_out attr_out
    local health model serial reallocated pending uncorrectable temp power_on
    local disk_count=0 healthy_count=0 warning_count=0 failed_count=0
    local unknown_count=0 skipped_count=0

    # $disks is a whitespace-separated list of /dev paths; splitting is intended
    for disk in $disks; do
        # Skip devices smartctl cannot identify at all
        if ! info_out=$(smartctl -i "$disk" 2>/dev/null); then
            skipped_count=$((skipped_count + 1))
            continue
        fi
        disk_count=$((disk_count + 1))

        # One smartctl call per data set; every field is parsed from these
        health_out=$(smartctl -H "$disk" 2>/dev/null)
        attr_out=$(smartctl -A "$disk" 2>/dev/null)

        # ATA/NVMe print "SMART overall-health ...: PASSED",
        # SCSI/SAS print "SMART Health Status: OK"
        health=$(_smart_info_field "$health_out" "SMART overall-health")
        [ -z "$health" ] && health=$(_smart_info_field "$health_out" "SMART Health Status")

        # Model label differs by transport: ATA, then SCSI, then NVMe
        model=$(_smart_info_field "$info_out" "Device Model:")
        [ -z "$model" ] && model=$(_smart_info_field "$info_out" "Product:")
        [ -z "$model" ] && model=$(_smart_info_field "$info_out" "Model Number:")
        serial=$(_smart_info_field "$info_out" "Serial Number:")

        reallocated=$(_smart_attr_raw "$attr_out" "Reallocated_Sector")
        pending=$(_smart_attr_raw "$attr_out" "Current_Pending_Sector")
        uncorrectable=$(_smart_attr_raw "$attr_out" "Offline_Uncorrectable")
        temp=$(_smart_attr_raw "$attr_out" "Temperature_Celsius")
        power_on=$(_smart_attr_raw "$attr_out" "Power_On_Hours")

        case "$health" in
            PASSED|OK)
                # Overall result is good; still look for early warning signs.
                # 10# forces base 10 so a zero-padded value is not read as octal.
                if (( 10#${reallocated:-0} > 0 )); then
                    warning_count=$((warning_count + 1))
                    add_finding "WARNING" "Disk $disk: Reallocated Sectors Detected" \
                        "Device: $disk
Model: $model
Serial: $serial
Health: $health
Reallocated Sectors: $reallocated
Pending Sectors: ${pending:-0}
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}" \
                        "Disk has reallocated sectors - sign of potential failure
• Monitor closely: smartctl -A $disk
• Plan for replacement
• Ensure backups are current"
                elif (( 10#${pending:-0} > 0 )); then
                    warning_count=$((warning_count + 1))
                    add_finding "WARNING" "Disk $disk: Pending Sectors Detected" \
                        "Device: $disk
Model: $model
Serial: $serial
Health: $health
Pending Sectors: $pending
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}" \
                        "Disk has pending sectors - potential read/write issues
• Monitor closely: smartctl -A $disk
• Check system logs: grep -i '$disk' /var/log/messages
• Consider replacement if increasing"
                else
                    healthy_count=$((healthy_count + 1))
                    add_finding "INFO" "Disk $disk: Healthy" \
                        "Device: $disk
Model: $model
Serial: $serial
Health: $health
Reallocated Sectors: ${reallocated:-0}
Pending Sectors: ${pending:-0}
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}" \
                        "Disk is healthy - continue regular monitoring"
                fi
                ;;
            "")
                # No health verdict was reported; do not call this a failure
                unknown_count=$((unknown_count + 1))
                add_finding "WARNING" "Disk $disk: SMART Status Unknown" \
                    "Device: $disk
Model: ${model:-N/A}
Serial: ${serial:-N/A}
Health: UNKNOWN (device reported no overall health result)" \
                    "SMART health could not be determined for this device:
• Review SMART details: smartctl -a $disk
• Virtual disks and disks behind a RAID controller may not expose SMART directly
• Check system logs: grep -i '$disk' /var/log/messages"
                ;;
            *)
                failed_count=$((failed_count + 1))
                add_finding "CRITICAL" "Disk $disk: SMART FAILURE" \
                    "Device: $disk
Model: $model
Serial: $serial
Health: $health
Reallocated Sectors: ${reallocated:-N/A}
Pending Sectors: ${pending:-N/A}
Uncorrectable Sectors: ${uncorrectable:-N/A}
Temperature: ${temp:-N/A}°C" \
                    "IMMEDIATE ACTION REQUIRED - Disk failing:
• Backup all data immediately
• Replace disk as soon as possible
• Review SMART details: smartctl -a $disk
• Check system logs: grep -i '$disk' /var/log/messages"
                ;;
        esac
    done

    # Summary finding
    add_finding "INFO" "Disk Health Summary" \
        "Total disks checked: $disk_count
Healthy: $healthy_count
Warning: $warning_count
Failed: $failed_count
Unknown: $unknown_count
Skipped (SMART unavailable): $skipped_count" \
        "Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
}
# Check installed memory and look for memory/ECC errors, recording one finding.
# Globals:   CYAN, NC (read); FINDINGS (appended to through add_finding)
# Arguments: $1 - optional system log to scan (default: /var/log/messages)
# Outputs:   a progress line on stdout
#
# A CRITICAL finding is recorded when matching lines are found in dmesg or in
# the system log; otherwise an INFO finding lists the installed modules.
check_memory_health() {
    local log_file="${1:-/var/log/messages}"
    echo -e "${CYAN}[INFO]${NC} Checking memory health..."

    if ! command_exists dmidecode; then
        add_finding "INFO" "dmidecode Not Available" \
            "dmidecode is not installed - cannot check memory details" \
            "Install dmidecode: yum install dmidecode"
        return
    fi

    # Query the DMI tables once and parse everything from the captured text
    local dmi_out
    dmi_out=$(dmidecode -t memory 2>/dev/null)

    # Slot counts. "Size:" is anchored to the start of the field so other
    # labels that merely end in "Size:" are not counted as modules.
    local total_slots populated_slots
    total_slots=$(grep -c "Memory Device$" <<< "$dmi_out")
    populated_slots=$(grep -A 20 "Memory Device$" <<< "$dmi_out" \
        | grep -E "^[[:space:]]*Size:" \
        | grep -cv "No Module Installed")

    # Total memory as reported by free
    local total_mem
    total_mem=$(free -h | awk '/^Mem:/ {print $2}')

    # ECC is reported as present when the first "Error Correction Type" line
    # does not say "None"
    local ecc_label="No"
    if grep -m1 "Error Correction Type" <<< "$dmi_out" | grep -qv "None"; then
        ecc_label="Yes"
    fi

    # Collect matching lines once; counts and samples come from the same data
    local dmesg_hits log_hits mem_errors hw_mem_errors
    dmesg_hits=$(dmesg | grep -iE "memory error|ecc error|mcelog")
    log_hits=$(grep -iE "memory.*error|ecc.*error" "$log_file" 2>/dev/null)
    mem_errors=$(grep -c . <<< "$dmesg_hits")
    hw_mem_errors=$(grep -c . <<< "$log_hits")

    if [ "$mem_errors" -gt 0 ] || [ "$hw_mem_errors" -gt 0 ]; then
        # Sample from the system log, or from dmesg when the log has no
        # matches, so the "Recent errors" section is never left blank
        local sample_src="$log_hits"
        [ -n "$sample_src" ] || sample_src="$dmesg_hits"
        local recent_errors
        recent_errors=$(tail -5 <<< "$sample_src" | sed 's/^/ /')

        add_finding "CRITICAL" "Memory Errors Detected" \
            "Total Memory: $total_mem
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Memory errors in dmesg: $mem_errors
Hardware errors in logs: $hw_mem_errors
Recent errors:
$recent_errors" \
            "Memory errors detected - investigate immediately:
• Run memory test: Install and run memtest86+ (reboot required)
• Check details: dmidecode -t memory
• Review all errors: grep -i 'memory.*error' $log_file
• If ECC, check: dmidecode -t memory | grep -A 5 'Error Information'
• Contact hosting provider if virtual machine
• Replace faulty memory modules"
    else
        # Per-module summary lines, shown only in the healthy report
        local mem_modules
        mem_modules=$(grep -A 20 "Memory Device" <<< "$dmi_out" \
            | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:" \
            | sed 's/^[ \t]*/ /')

        add_finding "INFO" "Memory Health Status" \
            "Total Memory: $total_mem
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Memory errors: None detected
Installed Modules:
$mem_modules" \
            "Memory appears healthy - no errors detected"
    fi
}
# Collect CPU facts and look for machine-check / CPU errors, recording findings.
# Globals:   CYAN, NC (read); FINDINGS (appended to through add_finding)
# Arguments: $1 - optional system log to scan (default: /var/log/messages)
# Outputs:   a progress line on stdout
#
# Error detection looks for "machine check", "mce:" lines that mention an
# error or exception, and "cpu ... error". A bare "mce" match is deliberately
# not used: informational boot messages such as "mce: CPU supports N MCE
# banks" would otherwise be reported as CRITICAL on healthy systems.
check_cpu_health() {
    local log_file="${1:-/var/log/messages}"
    echo -e "${CYAN}[INFO]${NC} Checking CPU health..."

    # CPU model, physical cores and logical threads.
    # Cores are counted as unique (physical id, core id) pairs; when
    # /proc/cpuinfo does not expose those fields the number of "processor"
    # entries is used instead.
    local cpu_model cpu_cores cpu_threads
    cpu_model=$(grep -m1 "model name" /proc/cpuinfo 2>/dev/null | cut -d':' -f2 | sed 's/^[ \t]*//')
    cpu_cores=$(awk '
        /^processor/   { logical++ }
        /^physical id/ { pkg = $NF }
        /^core id/     { cores[pkg ":" $NF] = 1 }
        END {
            n = 0
            for (k in cores) n++
            print (n > 0 ? n : logical + 0)
        }' /proc/cpuinfo 2>/dev/null)
    cpu_threads=$(nproc)

    # Error signatures, plus known-benign lines to drop (the service start
    # message of the MCE logging daemon contains "Machine Check Exception")
    local error_pattern='machine check|mce:.*(error|exception)|cpu.*error'
    local benign_pattern='machine check exception logging daemon'

    # Collect matching lines once; counts and samples come from the same data
    local dmesg_hits log_hits cpu_errors hw_cpu_errors
    dmesg_hits=$(dmesg | grep -iE "$error_pattern" | grep -viE "$benign_pattern")
    log_hits=$(grep -iE "$error_pattern" "$log_file" 2>/dev/null | grep -viE "$benign_pattern")
    cpu_errors=$(grep -c . <<< "$dmesg_hits")
    hw_cpu_errors=$(grep -c . <<< "$log_hits")

    # Current frequency of cpu0; the sysfs value is in kHz. awk does the
    # conversion so the script does not depend on bc being installed.
    local cpu_freq=""
    local freq_file="/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"
    local freq_khz
    if [ -r "$freq_file" ]; then
        freq_khz=$(< "$freq_file")
        if [[ "$freq_khz" =~ ^[0-9]+$ ]]; then
            cpu_freq=$(awk -v khz="$freq_khz" 'BEGIN { printf "%.2f GHz", khz / 1000000 }')
        fi
    fi

    # CPU temperature: first "Core 0"/"temp1" line from sensors, first +N.N
    # value on it. Stays "N/A" when sensors is missing or prints no reading.
    local cpu_temp="N/A"
    local temp_reading
    if command_exists sensors; then
        temp_reading=$(sensors 2>/dev/null | awk '
            /Core 0|temp1/ {
                if (match($0, /\+[0-9.]+/)) {
                    print substr($0, RSTART + 1, RLENGTH - 1)
                }
                exit
            }')
        [ -n "$temp_reading" ] && cpu_temp="${temp_reading}°C"
    fi

    # Load average
    local load_avg
    load_avg=$(uptime | awk -F'load average:' '{print $2}' | sed 's/^[ \t]*//')

    if [ "$cpu_errors" -gt 0 ] || [ "$hw_cpu_errors" -gt 0 ]; then
        # Sample from the system log, or from dmesg when the log has no
        # matches, so the "Recent errors" section is never left blank
        local sample_src="$log_hits"
        [ -n "$sample_src" ] || sample_src="$dmesg_hits"
        local recent_errors
        recent_errors=$(tail -5 <<< "$sample_src" | sed 's/^/ /')

        add_finding "CRITICAL" "CPU Errors Detected" \
            "CPU Model: $cpu_model
Cores: $cpu_cores
Threads: $cpu_threads
Current Frequency: ${cpu_freq:-N/A}
Temperature: $cpu_temp
Load Average: $load_avg
MCE/CPU errors in dmesg: $cpu_errors
Hardware errors in logs: $hw_cpu_errors
Recent errors:
$recent_errors" \
            "CPU errors detected - critical hardware issue:
• Check full details: dmesg | grep -i mce
• Review MCE logs: grep -i 'machine check' $log_file
• Check temperature: sensors (install: yum install lm_sensors)
• Contact hosting provider/hardware vendor immediately
• May indicate failing CPU or motherboard"
    else
        add_finding "INFO" "CPU Health Status" \
            "CPU Model: $cpu_model
Cores: $cpu_cores
Threads: $cpu_threads
Current Frequency: ${cpu_freq:-N/A}
Temperature: $cpu_temp
Load Average: $load_avg
Hardware errors: None detected" \
            "CPU appears healthy - no errors detected"
    fi

    # Tell the operator how to enable temperature monitoring when it is absent
    if ! command_exists sensors; then
        add_finding "INFO" "Temperature Monitoring Not Available" \
            "lm_sensors is not installed - cannot monitor CPU/hardware temperatures" \
            "Install sensors for temperature monitoring:
• yum install lm_sensors
• sensors-detect (answer YES to all)
• sensors (view temperatures)"
    fi
}
# Scan the system log for general hardware error messages and record a
# WARNING finding when any are present. Nothing is recorded for a clean log.
# Globals:   CYAN, NC (read); FINDINGS (appended to through add_finding)
# Arguments: $1 - optional system log to scan (default: /var/log/messages)
# Outputs:   a progress line on stdout
check_hardware_errors() {
    local log_file="${1:-/var/log/messages}"
    echo -e "${CYAN}[INFO]${NC} Checking system hardware error logs..."

    # The ATA alternative is tied to a port name (ata1:, ata3.00:) that does
    # not follow a letter, digit or underscore. A plain "ata.*error" also
    # matches unrelated text such as "data ... error" or "database error".
    local hw_pattern='hardware error|i/o error|(^|[^[:alnum:]_])ata[0-9]+(\.[0-9]+)?:.*error|scsi.*error'

    # Read the log once; the count and the sample come from the same lines
    local hw_hits hw_errors
    hw_hits=$(grep -iE "$hw_pattern" "$log_file" 2>/dev/null)
    hw_errors=$(grep -c . <<< "$hw_hits")

    if [ "$hw_errors" -gt 0 ]; then
        local recent_errors
        recent_errors=$(tail -10 <<< "$hw_hits" | sed 's/^/ /')

        add_finding "WARNING" "Hardware Errors in System Log" \
            "Total hardware-related errors: $hw_errors
Recent errors (last 10):
$recent_errors" \
            "Hardware errors detected in system logs:
• Review full log: grep -iE 'hardware error|i/o error' $log_file
• Check dmesg: dmesg | grep -i error | tail -20
• Identify failing component (disk, memory, CPU, etc.)
• Run component-specific diagnostics
• Contact hosting provider if persistent"
    fi
}
# Check software (md) and MegaRAID hardware RAID state and record findings.
# Globals:   CYAN, NC (read); FINDINGS (appended to through add_finding)
# Arguments: $1 - optional path of the md status file (default: /proc/mdstat)
# Outputs:   a progress line on stdout
#
# Findings:
#   md array with a missing member          -> CRITICAL "Software RAID Degraded"
#   md array with all members present       -> INFO "Software RAID Status"
#   MegaRAID virtual drive not "Optimal"    -> CRITICAL "MegaRAID Degraded"
#   MegaRAID otherwise                      -> INFO "MegaRAID Status"
#   neither kind of RAID found              -> INFO "No RAID Detected"
check_raid_status() {
    local mdstat_file="${1:-/proc/mdstat}"
    echo -e "${CYAN}[INFO]${NC} Checking RAID status..."
    local raid_found=false

    # Software RAID (mdadm)
    if [ -f "$mdstat_file" ] && grep -q "active" "$mdstat_file" 2>/dev/null; then
        raid_found=true
        local raid_status
        raid_status=$(cat "$mdstat_file")

        # The member map looks like [UU]; an underscore in it ([U_], [_U])
        # marks a missing member. Only a bracket made of U and _ is matched so
        # other bracketed text on the line cannot trigger a false alarm.
        if grep -qE '\[[U_]*_[U_]*\]' <<< "$raid_status"; then
            add_finding "CRITICAL" "Software RAID Degraded" \
                "RAID array is degraded:
$raid_status" \
                "RAID array degraded - immediate action required:
• Check details: cat $mdstat_file
• Identify failed drive: mdadm --detail /dev/md*
• Replace failed drive and rebuild array
• Ensure backups are current"
        else
            add_finding "INFO" "Software RAID Status" \
                "$raid_status" \
                "Software RAID is healthy"
        fi
    fi

    # Hardware RAID (MegaRAID controllers via megacli)
    if command_exists megacli; then
        raid_found=true
        local raid_info
        raid_info=$(megacli -LDInfo -Lall -aALL 2>/dev/null | grep -E "State|Virtual Drive")

        # Any "State : <value>" line whose value is not Optimal means a
        # virtual drive needs attention
        if grep -E '^[[:space:]]*State[[:space:]]*:' <<< "$raid_info" | grep -qvi 'optimal'; then
            add_finding "CRITICAL" "MegaRAID Degraded" \
                "Hardware RAID virtual drive is not in Optimal state:
$raid_info" \
                "Hardware RAID needs attention - immediate action required:
• Check details: megacli -LDInfo -Lall -aALL
• Identify failed drive: megacli -PDList -aALL
• Replace failed drive and let the array rebuild
• Ensure backups are current"
        else
            add_finding "INFO" "MegaRAID Status" \
                "$raid_info" \
                "Check details: megacli -LDInfo -Lall -aALL"
        fi
    fi

    if [ "$raid_found" = false ]; then
        add_finding "INFO" "No RAID Detected" \
            "No software or hardware RAID arrays detected" \
            "System appears to use non-RAID storage"
    fi
}
# Look for block-layer I/O error messages in the kernel ring buffer and
# record a WARNING finding when any are present. A clean buffer records
# nothing.
# Globals:   CYAN, NC (read); FINDINGS (appended to through add_finding)
# Arguments: none
# Outputs:   a progress line on stdout
check_disk_io_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking disk I/O errors..."

    # Case-insensitive signatures searched for in dmesg
    local io_regex="i/o error|blk_update_request|Buffer I/O error"

    local hit_total
    hit_total=$(dmesg | grep -iE "$io_regex" | wc -l)

    # Nothing to report when no line matched
    if ! [ "$hit_total" -gt 0 ]; then
        return 0
    fi

    # Last ten matching lines, each prefixed with a space for the report
    local hit_sample
    hit_sample=$(dmesg | grep -iE "$io_regex" | tail -10 | sed 's/^/ /')

    local detail_text="Total I/O errors in dmesg: $hit_total
Recent I/O errors (last 10):
$hit_sample"

    local advice_text="Disk I/O errors detected - indicates hardware or connection issues:
• Check SMART status (see above)
• Review dmesg: dmesg | grep -i 'i/o error'
• Check cables and connections (if physical server)
• Check for disk controller issues
• May indicate failing disk or controller"

    add_finding "WARNING" "Disk I/O Errors Detected" "$detail_text" "$advice_text"
}
# Render all collected findings as a text report, print it to stdout and
# write the same text to $REPORT_FILE.
# Globals:   FINDINGS, SYS_HOSTNAME, SYS_PANEL, SYS_PANEL_VER, SYS_OS,
#            SYS_OS_VER, REPORT_FILE (all read)
# Arguments: none
# Returns:   the status of the final write to $REPORT_FILE
#
# Findings are grouped into sections by keywords in their title and the
# sections are emitted in a fixed order: DISK, MEMORY, CPU, RAID, OTHER.
# Sections without findings are omitted.
generate_report() {
    local nl=$'\n'
    local banner="=============================================================================="
    local divider="------------------------------------------------------------------------------"
    local report=""

    # Report header
    report+="${banner}${nl}"
    report+="HARDWARE HEALTH CHECK - $(date '+%Y-%m-%d %H:%M:%S')${nl}"
    report+="${banner}${nl}"
    report+="${nl}"
    report+="System: $SYS_HOSTNAME${nl}"
    report+="Control Panel: $SYS_PANEL ${SYS_PANEL_VER:-unknown}${nl}"
    report+="OS: $SYS_OS ${SYS_OS_VER:-unknown}${nl}"
    report+="${nl}"

    # Section keys in output order, and the heading shown for each
    local -a section_order=(DISK MEMORY CPU RAID OTHER)
    local -A section_heading=(
        [DISK]="DISK HEALTH & SMART STATUS"
        [MEMORY]="MEMORY HEALTH"
        [CPU]="CPU HEALTH"
        [RAID]="RAID STATUS"
        [OTHER]="OTHER HARDWARE FINDINGS"
    )
    local -A section_body=([DISK]="" [MEMORY]="" [CPU]="" [RAID]="" [OTHER]="")

    # Patterns for the "[SEVERITY] Title" heading of a stored finding
    local tag_re='^\[([^]]*)\]'
    local strip_re='^\[[^]]*\] (.*)$'

    local heading remainder body advice level name key block
    for finding in "${FINDINGS[@]}"; do
        # Stored form: heading@@@SEP@@@details@@@SEP@@@recommendation
        heading="${finding%%@@@SEP@@@*}"
        remainder="${finding#*@@@SEP@@@}"
        body="${remainder%%@@@SEP@@@*}"
        advice="${remainder#*@@@SEP@@@}"

        # Split the heading into severity and title
        level=""
        if [[ "$heading" =~ $tag_re ]]; then
            level="${BASH_REMATCH[1]}"
        fi
        name="$heading"
        if [[ "$heading" =~ $strip_re ]]; then
            name="${BASH_REMATCH[1]}"
        fi

        # First matching keyword group wins
        case "$name" in
            *Disk*|*SMART*|*I/O*) key="DISK" ;;
            *Memory*|*ECC*)       key="MEMORY" ;;
            *CPU*|*MCE*)          key="CPU" ;;
            *RAID*)               key="RAID" ;;
            *)                    key="OTHER" ;;
        esac

        block="[$level] $name${nl}"
        block+="${body}${nl}"
        if [ -n "$advice" ]; then
            block+="Recommendation:${nl}"
            block+="${advice}${nl}"
        fi
        block+="${nl}${divider}${nl}${nl}"

        section_body[$key]+="$block"
    done

    # Emit every non-empty section under its banner
    for key in "${section_order[@]}"; do
        [ -n "${section_body[$key]}" ] || continue
        report+="${banner}${nl}"
        report+="${section_heading[$key]}${nl}"
        report+="${banner}${nl}"
        report+="${nl}"
        report+="${section_body[$key]}"
    done

    # Closing guidance
    report+="${banner}${nl}"
    report+="NEXT STEPS${nl}"
    report+="${banner}${nl}"
    report+="${nl}"
    report+="Priority Actions:${nl}"
    report+=" 1. Address any CRITICAL issues immediately${nl}"
    report+=" 2. Monitor WARNING issues closely${nl}"
    report+=" 3. Schedule regular hardware health checks${nl}"
    report+="${nl}"
    report+="Additional Analysis Available:${nl}"
    report+=" • System Health Check (Main Menu) for overall server health${nl}"
    report+=" • Disk I/O Analyzer (Main Menu → Performance) for disk performance${nl}"
    report+="${nl}"
    report+="Report saved to: $REPORT_FILE${nl}"
    report+="${nl}"

    echo "$report"
    echo "$report" > "$REPORT_FILE"
}
# Entry point: show the banner, run every hardware check in order, then
# render the report and wait for the operator.
# Globals: MAGENTA, BOLD, CYAN, GREEN, NC, REPORT_FILE (read)
# show_banner and press_enter are not defined in this file; presumably they
# come from the sourced toolkit libraries.
main() {
    show_banner
    echo -e "${MAGENTA}${BOLD}🔧 Hardware Health Check${NC}"
    echo
    echo
    echo -e "${CYAN}[INFO]${NC} Starting comprehensive hardware diagnostics..."
    echo

    # Diagnostics run in this fixed order; each one appends its findings
    local diagnostic
    for diagnostic in \
        check_disk_smart \
        check_memory_health \
        check_cpu_health \
        check_hardware_errors \
        check_raid_status \
        check_disk_io_errors; do
        "$diagnostic"
    done

    echo
    echo -e "${GREEN}[OK]${NC} Hardware diagnostics complete!"
    echo

    # Print the report and save it to $REPORT_FILE
    generate_report

    echo
    echo -e "${GREEN}[INFO]${NC} Full report saved to: ${CYAN}$REPORT_FILE${NC}"
    echo
    echo
    press_enter
}

# Run main function
main "$@"