Linux-Server-Management-Too…/modules/performance/hardware-health-check.sh

#!/bin/bash

# Hardware Health Check
# Comprehensive hardware diagnostics including SMART, memory, CPU, and sensors

# Resolve this script's own directory, then the toolkit root two levels above it
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TOOLKIT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Shared toolkit libraries (order preserved)
source "$TOOLKIT_ROOT/lib/common-functions.sh"
source "$TOOLKIT_ROOT/lib/system-detect.sh"
source "$TOOLKIT_ROOT/lib/reference-db.sh"

# Run system detection (detect_system is presumably provided by the libraries
# sourced above - it is not defined in this file's visible portion)
detect_system

# Read cached system facts from the pipe-delimited reference database.
# Only records of the form "SYS|<KEY>|..." (at least three fields) are used:
# field 3 carries the value, field 4 the version where one is read.
if [[ -f "$TOOLKIT_ROOT/.sysref" ]]; then
    SYS_HOSTNAME=$(awk -F'|' 'NF >= 3 && $1 == "SYS" && $2 == "HOSTNAME" { print $3 }' \
        "$TOOLKIT_ROOT/.sysref" 2>/dev/null)
    SYS_PANEL=$(awk -F'|' 'NF >= 3 && $1 == "SYS" && $2 == "CONTROL_PANEL" { print $3 }' \
        "$TOOLKIT_ROOT/.sysref" 2>/dev/null)
    SYS_PANEL_VER=$(awk -F'|' 'NF >= 3 && $1 == "SYS" && $2 == "CONTROL_PANEL" { print $4 }' \
        "$TOOLKIT_ROOT/.sysref" 2>/dev/null)
    SYS_OS=$(awk -F'|' 'NF >= 3 && $1 == "SYS" && $2 == "OS" { print $3 }' \
        "$TOOLKIT_ROOT/.sysref" 2>/dev/null)
    SYS_OS_VER=$(awk -F'|' 'NF >= 3 && $1 == "SYS" && $2 == "OS" { print $4 }' \
        "$TOOLKIT_ROOT/.sysref" 2>/dev/null)
fi

# ANSI color escapes, stored as literal backslash sequences so that
# `echo -e` renders them at print time
NC='\033[0m'
BOLD='\033[1m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'

# Timestamped report path under /tmp (second resolution)
REPORT_FILE="/tmp/hardware_health_report_$(date '+%Y%m%d_%H%M%S').txt"

# Accumulator for findings; entries are appended by add_finding
declare -a FINDINGS=()

# Append one finding to the global FINDINGS array.
# Arguments:
#   $1 - severity label (required)
#   $2 - title (required)
#   $3 - details text (optional, may span several lines)
#   $4 - recommendation text (optional, may span several lines)
# Returns:
#   1 (nothing recorded) when severity or title is empty, otherwise 0.
add_finding() {
    if [[ -z "$1" || -z "$2" ]]; then
        return 1
    fi

    # The three parts are joined with @@@SEP@@@, a marker chosen so that it
    # does not clash with the content itself
    local sep='@@@SEP@@@'
    local entry
    printf -v entry '[%s] %s%s%s%s%s' "$1" "$2" "$sep" "$3" "$sep" "$4"
    FINDINGS+=("$entry")
}

# Report whether a command name resolves in the current shell.
# Arguments:
#   $1 - command name to look up
# Returns:
#   0 when `command -v` finds it; non-zero when it does not or when $1 is empty.
command_exists() {
    local candidate="$1"

    if [ -z "$candidate" ]; then
        return 1
    fi

    command -v "$candidate" >/dev/null 2>&1
}

# Virtualization state; written by detect_virtualization and read by the checks
IS_VIRTUAL=false
VIRT_TYPE="physical"

# Decide whether the host is a virtual machine and record an INFO finding
# describing which hardware checks apply.
# Globals written: IS_VIRTUAL ("true"/"false"), VIRT_TYPE
# Probe order: systemd-detect-virt when installed; dmidecode's product name is
# consulted only when systemd-detect-virt is not installed at all.
detect_virtualization() {
    local hypervisor_re="kvm|qemu|vmware|virtualbox|xen|hyperv"
    local virt_probe
    local dmi_product

    # Start from the "physical" assumption on every call
    IS_VIRTUAL=false
    VIRT_TYPE="physical"

    if command_exists systemd-detect-virt; then
        virt_probe=$(systemd-detect-virt 2>/dev/null)
        case "$virt_probe" in
            "" | none)
                # empty output or "none": keep the physical defaults
                ;;
            *)
                IS_VIRTUAL=true
                VIRT_TYPE="$virt_probe"
                ;;
        esac
    elif command_exists dmidecode; then
        dmi_product=$(dmidecode -s system-product-name 2>/dev/null)
        if printf '%s\n' "$dmi_product" | grep -qiE "$hypervisor_re"; then
            IS_VIRTUAL=true
            # First hypervisor keyword found, in the product string's own casing
            VIRT_TYPE=$(printf '%s\n' "$dmi_product" | grep -oiE "$hypervisor_re" | head -1)
        fi
    fi

    # Record which environment was detected
    if [ "$IS_VIRTUAL" != true ]; then
        add_finding "INFO" "ℹ️  Physical Server Detected" \
            "Environment: Physical hardware
All hardware health checks will be performed:
  • SMART disk monitoring
  • Fan speed monitoring
  • Temperature sensors
  • Memory ECC errors
  • CPU thermal monitoring
  • Network interface errors
  • Kernel parameters" \
            "Full hardware monitoring enabled for physical server."
    else
        add_finding "INFO" "ℹ️  Virtual Machine Detected" \
            "Environment: $VIRT_TYPE
Hardware checks adapted for virtual machine:
  • SMART disk checks: SKIPPED (VMs use virtual disks)
  • Fan monitoring: SKIPPED (hypervisor controls physical fans)
  • Some sensors: SKIPPED (not accessible in VM)
  • Memory/CPU/Network checks: ACTIVE (VM-compatible)" \
            "This is normal for virtual machines. Hardware monitoring is limited to VM-accessible components."
    fi
}

# --- Private helpers for check_disk_smart -----------------------------------

# Raise the caller's `failure_risk` variable to level $1, but only when $1 is
# MORE severe than the current level (NONE < MODERATE < HIGH < IMMINENT).
# Relies on bash dynamic scoping: `failure_risk` is the local declared in
# check_disk_smart. Never lowers an already-recorded level.
_escalate_disk_risk() {
    case "${failure_risk:-NONE}:$1" in
        NONE:MODERATE | NONE:HIGH | NONE:IMMINENT | \
        MODERATE:HIGH | MODERATE:IMMINENT | \
        HIGH:IMMINENT)
            failure_risk="$1"
            ;;
    esac
    return 0
}

# Print the trimmed value of the first "Label: value" line in $1 whose text
# matches the case-insensitive ERE $2. Prints nothing when no line matches.
_smart_info_field() {
    grep -iE -m1 -- "$2" <<<"$1" | sed -E 's/^[^:]*:[[:space:]]*//; s/[[:space:]]+$//'
}

# Print the leading integer of column 10 (the raw value in smartctl's ATA
# attribute table) from the first row of $1 containing the substring $2.
# Anything after the leading digits (e.g. "35 (Min/Max ...)", "1234h+05m")
# is dropped so the result is always safe for integer tests.
_smart_attr_raw() {
    awk -v name="$2" 'index($0, name) { v = $10; sub(/[^0-9].*$/, "", v); print v; exit }' <<<"$1"
}

# Print the integer value of the first line of $1 that STARTS with label $2
# (e.g. "Available Spare:"). Anchoring on the full label including the colon
# keeps "Available Spare Threshold:" from being picked up as well.
_nvme_health_pct() {
    awk -v label="$2" '
        { line = $0; sub(/^[ \t]+/, "", line) }
        index(line, label) == 1 {
            sub(/^[^:]*:[ \t]*/, "", line)
            sub(/[^0-9].*$/, "", line)
            print line
            exit
        }' <<<"$1"
}

# Function to check SMART status with deep analysis
#
# Walks every device lsblk reports with TYPE "disk", skips devices SMART
# cannot describe (inaccessible, hardware-RAID logical volumes, virtual disks,
# md/dm devices, loop/ram/nbd), and records one finding per remaining disk
# plus a summary finding.
#
# Globals read:  IS_VIRTUAL, VIRT_TYPE, CYAN, NC
# Calls:         command_exists, add_finding, smartctl, lsblk
# Log source:    /var/log/messages (the whole file is scanned)
# Returns:       0 when skipped on a VM; otherwise the status of the last
#                add_finding call.
check_disk_smart() {
    # Skip SMART checks on virtual machines (VMs use virtual disks)
    if [ "$IS_VIRTUAL" = true ]; then
        echo -e "${CYAN}[INFO]${NC} Skipping SMART checks (virtual machine - $VIRT_TYPE)"
        return 0
    fi

    echo -e "${CYAN}[INFO]${NC} Checking disk SMART status..."

    if ! command_exists smartctl; then
        add_finding "INFO" "SMART Tools Not Installed" \
            "smartmontools is not installed - cannot check disk health" \
            "Install SMART tools: yum install smartmontools
After installing, run: systemctl enable smartd && systemctl start smartd"
        return
    fi

    # Find all disks
    local disks=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1}')

    if [ -z "$disks" ]; then
        add_finding "WARNING" "No Disks Found" \
            "Could not detect any disk devices" \
            "Check system configuration: lsblk -a"
        return
    fi

    local disk_count=0
    local healthy_count=0
    local warning_count=0
    local failed_count=0
    local disk

    # $disks is intentionally unquoted: one /dev path per whitespace-separated word
    for disk in $disks; do
        disk_count=$((disk_count + 1))

        # Get device info to determine if SMART is applicable
        local device_info=$(smartctl -i "$disk" 2>&1)

        # 1. CHECK: Device exists and smartctl can communicate
        if echo "$device_info" | grep -qiE "No such device|open device.*failed|Permission denied"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (device not accessible)"
            continue
        fi

        # 2. CHECK: SMART support availability
        if echo "$device_info" | grep -qiE "SMART support is:.*Unavailable|SMART support is:.*Disabled|Unable to detect device type"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (SMART not supported or disabled)"
            continue
        fi

        # 3. EXTRACT: Device type, model, vendor for intelligent detection
        #    (whitespace is trimmed with sed rather than xargs, which aborts
        #    on values containing quote characters)
        local model=$(_smart_info_field "$device_info" "Device Model:|Product:|Model Number:|Model Family:")
        local vendor=$(_smart_info_field "$device_info" "Vendor:")
        local device_type=$(_smart_info_field "$device_info" "Device type:")
        local serial=$(_smart_info_field "$device_info" "Serial Number:")

        # Combine model and vendor for comprehensive matching
        local full_id="${vendor} ${model} ${device_type}"

        # 4. DETECT: Hardware RAID Controllers (all major brands)
        # These devices are RAID controller logical volumes, not physical disks
        if echo "$full_id" | grep -qiE "MR[0-9]{4}|LSI|MegaRAID|PERC|Avago|Broadcom.*RAID|HPE.*Smart Array|HP.*Smart Array|Dell.*RAID|IBM.*ServeRAID|Adaptec|3ware|Areca|HighPoint|Promise.*RAID"; then
            local raid_type="Hardware RAID Controller"
            local tools="Unknown RAID tools"

            # Identify specific RAID type and provide exact tools
            if echo "$full_id" | grep -qiE "MR[0-9]{4}|MegaRAID"; then
                raid_type="MegaRAID Controller"
                tools="megacli -LDInfo -Lall -aALL or storcli /c0 /vall show all"
            elif echo "$full_id" | grep -qiE "LSI|Avago|Broadcom"; then
                raid_type="LSI/Broadcom RAID Controller"
                tools="sas2ircu LIST or storcli show"
            elif echo "$full_id" | grep -qiE "PERC|Dell"; then
                raid_type="Dell PERC RAID Controller"
                tools="perccli /c0 /vall show all or omreport storage vdisk"
            elif echo "$full_id" | grep -qiE "HP|HPE.*Smart"; then
                raid_type="HP Smart Array Controller"
                tools="hpacucli ctrl all show config or ssacli ctrl all show config"
            elif echo "$full_id" | grep -qiE "Adaptec"; then
                raid_type="Adaptec RAID Controller"
                tools="arcconf getconfig 1"
            elif echo "$full_id" | grep -qiE "3ware"; then
                raid_type="3ware RAID Controller"
                tools="tw_cli info c0"
            fi

            echo -e "${CYAN}[INFO]${NC} Skipping $disk ($raid_type: $model)"
            add_finding "INFO" "ℹ️  $raid_type Detected: $disk" \
                "Device: $disk
Controller: $model
Type: $raid_type
SMART Status: Not applicable (logical volume from RAID controller)

This is a logical volume presented by a hardware RAID controller.
SMART data is not available for these devices - the controller manages
the physical disks and presents them as a single logical volume.

To monitor RAID health, use controller-specific tools:
  Command: $tools

Physical disk health is monitored by the RAID controller itself.
Check controller logs and status for drive failures." \
                "Monitor RAID array health using controller tools, not SMART"
            continue
        fi

        # 5. DETECT: Virtual/Emulated Devices (VMs and containers)
        if echo "$full_id" | grep -qiE "QEMU|VirtIO|Virtual|VMware|Xen.*Virtual|Msft.*Virtual|Google.*Persistent|Amazon.*Elastic"; then
            local virt_type="Virtual Disk"

            if echo "$full_id" | grep -qiE "QEMU"; then
                virt_type="QEMU Virtual Disk (KVM)"
            elif echo "$full_id" | grep -qiE "VMware"; then
                virt_type="VMware Virtual Disk"
            elif echo "$full_id" | grep -qiE "VirtIO"; then
                virt_type="VirtIO Virtual Disk"
            elif echo "$full_id" | grep -qiE "Msft.*Virtual"; then
                virt_type="Hyper-V Virtual Disk"
            elif echo "$full_id" | grep -qiE "Xen"; then
                virt_type="Xen Virtual Disk"
            elif echo "$full_id" | grep -qiE "Google"; then
                virt_type="Google Persistent Disk"
            elif echo "$full_id" | grep -qiE "Amazon"; then
                virt_type="AWS EBS Volume"
            fi

            echo -e "${CYAN}[INFO]${NC} Skipping $disk ($virt_type)"
            # No finding here: virtual environments are reported by the VM detection step
            continue
        fi

        # 6. DETECT: Software RAID / LVM / Device Mapper
        if echo "$disk" | grep -qE "md[0-9]|dm-[0-9]"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (software RAID/LVM device)"
            add_finding "INFO" "ℹ️  Software RAID/LVM Detected: $disk" \
                "Device: $disk
Type: Software RAID or LVM logical volume

This is a logical device managed by the kernel (mdadm or LVM).
SMART monitoring should be performed on the underlying physical disks.

For software RAID (md devices):
  • Check RAID status: cat /proc/mdstat
  • Monitor physical disks: smartctl -a /dev/sd[X]

For LVM (dm- devices):
  • Check LV status: lvdisplay
  • Monitor physical volumes: pvdisplay
  • Check underlying disks: smartctl -a /dev/sd[X]" \
                "Monitor underlying physical disks, not the logical volume"
            continue
        fi

        # 7. DETECT: Loop devices, RAM disks, other special devices
        if echo "$disk" | grep -qE "loop|ram|zram|nbd"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (special device: loop/ram/network)"
            continue
        fi

        # 8. FINAL CHECK: Is this a real disk with SMART data?
        # smartctl's exit status is a bitmask. Only bits 0-2 (bad command line,
        # device open failed, SMART command failed) mean "no data". Bits 3-7
        # flag failing/previously-failing attributes or logged errors - exactly
        # the disks that must NOT be skipped - so they are ignored here.
        local smart_data smart_rc
        smart_data=$(smartctl -A "$disk" 2>/dev/null)
        smart_rc=$?
        if (( smart_rc & 7 )); then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (no SMART attributes available)"
            add_finding "INFO" "ℹ️  Device Without SMART: $disk" \
                "Device: $disk
Model: ${model:-Unknown}

This device does not provide SMART attributes.
Common reasons:
  • USB-connected drives (SMART data not passed through)
  • Some hardware RAID configurations
  • Older drives without SMART support
  • Passthrough issues in virtual environments

If this is a critical disk, verify health through other means:
  • Check dmesg for errors: dmesg | grep -i '$disk'
  • Monitor I/O errors: iostat -x $disk
  • Check filesystem errors: mount | grep $disk" \
                "Monitor through system logs and I/O statistics"
            continue
        fi

        # Get SMART health status
        local health=$(smartctl -H "$disk" 2>/dev/null | grep -i "SMART overall-health" | awk '{print $NF}')

        # For reporting prefer the concrete model label (ATA "Device Model",
        # SCSI "Product", NVMe "Model Number") over a "Model Family" line;
        # keep the step-3 value when none of them is present.
        local report_model=$(_smart_info_field "$device_info" "Device Model:|Product:|Model Number:")
        [ -n "$report_model" ] && model="$report_model"

        # Key SMART attributes (raw values, reduced to plain integers)
        local reallocated=$(_smart_attr_raw "$smart_data" "Reallocated_Sector")
        local pending=$(_smart_attr_raw "$smart_data" "Current_Pending_Sector")
        local uncorrectable=$(_smart_attr_raw "$smart_data" "Offline_Uncorrectable")
        local temp=$(_smart_attr_raw "$smart_data" "Temperature_Celsius")
        local power_on=$(_smart_attr_raw "$smart_data" "Power_On_Hours")

        # Risk state is reset per disk BEFORE any check may escalate it, so
        # nothing leaks in from the previous disk and nothing set by the age
        # or NVMe checks below is wiped out later.
        local failure_risk="NONE"
        local risk_factors=""

        # DISK AGE ANALYSIS
        local disk_age_years=0
        local age_warning=""
        if [ -n "$power_on" ] && [ "$power_on" -gt 0 ]; then
            # 10# forces decimal so a zero-padded value is not read as octal
            disk_age_years=$((10#$power_on / 8760))  # 8760 hours per year
            if [ "$disk_age_years" -ge 5 ]; then
                age_warning="⚠️  DISK AGE: $disk_age_years years old (REPLACE - expected lifespan: 3-5 years)"
                _escalate_disk_risk MODERATE
            elif [ "$disk_age_years" -ge 3 ]; then
                age_warning="ℹ️  DISK AGE: $disk_age_years years old (consider replacement soon)"
            fi
        fi

        # NVMe-SPECIFIC HEALTH (if NVMe drive)
        local nvme_wear=""
        local nvme_spare=""
        if [[ "$disk" == *"nvme"* ]]; then
            # Percentage Used (wear indicator)
            local percent_used=$(_nvme_health_pct "$smart_data" "Percentage Used:")
            if [ -n "$percent_used" ] && [ "$percent_used" -gt 90 ]; then
                nvme_wear="⚠️  NVMe WEAR: ${percent_used}% used (CRITICAL - near end of life!)"
                _escalate_disk_risk HIGH
            elif [ -n "$percent_used" ] && [ "$percent_used" -gt 80 ]; then
                nvme_wear="⚠️  NVMe WEAR: ${percent_used}% used (high wear - monitor closely)"
                _escalate_disk_risk MODERATE
            fi

            # Available Spare
            local avail_spare=$(_nvme_health_pct "$smart_data" "Available Spare:")
            if [ -n "$avail_spare" ] && [ "$avail_spare" -lt 10 ]; then
                nvme_spare="⚠️  NVMe SPARE: ${avail_spare}% available spare (CRITICAL!)"
                _escalate_disk_risk HIGH
            fi
        fi

        # Count I/O error lines for this disk in the system log.
        # NOTE: the whole of /var/log/messages is scanned; the "last 7 days"
        # wording in the report assumes the usual weekly log rotation.
        local disk_name=$(basename "$disk")
        local io_errors=$(grep -i "$disk_name.*error\|$disk_name.*failed\|ata.*$disk_name" /var/log/messages 2>/dev/null | wc -l)
        local recent_io_samples=""
        if [ "$io_errors" -gt 0 ]; then
            recent_io_samples=$(grep -i "$disk_name.*error\|$disk_name.*failed" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/    /')
        fi

        # PREDICTIVE FAILURE ANALYSIS - Make critical issues OBVIOUS
        # Every check escalates through _escalate_disk_risk so a later,
        # milder indicator can never hide an earlier, more severe one.

        # CRITICAL: Immediate failure indicators
        if [ -n "$reallocated" ] && [ "$reallocated" -gt 50 ]; then
            _escalate_disk_risk IMMINENT
            risk_factors+="⚠️  CRITICAL: $reallocated reallocated sectors (DRIVE FAILING SOON!)"$'\n'
        elif [ -n "$reallocated" ] && [ "$reallocated" -gt 10 ]; then
            _escalate_disk_risk HIGH
            risk_factors+="⚠️  HIGH: $reallocated reallocated sectors (failure risk increasing)"$'\n'
        elif [ -n "$reallocated" ] && [ "$reallocated" -gt 0 ]; then
            _escalate_disk_risk MODERATE
            risk_factors+="⚠️  MODERATE: $reallocated reallocated sectors detected"$'\n'
        fi

        if [ -n "$pending" ] && [ "$pending" -gt 10 ]; then
            _escalate_disk_risk IMMINENT
            risk_factors+="⚠️  CRITICAL: $pending pending sectors (READ/WRITE FAILURES!)"$'\n'
        elif [ -n "$pending" ] && [ "$pending" -gt 0 ]; then
            _escalate_disk_risk MODERATE
            risk_factors+="⚠️  MODERATE: $pending pending sectors"$'\n'
        fi

        if [ -n "$uncorrectable" ] && [ "$uncorrectable" -gt 0 ]; then
            _escalate_disk_risk HIGH
            risk_factors+="⚠️  HIGH: $uncorrectable uncorrectable sectors (data loss possible)"$'\n'
        fi

        # Temperature warnings
        if [ -n "$temp" ] && [ "$temp" -gt 55 ]; then
            _escalate_disk_risk MODERATE
            risk_factors+="⚠️  Temperature: ${temp}°C (OVERHEATING - threshold: 50°C)"$'\n'
        elif [ -n "$temp" ] && [ "$temp" -gt 50 ]; then
            risk_factors+="⚠️  Temperature: ${temp}°C (above recommended 50°C)"$'\n'
        fi

        # I/O errors from logs
        if [ "$io_errors" -gt 50 ]; then
            _escalate_disk_risk HIGH
            risk_factors+="⚠️  HIGH: $io_errors I/O errors in last 7 days (hardware problem!)"$'\n'
        elif [ "$io_errors" -gt 10 ]; then
            _escalate_disk_risk MODERATE
            risk_factors+="⚠️  MODERATE: $io_errors I/O errors in last 7 days"$'\n'
        fi

        # Add disk age warning to risk factors
        [ -n "$age_warning" ] && risk_factors+="$age_warning"$'\n'

        # Add NVMe-specific warnings to risk factors
        [ -n "$nvme_wear" ] && risk_factors+="$nvme_wear"$'\n'
        [ -n "$nvme_spare" ] && risk_factors+="$nvme_spare"$'\n'

        # Determine severity and report
        # Be SMART about health status - only flag if explicitly FAILED
        if [[ "$health" =~ FAILED ]]; then
            # SMART health check explicitly FAILED
            failed_count=$((failed_count + 1))
            add_finding "CRITICAL" "🔴 DISK FAILURE: $disk - REPLACE IMMEDIATELY" \
                "Device: $disk
Model: $model
Serial: $serial
Health: FAILED ❌

SMART Status: FAILED
Reallocated Sectors: ${reallocated:-N/A}
Pending Sectors: ${pending:-N/A}
Uncorrectable Sectors: ${uncorrectable:-N/A}
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}

Recent I/O Errors (last 7 days): $io_errors
${recent_io_samples:+Recent errors from /var/log/messages:
$recent_io_samples}" \
                "🚨 IMMEDIATE ACTION REQUIRED - DISK FAILING:
  1. BACKUP ALL DATA IMMEDIATELY (drive may fail at any moment)
  2. Order replacement disk NOW
  3. Plan maintenance window for replacement
  4. Review SMART details: smartctl -a $disk
  5. Check logs: grep -i '${disk_name}' /var/log/messages
  6. If RAID: Verify array status and prepare for rebuild"

        elif [ "$failure_risk" = "IMMINENT" ]; then
            # Predictive: Drive will fail SOON
            failed_count=$((failed_count + 1))
            add_finding "CRITICAL" "🔴 DRIVE FAILING SOON: $disk - REPLACE URGENTLY" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health (but critical attributes detected)

⚠️  FAILURE RISK: IMMINENT - Drive will likely fail within days/weeks

Critical Issues:
$risk_factors
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors
${recent_io_samples:+Recent errors from /var/log/messages:
$recent_io_samples}" \
                "🚨 URGENT - DRIVE REPLACEMENT REQUIRED:
  1. Order replacement disk immediately
  2. Ensure backups are current and verified
  3. Plan replacement within 1-2 weeks (sooner if possible)
  4. Monitor daily: smartctl -A $disk
  5. Watch for increasing errors: grep -i '${disk_name}' /var/log/messages
  6. Do NOT wait for complete failure - replace proactively"

        elif [ "$failure_risk" = "HIGH" ]; then
            # High risk of failure
            warning_count=$((warning_count + 1))
            add_finding "WARNING" "🟡 HIGH FAILURE RISK: $disk - Plan Replacement" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health

⚠️  FAILURE RISK: HIGH - Replacement recommended

Risk Factors:
$risk_factors
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors" \
                "⚠️  PLAN DISK REPLACEMENT:
  • Order spare disk as precaution
  • Monitor weekly: smartctl -A $disk
  • Watch for deterioration in attributes
  • Ensure backups are current
  • Check logs regularly: grep -i '${disk_name}' /var/log/messages"

        elif [ "$failure_risk" = "MODERATE" ]; then
            # Moderate risk - monitor closely
            warning_count=$((warning_count + 1))
            add_finding "WARNING" "🟡 Disk $disk: Warning Signs Detected" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health

⚠️  FAILURE RISK: MODERATE - Monitor closely

Warning Signs:
$risk_factors
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors" \
                "Monitor this disk closely:
  • Check SMART weekly: smartctl -A $disk
  • Watch for increasing reallocated/pending sectors
  • Monitor system logs: grep -i '${disk_name}' /var/log/messages
  • Ensure backups are current"

        else
            # Disk is healthy
            healthy_count=$((healthy_count + 1))
            add_finding "INFO" "✅ Disk $disk: Healthy" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health ✅

SMART Attributes:
  Reallocated Sectors: ${reallocated:-0}
  Pending Sectors: ${pending:-0}
  Uncorrectable Sectors: ${uncorrectable:-0}
  Temperature: ${temp:-N/A}°C (optimal: <50°C)
  Power On Hours: ${power_on:-N/A}
  I/O Errors (7 days): $io_errors" \
                "Disk is healthy - continue regular monitoring
  • Monthly SMART check recommended: smartctl -A $disk"
        fi
    done

    # Summary finding (disk_count includes devices that were skipped above)
    add_finding "INFO" "Disk Health Summary" \
        "Total disks checked: $disk_count
Healthy: $healthy_count
Warning: $warning_count
Failed: $failed_count" \
        "Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
}

# Function to check memory health with ECC error detection
#
# Collects DIMM inventory (dmidecode), usage (free) and memory-related lines
# from /var/log/messages, then records exactly one finding:
#   CRITICAL - uncorrectable (multi-bit) ECC errors, or >100 correctable ones
#   WARNING  - >20 correctable ECC errors, >10 OOM events, or swap >80% used
#   INFO     - minor notes only (few ECC errors / OOM events, swap >50%,
#              unclassified memory error lines)
#   INFO     - healthy, nothing noted
#
# Globals read: CYAN, NC
# Calls:        command_exists, add_finding, dmidecode, free
# NOTE: the whole log file is scanned; no time window is applied here.
check_memory_health() {
    echo -e "${CYAN}[INFO]${NC} Checking memory health..."

    if ! command_exists dmidecode; then
        add_finding "INFO" "dmidecode Not Available" \
            "dmidecode is not installed - cannot check memory details" \
            "Install dmidecode: yum install dmidecode"
        return
    fi

    local log_file="/var/log/messages"

    # Query dmidecode and free once and reuse the output below
    local dmi_memory=$(dmidecode -t memory 2>/dev/null)
    local free_human=$(free -h)

    # Slot inventory. Only the plain "Size:" field of a Memory Device counts:
    # newer dmidecode versions also print "Volatile Size:", "Cache Size:" etc.,
    # which would otherwise inflate the populated-slot count.
    local total_slots=$(grep -c "Memory Device$" <<<"$dmi_memory")
    local populated_slots=$(grep -A 20 "Memory Device" <<<"$dmi_memory" \
        | grep -E "^[[:space:]]*Size:" | grep -cv "No Module Installed")

    # Memory totals (human-readable columns of `free -h`)
    local total_mem=$(grep "Mem:" <<<"$free_human" | awk '{print $2}')
    local used_mem=$(grep "Mem:" <<<"$free_human" | awk '{print $3}')
    local available_mem=$(grep "Mem:" <<<"$free_human" | awk '{print $7}')

    # ECC: judged from the first "Error Correction Type" line; "None" means no ECC
    local ecc_line=$(grep -m1 "Error Correction Type" <<<"$dmi_memory")
    local ecc_type=$(sed 's/.*Error Correction Type:[ ]*//' <<<"$ecc_line")
    local ecc_label="No"
    if [ -n "$ecc_line" ] && [[ "$ecc_line" != *None* ]]; then
        ecc_label="Yes ($ecc_type)"
    fi

    # Memory-related error lines in the system log
    local hw_mem_errors=$(grep -i "memory.*error\|ecc.*error\|edac.*error" "$log_file" 2>/dev/null | wc -l)

    # Specific ECC error types
    local single_bit_errors=$(grep -i "single.*bit.*error\|correctable.*ecc" "$log_file" 2>/dev/null | wc -l)
    local multi_bit_errors=$(grep -i "multi.*bit.*error\|uncorrectable.*ecc" "$log_file" 2>/dev/null | wc -l)

    # OOM killer events
    local oom_events=$(grep -i "out of memory\|oom.*kill\|invoked oom-killer" "$log_file" 2>/dev/null | wc -l)
    local recent_oom=""
    if [ "$oom_events" -gt 0 ]; then
        recent_oom=$(grep -i "out of memory\|oom.*kill" "$log_file" 2>/dev/null | tail -3 | sed 's/^/  /')
    fi

    # Swap usage (high swap can indicate memory pressure)
    local swap_total=$(grep "Swap:" <<<"$free_human" | awk '{print $2}')
    local swap_used=$(grep "Swap:" <<<"$free_human" | awk '{print $3}')
    local swap_pct=0
    if [ "$swap_total" != "0B" ] && [ -n "$swap_total" ]; then
        swap_pct=$(free | grep "Swap:" | awk '{if ($2>0) print int($3/$2*100); else print 0}')
    fi
    # Guard the integer tests below against unexpected `free` output
    [[ "$swap_pct" =~ ^[0-9]+$ ]] || swap_pct=0

    # Try to identify bad memory module from ECC errors
    local bad_dimm=""
    if [ "$hw_mem_errors" -gt 0 ]; then
        # Look for EDAC messages that identify specific DIMMs
        bad_dimm=$(grep -i "edac.*dimm\|edac.*channel\|edac.*slot" "$log_file" 2>/dev/null | tail -5 | sed 's/^/  /')
        if [ -z "$bad_dimm" ]; then
            # Try CE (Correctable Error) messages
            bad_dimm=$(grep -i "ce.*error.*channel\|ce.*error.*dimm" "$log_file" 2>/dev/null | tail -5 | sed 's/^/  /')
        fi
    fi

    # Per-module details for the report
    local mem_modules=$(grep -A 20 "Memory Device" <<<"$dmi_memory" \
        | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:|Locator:" | sed 's/^[ \t]*/  /')

    # ANALYZE MEMORY HEALTH
    local mem_status="HEALTHY"
    local mem_risk=""

    # CRITICAL: Multi-bit ECC errors (uncorrectable)
    if [ "$multi_bit_errors" -gt 0 ]; then
        mem_status="CRITICAL"
        mem_risk+="🔴 CRITICAL: $multi_bit_errors UNCORRECTABLE ECC errors (multi-bit) - DATA CORRUPTION RISK!"$'\n'
    fi

    # HIGH: Excessive single-bit errors
    if [ "$single_bit_errors" -gt 100 ]; then
        mem_status="CRITICAL"
        mem_risk+="🔴 CRITICAL: $single_bit_errors correctable ECC errors (BAD DIMM - replace immediately!)"$'\n'
    elif [ "$single_bit_errors" -gt 20 ]; then
        [ "$mem_status" = "HEALTHY" ] && mem_status="WARNING"
        mem_risk+="🟡 WARNING: $single_bit_errors correctable ECC errors (faulty DIMM likely)"$'\n'
    elif [ "$single_bit_errors" -gt 0 ]; then
        [ "$mem_status" = "HEALTHY" ] && mem_status="INFO"
        mem_risk+="ℹ️  INFO: $single_bit_errors correctable ECC errors (monitor closely)"$'\n'
    fi

    # Error lines that matched neither ECC classification still deserve a note
    if [ "$hw_mem_errors" -gt 0 ] && [ "$single_bit_errors" -eq 0 ] && [ "$multi_bit_errors" -eq 0 ]; then
        mem_risk+="ℹ️  INFO: $hw_mem_errors memory-related error lines in system log (review manually)"$'\n'
    fi

    # OOM killer events
    if [ "$oom_events" -gt 10 ]; then
        [ "$mem_status" = "HEALTHY" ] && mem_status="WARNING"
        mem_risk+="🟡 WARNING: $oom_events Out-Of-Memory events (insufficient RAM for workload!)"$'\n'
    elif [ "$oom_events" -gt 0 ]; then
        mem_risk+="ℹ️  INFO: $oom_events OOM events (consider adding RAM)"$'\n'
    fi

    # Swap thrashing
    if [ "$swap_pct" -gt 80 ]; then
        [ "$mem_status" = "HEALTHY" ] && mem_status="WARNING"
        mem_risk+="🟡 WARNING: Swap ${swap_pct}% full (memory pressure - consider upgrade)"$'\n'
    elif [ "$swap_pct" -gt 50 ]; then
        mem_risk+="ℹ️  INFO: Swap ${swap_pct}% used (moderate memory pressure)"$'\n'
    fi

    # Generate findings based on analysis
    if [ "$mem_status" = "CRITICAL" ]; then
        local recent_errors=$(grep -i "memory.*error\|ecc.*error" "$log_file" 2>/dev/null | tail -10 | sed 's/^/  /')

        add_finding "CRITICAL" "🔴 MEMORY FAILURE: Replace RAM Immediately" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

🔴 CRITICAL MEMORY ISSUES:
$mem_risk

Memory Errors Detected:
  • Total errors in logs: $hw_mem_errors
  • Single-bit (correctable): $single_bit_errors
  • Multi-bit (UNCORRECTABLE): $multi_bit_errors
  • OOM killer events: $oom_events

${bad_dimm:+Faulty Module Location:
$bad_dimm
}
Recent errors from /var/log/messages:
$recent_errors" \
            "🚨 IMMEDIATE ACTION REQUIRED:
  1. IDENTIFY BAD DIMM: Check logs above for slot/channel information
  2. REPLACE FAULTY RAM: Order replacement immediately
  3. RUN MEMTEST: Boot memtest86+ to identify bad module
  4. CHECK ALL ERRORS: grep -i 'ecc\|edac' /var/log/messages | less
  5. MONITOR CORRUPTION: Watch for application crashes, file corruption
  6. If multi-bit errors: PLAN IMMEDIATE DOWNTIME for replacement

  Commands to identify faulty DIMM:
    • dmidecode -t memory (shows all slots)
    • grep -i edac /var/log/messages (shows which slot failing)
    • edac-util (if installed: yum install edac-utils)"

    elif [ "$mem_status" = "WARNING" ]; then
        local recent_errors=$(grep -i "memory.*error\|ecc.*error\|oom" "$log_file" 2>/dev/null | tail -8 | sed 's/^/  /')

        add_finding "WARNING" "🟡 Memory Issues Detected - Action Required" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

⚠️  WARNING - Memory Issues:
$mem_risk

Memory Errors Detected:
  • Total errors in logs: $hw_mem_errors
  • Single-bit (correctable): $single_bit_errors
  • Multi-bit (UNCORRECTABLE): $multi_bit_errors
  • OOM killer events: $oom_events

${recent_oom:+Recent OOM Events:
$recent_oom
}
${bad_dimm:+Possible Faulty Module:
$bad_dimm
}
Recent errors:
$recent_errors" \
            "⚠️  RECOMMENDED ACTIONS:
  • Monitor error rate: grep -i 'ecc\|memory error' /var/log/messages | wc -l
  • Check for increasing errors (run daily, compare counts)
  • If ECC errors increasing: Plan RAM replacement
  • If OOM events: Consider RAM upgrade or reduce workload
  • Review memory usage: free -h && top -o %MEM | head -15

  For ECC errors:
    • Install monitoring: yum install edac-utils
    • Check status: edac-util -v
    • Identify DIMM: dmidecode -t memory | grep -A 20 'Memory Device'"

    elif [ -n "$mem_risk" ]; then
        # Below the WARNING thresholds, but something WAS observed: report it
        # instead of claiming that no errors or OOM events were detected.
        add_finding "INFO" "ℹ️  Memory Health: Minor Issues Noted" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

Notes:
$mem_risk
Memory Errors Detected:
  • Total errors in logs: $hw_mem_errors
  • Single-bit (correctable): $single_bit_errors
  • Multi-bit (UNCORRECTABLE): $multi_bit_errors
  • OOM killer events: $oom_events

${recent_oom:+Recent OOM Events:
$recent_oom
}
Installed Modules:
$mem_modules" \
            "No immediate action required - keep an eye on the trend
  • Re-check error counts: grep -i 'ecc\|memory error' /var/log/messages | wc -l
  • Watch for OOM events: grep -i 'oom' /var/log/messages
  • Review memory usage: free -h"

    else
        add_finding "INFO" "✅ Memory Health: No Issues Detected" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

Memory Errors: None detected
OOM Events: None detected
ECC Errors: None detected

Installed Modules:
$mem_modules" \
            "Memory appears healthy
  • Regular monitoring recommended if ECC supported
  • Watch for OOM events: grep -i 'oom' /var/log/messages"
    fi
}

# Function to check CPU health with thermal throttling detection
#
# Gathers the CPU model and logical CPU count, MCE/CPU error counts (dmesg and
# /var/log/messages), thermal-throttling log events, current vs. maximum
# frequency of cpu0, the hottest temperature reading (lm_sensors first,
# /sys/class/thermal as fallback) and the 1-minute load average. Records one
# CRITICAL, WARNING or INFO finding through add_finding, plus an extra INFO
# finding when the `sensors` command is not installed.
#
# Globals:   CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout; findings via add_finding
check_cpu_health() {
    echo -e "${CYAN}[INFO]${NC} Checking CPU health..."

    # Get CPU info
    local cpu_model=$(grep "model name" /proc/cpuinfo | head -1 | cut -d':' -f2 | sed 's/^[ \t]*//')
    local cpu_cores=$(grep -c "^processor" /proc/cpuinfo)
    local cpu_threads=$(nproc)
    # Keep the later numeric test safe if nproc prints something unexpected
    [[ "$cpu_threads" =~ ^[0-9]+$ ]] || cpu_threads=0

    # Check for CPU errors in dmesg (stderr silenced: reading the ring buffer
    # may be restricted for the current user)
    local cpu_errors=$(dmesg 2>/dev/null | grep -i "mce\|machine check\|cpu.*error" | wc -l)

    # Check system log for hardware errors
    local hw_cpu_errors=$(grep -iE "mce|machine check exception|cpu.*error" /var/log/messages 2>/dev/null | wc -l)

    # Check for thermal throttling events
    local throttle_events=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical|thermal.*shutdown" /var/log/messages 2>/dev/null | wc -l)
    local recent_throttle=""
    if [ "$throttle_events" -gt 0 ]; then
        recent_throttle=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/  /')
    fi

    # Get current CPU frequency and max frequency.
    # Values are validated as plain integers before any arithmetic so an
    # unreadable or malformed sysfs file yields "N/A" instead of awk/test errors.
    local cpu_freq=""
    local cpu_max_freq=""
    local freq_throttled=false
    if [ -f "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq" ]; then
        local freq_khz=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq 2>/dev/null)
        [[ "$freq_khz" =~ ^[0-9]+$ ]] || freq_khz=""
        if [ -n "$freq_khz" ]; then
            cpu_freq=$(awk -v khz="$freq_khz" 'BEGIN {printf "%.2f", khz / 1000000}' 2>/dev/null)" GHz"
        fi

        # Check max frequency
        if [ -f "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq" ]; then
            local max_freq_khz=$(cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq 2>/dev/null)
            [[ "$max_freq_khz" =~ ^[0-9]+$ ]] || max_freq_khz=""
            if [ -n "$max_freq_khz" ]; then
                cpu_max_freq=$(awk -v khz="$max_freq_khz" 'BEGIN {printf "%.2f", khz / 1000000}' 2>/dev/null)" GHz"

                # Check if significantly throttled (more than 20% below max)
                if [ -n "$freq_khz" ] && [ "$max_freq_khz" -gt 0 ]; then
                    local throttle_pct=$(awk -v cur="$freq_khz" -v max="$max_freq_khz" 'BEGIN {print int((1 - cur/max) * 100)}' 2>/dev/null)
                    if [ "${throttle_pct:-0}" -gt 20 ]; then
                        freq_throttled=true
                    fi
                fi
            fi
        fi
    fi

    # Check CPU temperature with multiple methods
    local cpu_temp="N/A"
    local temp_value=0
    local all_core_temps=""

    if command_exists sensors; then
        local sensors_out=$(sensors 2>/dev/null)

        # Try to get all core temperatures
        all_core_temps=$(printf '%s\n' "$sensors_out" | grep -E "Core [0-9]+:" | sed 's/^/  /')

        # Get highest core temperature.
        # Only the reading directly after the label is taken; the
        # "(high = +80.0°C, crit = +100.0°C)" thresholds on the same line
        # must not be mistaken for the measured temperature.
        cpu_temp=$(printf '%s\n' "$sensors_out" | grep -E "Core [0-9]+:|temp1:" \
            | sed -nE 's/^[^:]*:[[:space:]]*\+([0-9]+(\.[0-9]+)?).*/\1/p' | sort -n | tail -1)
        if [ -n "$cpu_temp" ]; then
            temp_value=${cpu_temp%.*}
            cpu_temp="${cpu_temp}°C"
        else
            cpu_temp="N/A"
        fi
    fi

    # Fallback: Check thermal zones (first zone reporting a positive value;
    # values are in millidegrees Celsius)
    if [ "$cpu_temp" = "N/A" ] && [ -d "/sys/class/thermal" ]; then
        for zone in /sys/class/thermal/thermal_zone*/temp; do
            if [ -f "$zone" ]; then
                local temp=$(cat "$zone" 2>/dev/null)
                if [[ "$temp" =~ ^[0-9]+$ ]] && [ "$temp" -gt 0 ]; then
                    temp_value=$((temp / 1000))
                    cpu_temp="${temp_value}°C"
                    break
                fi
            fi
        done
    fi

    # Non-empty only when a temperature was actually measured; used to decide
    # whether threshold hints are appended to the "Temperature:" line
    local temp_known=""
    if [ "$cpu_temp" != "N/A" ]; then
        temp_known=1
    fi

    # Check load average
    local load_avg=$(uptime | awk -F'load average:' '{print $2}' | sed 's/^[ \t]*//')
    local load_1min=$(echo "$load_avg" | awk -F',' '{print $1}' | tr -d ' ')

    # Calculate load percentage
    local load_pct=0
    if [[ "$load_1min" =~ ^[0-9]+([.][0-9]+)?$ ]] && [ "$cpu_threads" -gt 0 ]; then
        load_pct=$(awk -v load="$load_1min" -v cpus="$cpu_threads" 'BEGIN {printf "%.0f", (load / cpus) * 100}' 2>/dev/null)
    fi
    [[ "$load_pct" =~ ^[0-9]+$ ]] || load_pct=0

    # ANALYZE CPU HEALTH
    local cpu_status="HEALTHY"
    local cpu_risk=""

    # CRITICAL: MCE/Hardware errors
    if [ "$hw_cpu_errors" -gt 0 ] || [ "$cpu_errors" -gt 0 ]; then
        cpu_status="CRITICAL"
        cpu_risk+="🔴 CRITICAL: $((cpu_errors + hw_cpu_errors)) Machine Check Exceptions (MCE) - HARDWARE FAILURE!"$'\n'
    fi

    # CRITICAL: Extreme overheating
    if [ "$temp_value" -gt 90 ]; then
        cpu_status="CRITICAL"
        cpu_risk+="🔴 CRITICAL: CPU temperature ${cpu_temp} - EXTREME OVERHEATING (damage risk!)"$'\n'
    elif [ "$temp_value" -gt 80 ]; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} - OVERHEATING (threshold: 80°C)"$'\n'
    elif [ "$temp_value" -gt 70 ]; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} - HIGH (normal: <70°C)"$'\n'
    fi

    # Thermal throttling
    if [ "$throttle_events" -gt 10 ]; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: $throttle_events thermal throttling events - COOLING PROBLEM!"$'\n'
    elif [ "$throttle_events" -gt 0 ]; then
        cpu_risk+="ℹ️  INFO: $throttle_events thermal throttling events detected"$'\n'
    fi

    # Frequency throttling
    if $freq_throttled; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: CPU frequency throttled (${cpu_freq} / ${cpu_max_freq} max) - thermal or power limiting"$'\n'
    fi

    # High sustained load
    if [ "$load_pct" -gt 200 ]; then
        cpu_risk+="ℹ️  INFO: Very high load (${load_pct}% of capacity) - server may be overloaded"$'\n'
    fi

    # Generate findings
    if [ "$cpu_status" = "CRITICAL" ]; then
        local recent_errors=$(grep -iE "mce|machine check|cpu.*error|thermal.*critical" /var/log/messages 2>/dev/null | tail -10 | sed 's/^/  /')

        add_finding "CRITICAL" "🔴 CPU CRITICAL: Hardware Failure or Overheating" \
            "CPU Model: $cpu_model
Cores: $cpu_cores (Threads: $cpu_threads)
Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A})
Temperature: $cpu_temp ${temp_known:+(CRITICAL threshold: 80°C)}
Load Average: $load_avg (${load_pct}% capacity)

🔴 CRITICAL CPU ISSUES:
$cpu_risk

Hardware Errors:
  • MCE/CPU errors: $((cpu_errors + hw_cpu_errors))
  • Thermal throttling events: $throttle_events

${all_core_temps:+Individual Core Temperatures:
$all_core_temps
}
${recent_throttle:+Recent Thermal Events:
$recent_throttle
}
Recent errors from logs:
$recent_errors" \
            "🚨 IMMEDIATE ACTION REQUIRED:
  1. CHECK TEMPERATURE: If >90°C, shut down immediately to prevent damage!
  2. COOLING SYSTEM: Check fans, heatsink, thermal paste
  3. MCE ERRORS: Critical hardware failure - contact vendor/provider
  4. CLEAN SYSTEM: Remove dust from fans and heatsinks
  5. VERIFY AIRFLOW: Ensure proper case ventilation
  6. MONITOR: Watch temps continuously: watch -n 2 sensors

  Commands:
    • View all temps: sensors
    • Check MCE details: dmesg | grep -i mce | less
    • Monitor throttling: grep -i thermal /var/log/messages
    • Check frequency: cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq"

    elif [ "$cpu_status" = "WARNING" ]; then
        add_finding "WARNING" "🟡 CPU Issues Detected - Cooling or Hardware Problem" \
            "CPU Model: $cpu_model
Cores: $cpu_cores (Threads: $cpu_threads)
Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A})
Temperature: $cpu_temp
Load Average: $load_avg (${load_pct}% capacity)

⚠️  WARNING - CPU Issues:
$cpu_risk

Monitoring:
  • Thermal throttling events: $throttle_events
  • Current temperature: $cpu_temp

${all_core_temps:+Individual Core Temperatures:
$all_core_temps
}
${recent_throttle:+Recent Thermal Events:
$recent_throttle
}" \
            "⚠️  RECOMMENDED ACTIONS:
  • Clean cooling system (fans, heatsink)
  • Verify fan operation: sensors (check fan RPM)
  • Check case ventilation and airflow
  • Monitor temperature trends: watch -n 5 sensors
  • If throttling persists: Replace thermal paste or upgrade cooling
  • Consider reducing workload if temperature stays high

  Commands:
    • Monitor live: watch -n 2 sensors
    • Check throttling: grep -i thermal /var/log/messages
    • View frequencies: grep MHz /proc/cpuinfo"

    else
        add_finding "INFO" "✅ CPU Health: Normal Operation" \
            "CPU Model: $cpu_model
Cores: $cpu_cores (Threads: $cpu_threads)
Current Frequency: ${cpu_freq:-N/A} ${cpu_max_freq:+(Max: ${cpu_max_freq})}
Temperature: $cpu_temp ${temp_known:+(normal: <70°C)}
Load Average: $load_avg (${load_pct}% capacity)

Hardware Errors: None detected
Thermal Throttling: None detected
Frequency Throttling: None detected

${all_core_temps:+Individual Core Temperatures:
$all_core_temps
}" \
            "CPU is operating normally
  • Regular temperature monitoring recommended
  • Monitor: sensors (if installed)"
    fi

    # Check if sensors are available for monitoring
    if ! command_exists sensors; then
        add_finding "INFO" "Temperature Monitoring Not Available" \
            "lm_sensors is not installed - cannot monitor CPU/hardware temperatures" \
            "Install sensors for temperature monitoring:
  1. yum install lm_sensors
  2. Run: sensors-detect (answer YES to all prompts)
  3. Start service: systemctl start lm_sensors
  4. View temperatures: sensors"
    fi
}

# Function to check system hardware errors
#
# Counts lines in /var/log/messages that mention generic hardware, I/O, ATA
# or SCSI errors. When at least one line matches, a WARNING finding is
# recorded with the total and the ten most recent matches; otherwise nothing
# is recorded.
check_hardware_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking system hardware error logs..."

    local log_file="/var/log/messages"
    local error_regex="hardware error|i/o error|ata.*error|scsi.*error"

    # Check for general hardware errors
    local match_total
    match_total=$(grep -iE "$error_regex" "$log_file" 2>/dev/null | wc -l)

    # Clean log (or unreadable log): nothing to report
    if [ "$match_total" -le 0 ]; then
        return 0
    fi

    local latest_matches
    latest_matches=$(grep -iE "$error_regex" "$log_file" 2>/dev/null | tail -10 | sed 's/^/  /')

    local detail_text="Total hardware-related errors: $match_total"
    detail_text+=$'\n\n'"Recent errors (last 10):"
    detail_text+=$'\n'"$latest_matches"

    add_finding "WARNING" "Hardware Errors in System Log" \
        "$detail_text" \
        "Hardware errors detected in system logs:
  • Review full log: grep -iE 'hardware error|i/o error' /var/log/messages
  • Check dmesg: dmesg | grep -i error | tail -20
  • Identify failing component (disk, memory, CPU, etc.)
  • Run component-specific diagnostics
  • Contact hosting provider if persistent"
}

# Function to check RAID status
#
# Inspects software RAID via /proc/mdstat and hardware RAID via the `megacli`
# command (when installed) and records findings through add_finding:
#   - software RAID: CRITICAL when any array's member map (e.g. "[U_]")
#     contains a missing member, INFO otherwise
#   - MegaRAID: CRITICAL when any logical drive "State" line reports
#     degraded/offline/failed, INFO otherwise
#   - INFO "No RAID Detected" when neither source reports an array
#
# Globals:   CYAN, NC (read)
# Arguments: none
check_raid_status() {
    echo -e "${CYAN}[INFO]${NC} Checking RAID status..."

    local raid_found=false

    # Check for software RAID (mdadm)
    if [ -f /proc/mdstat ] && grep -q "active" /proc/mdstat 2>/dev/null; then
        raid_found=true
        local raid_status=$(cat /proc/mdstat)
        # Match only the member status map ("[UU_]"), where "_" marks a
        # missing device; a looser pattern could match unrelated brackets
        local degraded=$(echo "$raid_status" | grep -c "\[[U_]*_[U_]*\]")

        if [ "$degraded" -gt 0 ]; then
            add_finding "CRITICAL" "Software RAID Degraded" \
                "RAID array is degraded:

$raid_status" \
                "RAID array degraded - immediate action required:
  • Check details: cat /proc/mdstat
  • Identify failed drive: mdadm --detail /dev/md*
  • Replace failed drive and rebuild array
  • Ensure backups are current"
        else
            add_finding "INFO" "Software RAID Status" \
                "$raid_status" \
                "Software RAID is healthy"
        fi
    fi

    # Check for hardware RAID (common controllers)
    if command_exists megacli; then
        raid_found=true
        local raid_info=$(megacli -LDInfo -Lall -aALL 2>/dev/null | grep -E "State|Virtual Drive")

        # A logical drive that is not healthy must not be reported as plain INFO
        if echo "$raid_info" | grep -E "State[[:space:]]*:" | grep -qiE "degraded|offline|failed"; then
            add_finding "CRITICAL" "Hardware RAID Degraded (MegaRAID)" \
                "One or more MegaRAID logical drives are not in an optimal state:

$raid_info" \
                "Hardware RAID degraded - immediate action required:
  • Check details: megacli -LDInfo -Lall -aALL
  • Identify failed drive: megacli -PDList -aALL
  • Replace failed drive and rebuild array
  • Ensure backups are current"
        else
            add_finding "INFO" "MegaRAID Status" \
                "$raid_info" \
                "Check details: megacli -LDInfo -Lall -aALL"
        fi
    fi

    if ! $raid_found; then
        add_finding "INFO" "No RAID Detected" \
            "No software or hardware RAID arrays detected" \
            "System appears to use non-RAID storage"
    fi
}

# Function to check disk I/O errors
#
# Scans the kernel ring buffer (dmesg) for block-layer I/O error messages.
# When at least one line matches, a WARNING finding is recorded with the
# match count and the ten most recent matches; otherwise nothing is recorded.
check_disk_io_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking disk I/O errors..."

    local io_regex="i/o error|blk_update_request|Buffer I/O error"

    # Check for I/O errors in dmesg
    local match_total
    match_total=$(dmesg | grep -iE "$io_regex" | wc -l)

    # No matches: leave the findings list untouched
    if [ "$match_total" -le 0 ]; then
        return 0
    fi

    local latest_matches
    latest_matches=$(dmesg | grep -iE "$io_regex" | tail -10 | sed 's/^/  /')

    local detail_text="Total I/O errors in dmesg: $match_total"
    detail_text+=$'\n\n'"Recent I/O errors (last 10):"
    detail_text+=$'\n'"$latest_matches"

    add_finding "WARNING" "Disk I/O Errors Detected" \
        "$detail_text" \
        "Disk I/O errors detected - indicates hardware or connection issues:
  • Check SMART status (see above)
  • Review dmesg: dmesg | grep -i 'i/o error'
  • Check cables and connections (if physical server)
  • Check for disk controller issues
  • May indicate failing disk or controller"
}

# Function to check filesystem errors
#
# Counts ext4/xfs/generic filesystem error lines and read-only remount lines
# in /var/log/messages. When any line matches, one finding is recorded with
# the total, the number of read-only remounts and the five most recent
# matches. Severity is CRITICAL when at least one read-only remount was
# logged, WARNING otherwise. Nothing is recorded for a clean log.
check_filesystem_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking filesystem errors..."

    local log_file="/var/log/messages"
    local fs_regex="ext4-fs error|xfs.*error|filesystem.*error|remounted.*read-only"

    # Check for filesystem errors in logs
    local match_total
    match_total=$(grep -iE "$fs_regex" "$log_file" 2>/dev/null | wc -l)

    # Clean log: nothing to report
    if [ "$match_total" -le 0 ]; then
        return 0
    fi

    local latest_matches
    latest_matches=$(grep -iE "$fs_regex" "$log_file" 2>/dev/null | tail -5 | sed 's/^/  /')

    # Check for read-only remounts (critical)
    local readonly_total
    readonly_total=$(grep -i "remounted.*read-only" "$log_file" 2>/dev/null | wc -l)

    local level
    if [ "$readonly_total" -gt 0 ]; then
        level="CRITICAL"
    else
        level="WARNING"
    fi

    local detail_text="Total filesystem errors in logs: $match_total"
    detail_text+=$'\n'"Read-only remounts: $readonly_total"
    detail_text+=$'\n\n'"Recent filesystem errors (last 5):"
    detail_text+=$'\n'"$latest_matches"

    add_finding "$level" "🔴 Filesystem Errors Detected" \
        "$detail_text" \
        "Filesystem errors detected - may indicate disk corruption:
  • Check filesystem: fsck (requires unmounting or single-user mode)
  • Review all errors: grep -i 'filesystem.*error' /var/log/messages
  • Check disk SMART status above
  • If read-only remount: System is protecting data - investigate immediately
  • May need to boot rescue mode to repair
  • Ensure backups are current before repair attempts"
}

# Helper: filter `sensors` fan lines (stdin) by state.
#   $1 = "failed" -> lines containing FAULT or whose reading is 0 RPM
#   $1 = "slow"   -> lines whose reading is above 0 and below 800 RPM
# Only the reading directly after the "label:" is evaluated, so values such
# as "1200 RPM" or the "(min = 0 RPM)" limit are not mistaken for 0 RPM, and
# labels containing spaces ("CPU Fan:") are handled.
_fan_lines_in_state() {
    awk -v state="$1" '
        /FAULT/ {
            if (state == "failed") print
            next
        }
        {
            pos = index($0, ":")
            if (pos == 0) next
            reading = substr($0, pos + 1)
            sub(/^[ \t]+/, "", reading)
            if (reading ~ /^[0-9]+(\.[0-9]+)?[ \t]*RPM/) {
                rpm = reading + 0
                if (state == "failed" && rpm == 0) print
                if (state == "slow" && rpm > 0 && rpm < 800) print
            }
        }'
}

# Function to check system fans
#
# Reads fan lines from `sensors` and records exactly one finding:
# CRITICAL when any fan reports FAULT or 0 RPM, WARNING when any fan spins
# below 800 RPM, INFO otherwise. The check is skipped on virtual machines,
# when `sensors` is not installed, or when no fan lines are reported.
#
# Globals:   IS_VIRTUAL, VIRT_TYPE, CYAN, NC (read)
# Arguments: none
check_system_fans() {
    # Skip fan checks on virtual machines (hypervisor controls physical fans)
    if [ "$IS_VIRTUAL" = true ]; then
        echo -e "${CYAN}[INFO]${NC} Skipping fan checks (virtual machine - $VIRT_TYPE)"
        return 0
    fi

    echo -e "${CYAN}[INFO]${NC} Checking system fan status..."

    if ! command_exists sensors; then
        return  # Silently skip if sensors not installed
    fi

    # Get fan information
    local fan_data=$(sensors 2>/dev/null | grep -i "fan")

    if [ -z "$fan_data" ]; then
        return  # No fan data available
    fi

    # Check for failed fans (0 RPM or FAULT) and slow fans (< 800 RPM)
    local failed_fan_list=$(printf '%s\n' "$fan_data" | _fan_lines_in_state failed | sed 's/^/  /')
    local slow_fan_list=$(printf '%s\n' "$fan_data" | _fan_lines_in_state slow | sed 's/^/  /')

    # Count lines only for non-empty lists (an empty string would count as 1)
    local failed_fans=0
    local slow_fans=0
    if [ -n "$failed_fan_list" ]; then
        failed_fans=$(printf '%s\n' "$failed_fan_list" | grep -c '')
    fi
    if [ -n "$slow_fan_list" ]; then
        slow_fans=$(printf '%s\n' "$slow_fan_list" | grep -c '')
    fi

    if [ "$failed_fans" -gt 0 ]; then
        add_finding "CRITICAL" "🔴 FAILED FAN(S) DETECTED" \
            "Failed fans: $failed_fans

Failed fan details:
$failed_fan_list

All fan data:
$(echo "$fan_data" | sed 's/^/  /')" \
            "🚨 CRITICAL - FAN FAILURE DETECTED:
  • Failed fans detected - system may overheat!
  • Check all fan data: sensors
  • Physical inspection required
  • Replace failed fan immediately
  • Monitor CPU/system temperatures closely
  • May need emergency shutdown if temps rise above 90°C"

    elif [ "$slow_fans" -gt 0 ]; then
        add_finding "WARNING" "🟡 Slow Fan(s) Detected" \
            "Slow fans (< 800 RPM): $slow_fans

Slow fan details:
$slow_fan_list

All fan data:
$(echo "$fan_data" | sed 's/^/  /')" \
            "⚠️  WARNING - FANS RUNNING SLOW:
  • Fans running slower than normal
  • May indicate fan wear or BIOS power settings
  • Monitor temperatures closely
  • Consider fan replacement if temperatures rise
  • Check BIOS fan control settings"
    else
        add_finding "INFO" "✅ System Fans: Normal Operation" \
            "All fans operating normally:

$(echo "$fan_data" | sed 's/^/  /')" \
            "All system fans operating within normal parameters"
    fi
}

# Helper: sum the values of the `ethtool -S` counters (stdin) whose line
# matches the case-insensitive extended regex $1. The value is taken from the
# text after the last colon. Always prints one integer (0 when nothing
# matches) so the result is safe to use in shell arithmetic.
_net_counter_sum() {
    grep -iE -- "$1" | awk -F':' '{ sum += $NF } END { printf "%.0f\n", sum }'
}

# Function to check network interface errors
#
# Reads `ethtool -S` statistics for every interface listed by `ip -o link`
# (excluding lo, docker*, veth* and br-*), totals dropped packets, RX/TX
# errors and CRC errors, and records one finding: CRITICAL or WARNING when
# the drop/CRC thresholds are exceeded, INFO when everything looks healthy.
# The check is skipped silently when ethtool is missing or no interface is
# found.
#
# Globals:   CYAN, NC (read)
# Arguments: none
check_network_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking network interface errors..."

    if ! command_exists ethtool; then
        return  # Silently skip if ethtool not installed
    fi

    # Get all active network interfaces (exclude loopback).
    # "name@parent" (VLANs, veth peers) is reduced to "name", which is what
    # ethtool expects as device name.
    local interfaces=$(ip -o link show | awk -F': ' '{print $2}' | sed 's/@.*$//' | grep -v '^lo$' | grep -v '^docker' | grep -v '^veth' | grep -v '^br-')

    if [ -z "$interfaces" ]; then
        return  # No interfaces found
    fi

    local total_rx_dropped=0
    local total_tx_dropped=0
    local total_rx_errors=0
    local total_tx_errors=0
    local total_crc_errors=0
    local problem_interfaces=""
    local has_issues=false

    while IFS= read -r interface; do
        # Get statistics for this interface
        local stats=$(ethtool -S "$interface" 2>/dev/null)

        if [ -n "$stats" ]; then
            # Extract key error metrics (different NICs use different naming).
            # Every metric is summed, so a missing counter yields 0 and
            # several matching counters yield one number.
            local rx_dropped=$(echo "$stats" | _net_counter_sum "rx.*drop|rx_discards")
            local tx_dropped=$(echo "$stats" | _net_counter_sum "tx.*drop|tx_discards")
            local rx_errors=$(echo "$stats" | _net_counter_sum "^[[:space:]]*rx_errors")
            local tx_errors=$(echo "$stats" | _net_counter_sum "^[[:space:]]*tx_errors")
            local crc_errors=$(echo "$stats" | _net_counter_sum "crc.*error|rx_crc")

            # Accumulate totals
            total_rx_dropped=$((total_rx_dropped + rx_dropped))
            total_tx_dropped=$((total_tx_dropped + tx_dropped))
            total_rx_errors=$((total_rx_errors + rx_errors))
            total_tx_errors=$((total_tx_errors + tx_errors))
            total_crc_errors=$((total_crc_errors + crc_errors))

            # Check if this interface has significant issues
            if [ "$rx_dropped" -gt 1000 ] || [ "$tx_dropped" -gt 1000 ] || [ "$crc_errors" -gt 100 ]; then
                has_issues=true
                problem_interfaces+="  $interface:
    RX dropped: $rx_dropped
    TX dropped: $tx_dropped
    CRC errors: $crc_errors
"
            fi
        fi
    done <<< "$interfaces"

    # Determine severity
    local severity="INFO"
    if [ "$total_rx_dropped" -gt 10000 ] || [ "$total_tx_dropped" -gt 10000 ] || [ "$total_crc_errors" -gt 1000 ]; then
        severity="CRITICAL"
    elif [ "$total_rx_dropped" -gt 1000 ] || [ "$total_tx_dropped" -gt 1000 ] || [ "$total_crc_errors" -gt 100 ]; then
        severity="WARNING"
    fi

    if [ "$has_issues" = true ] || [ "$severity" != "INFO" ]; then
        add_finding "$severity" "🔴 Network Interface Errors Detected" \
            "Total across all interfaces:
  • RX packets dropped: $total_rx_dropped
  • TX packets dropped: $total_tx_dropped
  • RX errors: $total_rx_errors
  • TX errors: $total_tx_errors
  • CRC errors: $total_crc_errors

Problem interfaces:
$problem_interfaces" \
            "Network errors detected - may indicate hardware or driver issues:
  • Check interface: ethtool eth0
  • Check dmesg: dmesg | grep -i 'eth\|network'
  • High drops may indicate:
    - Network card failure
    - Driver issues
    - Switch/cable problems
    - Bandwidth saturation
  • CRC errors indicate:
    - Bad cable
    - EMI interference
    - Faulty NIC
  • If persistent: Replace network cable first, then NIC if needed"
    else
        # All healthy
        add_finding "INFO" "✅ Network Interfaces: Healthy" \
            "All network interfaces operating normally
Total interfaces checked: $(echo "$interfaces" | wc -l)
No significant packet drops or errors detected" \
            "Network hardware is functioning properly"
    fi
}

# Function to check PCI/PCIe errors
#
# Searches dmesg and the system log for PCI/PCIe/AER error lines. The system
# log is "$MESSAGES_CACHE" when that file exists, /var/log/messages otherwise.
# When any line matches, one finding is recorded with the total count and
# recent samples from both sources. Severity is CRITICAL when any matching
# line mentions "uncorrectable" or when more than 50 errors were found,
# WARNING otherwise. Nothing is recorded when no line matches.
#
# Globals:   MESSAGES_CACHE (read, optional), CYAN, NC (read)
# Arguments: none
check_pci_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking PCI/PCIe errors..."

    local dmesg_regex="pci.*error|pcie.*error|aer.*error|correctable.*error.*pci|uncorrectable.*error.*pci"
    local log_regex="pci.*error|pcie.*error|aer.*error"

    # Use the cached messages if available, the live log otherwise; counts
    # and samples are taken from the same source so they always agree
    local log_source="/var/log/messages"
    if [ -f "$MESSAGES_CACHE" ]; then
        log_source="$MESSAGES_CACHE"
    fi

    # Check for PCI errors in dmesg and logs
    local dmesg_hits=$(dmesg 2>/dev/null | grep -iE "$dmesg_regex")
    local log_hits=$(grep -iE "$log_regex" "$log_source" 2>/dev/null)

    # Count lines only for non-empty results (an empty string would count as 1)
    local pci_errors=0
    local log_pci_errors=0
    if [ -n "$dmesg_hits" ]; then
        pci_errors=$(printf '%s\n' "$dmesg_hits" | grep -c '')
    fi
    if [ -n "$log_hits" ]; then
        log_pci_errors=$(printf '%s\n' "$log_hits" | grep -c '')
    fi

    local total_pci_errors=$((pci_errors + log_pci_errors))

    if [ "$total_pci_errors" -gt 0 ]; then
        # Get samples from both sources
        local dmesg_samples=""
        local log_samples=""
        if [ -n "$dmesg_hits" ]; then
            dmesg_samples=$(printf '%s\n' "$dmesg_hits" | tail -5 | sed 's/^/  /')
        fi
        if [ -n "$log_hits" ]; then
            log_samples=$(printf '%s\n' "$log_hits" | tail -3 | sed 's/^/  /')
        fi

        # Check for uncorrectable errors (more serious) across every matching
        # line, not only the few shown as samples
        local uncorrectable=0
        if printf '%s\n%s\n' "$dmesg_hits" "$log_hits" | grep -qi "uncorrectable"; then
            uncorrectable=1
        fi

        local severity="WARNING"
        if [ "$uncorrectable" -eq 1 ] || [ "$total_pci_errors" -gt 50 ]; then
            severity="CRITICAL"
        fi

        add_finding "$severity" "🔴 PCI/PCIe Errors Detected" \
            "Total PCI errors: $total_pci_errors
Uncorrectable errors: $([ "$uncorrectable" -eq 1 ] && echo 'YES (CRITICAL!)' || echo 'No')

Recent errors from dmesg:
$dmesg_samples

${log_samples:+Recent errors from /var/log/messages:
$log_samples}" \
            "PCI/PCIe errors detected - may indicate hardware problems:
  • Uncorrectable errors = serious hardware issue
  • Correctable errors = potential signal integrity problems
  • Check details: dmesg | grep -i 'pci.*error'
  • Check PCIe link status: lspci -vv | grep -A 5 'LnkSta'
  • May indicate:
    - Faulty PCIe device (network card, RAID controller, etc.)
    - Motherboard issues
    - Power supply problems
    - Improper card seating
  • If persistent: Reseat cards, check for firmware updates
  • If uncorrectable: Replace failing hardware immediately"
    fi
}

# Function to check kernel parameters
#
# Reviews vm.swappiness, vm.dirty_ratio and the Transparent Huge Pages mode
# (only when `sysctl` is available) and, on non-virtual systems, the active
# I/O scheduler of every disk reported by lsblk. Each observation becomes a
# bullet line classified as warning or informational. One finding is then
# recorded: WARNING when any warning line exists, INFO when only
# informational lines exist, nothing when no value could be read.
#
# Globals:   IS_VIRTUAL, CYAN, NC (read)
# Arguments: none
check_kernel_parameters() {
    echo -e "${CYAN}[INFO]${NC} Checking kernel parameters..."

    local nl=$'\n'
    local warn_lines=""
    local info_lines=""

    if command_exists sysctl; then
        # vm.swappiness (should be 1-10 for servers)
        local swap_val
        swap_val=$(sysctl -n vm.swappiness 2>/dev/null)
        if [ -n "$swap_val" ]; then
            if [ "$swap_val" -gt 60 ]; then
                warn_lines+="  • vm.swappiness=$swap_val (HIGH - should be 1-10 for servers)${nl}"
            elif [ "$swap_val" -gt 10 ]; then
                info_lines+="  • vm.swappiness=$swap_val (consider lowering to 1-10 for better performance)${nl}"
            else
                info_lines+="  • vm.swappiness=$swap_val ✅${nl}"
            fi
        fi

        # vm.dirty_ratio (should be 10-20)
        local dirty_val
        dirty_val=$(sysctl -n vm.dirty_ratio 2>/dev/null)
        if [ -n "$dirty_val" ]; then
            if [ "$dirty_val" -gt 40 ]; then
                warn_lines+="  • vm.dirty_ratio=$dirty_val (HIGH - may cause stalls, recommended: 10-20)${nl}"
            elif [ "$dirty_val" -lt 10 ]; then
                info_lines+="  • vm.dirty_ratio=$dirty_val (low - may impact write performance)${nl}"
            else
                info_lines+="  • vm.dirty_ratio=$dirty_val ✅${nl}"
            fi
        fi

        # Transparent Huge Pages (should be never or madvise for databases);
        # the active mode is the bracketed word in the sysfs file
        local thp_mode
        thp_mode=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null | grep -oP '\[\K[^\]]+')
        if [ -n "$thp_mode" ]; then
            if [ "$thp_mode" = "always" ]; then
                warn_lines+="  • Transparent Huge Pages=always (can cause latency spikes for databases)${nl}"
                warn_lines+="    Recommended: echo never > /sys/kernel/mm/transparent_hugepage/enabled${nl}"
            else
                info_lines+="  • Transparent Huge Pages=$thp_mode ✅${nl}"
            fi
        fi
    fi

    # I/O schedulers are only reviewed on physical servers
    if [ "$IS_VIRTUAL" != true ]; then
        local disk_names
        disk_names=$(lsblk -nd -o NAME,TYPE 2>/dev/null | awk '$2=="disk" {print $1}')
        if [ -n "$disk_names" ]; then
            local disk
            while IFS= read -r disk; do
                local active_sched rota_flag
                active_sched=$(cat "/sys/block/$disk/queue/scheduler" 2>/dev/null | grep -oP '\[\K[^\]]+')
                rota_flag=$(cat "/sys/block/$disk/queue/rotational" 2>/dev/null)

                # Both values are needed to judge the disk
                if [ -z "$active_sched" ] || [ -z "$rota_flag" ]; then
                    continue
                fi

                # Classify the disk, then decide whether its scheduler is one
                # of the accepted choices for that class
                local disk_kind suggested sched_ok=false
                if [[ "$disk" == nvme* ]]; then
                    # NVMe should use 'none'
                    disk_kind="NVMe"
                    suggested="none"
                    case "$active_sched" in
                        none) sched_ok=true ;;
                    esac
                elif [ "$rota_flag" = "0" ]; then
                    # SSD should use mq-deadline or none
                    disk_kind="SSD"
                    suggested="mq-deadline"
                    case "$active_sched" in
                        mq-deadline|none|deadline) sched_ok=true ;;
                    esac
                else
                    # HDD should use mq-deadline or deadline
                    disk_kind="HDD"
                    suggested="mq-deadline"
                    case "$active_sched" in
                        mq-deadline|deadline) sched_ok=true ;;
                    esac
                fi

                if [ "$sched_ok" = true ]; then
                    info_lines+="  • /dev/$disk ($disk_kind): scheduler=$active_sched ✅${nl}"
                else
                    info_lines+="  • /dev/$disk ($disk_kind): scheduler=$active_sched (consider '$suggested' for $disk_kind)${nl}"
                fi
            done <<< "$disk_names"
        fi
    fi

    # Generate finding based on what we found
    if [ -n "$warn_lines" ]; then
        add_finding "WARNING" "⚠️  Kernel Parameters: Sub-Optimal Configuration" \
            "Performance-impacting kernel parameters detected:

$warn_lines
${info_lines:+
Informational:
$info_lines}" \
            "Kernel parameters affect system performance and stability:
  • vm.swappiness: Controls swap usage (1-10 for servers)
    - Fix: sysctl -w vm.swappiness=10
    - Permanent: echo 'vm.swappiness=10' >> /etc/sysctl.conf
  • vm.dirty_ratio: Controls dirty page cache
    - Fix: sysctl -w vm.dirty_ratio=15
  • Transparent Huge Pages: Can cause latency for databases
    - Fix: echo never > /sys/kernel/mm/transparent_hugepage/enabled
  • I/O Scheduler: Affects disk performance
    - NVMe: echo none > /sys/block/nvme0n1/queue/scheduler
    - SSD: echo mq-deadline > /sys/block/sda/queue/scheduler"
    elif [ -n "$info_lines" ]; then
        add_finding "INFO" "ℹ️  Kernel Parameters: Configuration Status" \
            "Current kernel parameters:

$info_lines" \
            "Kernel parameters are within acceptable ranges. Minor optimizations may be possible."
    fi
}

# Function to generate report
generate_report() {
    local report_content=""

    # Count findings by severity
    local critical_count=0
    local warning_count=0
    local info_count=0

    for finding in "${FINDINGS[@]}"; do
        local severity=$(echo "$finding" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
        case "$severity" in
            CRITICAL) critical_count=$((critical_count + 1)) ;;
            WARNING) warning_count=$((warning_count + 1)) ;;
            INFO) info_count=$((info_count + 1)) ;;
        esac
    done

    report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
    report_content+="║                    HARDWARE HEALTH CHECK REPORT                              ║"$'\n'
    report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
    report_content+=""$'\n'
    report_content+="Date: $(date '+%Y-%m-%d %H:%M:%S')"$'\n'
    report_content+="System: $SYS_HOSTNAME"$'\n'
    report_content+="Control Panel: $SYS_PANEL ${SYS_PANEL_VER:-unknown}"$'\n'
    report_content+="OS: $SYS_OS ${SYS_OS_VER:-unknown}"$'\n'
    report_content+=""$'\n'

    # VISUAL SEVERITY SUMMARY - Make issues OBVIOUS
    report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
    if [ "$critical_count" -gt 0 ]; then
        report_content+="║  🔴 CRITICAL ISSUES DETECTED - IMMEDIATE ACTION REQUIRED                    ║"$'\n'
    elif [ "$warning_count" -gt 0 ]; then
        report_content+="║  🟡 WARNING - Hardware Issues Detected                                      ║"$'\n'
    else
        report_content+="║  ✅ ALL HARDWARE CHECKS PASSED - System Healthy                             ║"$'\n'
    fi
    report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
    report_content+=""$'\n'

    # Severity breakdown
    report_content+="FINDINGS SUMMARY:"$'\n'
    report_content+="──────────────────────────────────────────────────────────────────────────────"$'\n'
    if [ "$critical_count" -gt 0 ]; then
        report_content+="  🔴 CRITICAL: $critical_count issue(s) - URGENT ATTENTION REQUIRED"$'\n'
    fi
    if [ "$warning_count" -gt 0 ]; then
        report_content+="  🟡 WARNING:  $warning_count issue(s) - Review and plan action"$'\n'
    fi
    report_content+="  ℹ️  INFO:     $info_count item(s) - Status information"$'\n'
    report_content+=""$'\n'

    # If critical issues, list them prominently at the top
    if [ "$critical_count" -gt 0 ]; then
        report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
        report_content+="║  🚨 CRITICAL ISSUES REQUIRING IMMEDIATE ATTENTION                           ║"$'\n'
        report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
        report_content+=""$'\n'

        local critical_num=1
        for finding in "${FINDINGS[@]}"; do
            local severity=$(echo "$finding" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
            if [ "$severity" = "CRITICAL" ]; then
                local title=$(echo "$finding" | sed 's/^\[[^]]*\] //' | sed 's/@@@SEP@@@.*//')
                report_content+="  $critical_num. $title"$'\n'
                critical_num=$((critical_num + 1))
            fi
        done
        report_content+=""$'\n'
        report_content+="  ⚠️  SEE DETAILED FINDINGS BELOW FOR SPECIFIC ACTIONS TO TAKE"$'\n'
        report_content+=""$'\n'
    fi

    report_content+="=============================================================================="$'\n'
    report_content+=""$'\n'

    # Group findings by category
    local -A categories
    categories["DISK"]=""
    categories["MEMORY"]=""
    categories["CPU"]=""
    categories["RAID"]=""
    categories["OTHER"]=""

    for finding in "${FINDINGS[@]}"; do
        # Split by @@@SEP@@@ delimiter
        local severity_title="${finding%%@@@SEP@@@*}"
        local temp="${finding#*@@@SEP@@@}"
        local details="${temp%%@@@SEP@@@*}"
        local recommendation="${temp#*@@@SEP@@@}"

        # Extract severity from [SEVERITY] Title format
        local severity=$(echo "$severity_title" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
        local title=$(echo "$severity_title" | sed 's/^\[[^]]*\] //')

        local category="OTHER"
        if [[ "$title" == *"Disk"* ]] || [[ "$title" == *"SMART"* ]] || [[ "$title" == *"I/O"* ]]; then
            category="DISK"
        elif [[ "$title" == *"Memory"* ]] || [[ "$title" == *"ECC"* ]]; then
            category="MEMORY"
        elif [[ "$title" == *"CPU"* ]] || [[ "$title" == *"MCE"* ]]; then
            category="CPU"
        elif [[ "$title" == *"RAID"* ]]; then
            category="RAID"
        fi

        local entry=""
        entry+="[$severity] $title"$'\n'
        entry+="$details"$'\n'
        if [ -n "$recommendation" ]; then
            entry+="Recommendation:"$'\n'
            entry+="$recommendation"$'\n'
        fi
        entry+=""$'\n'
        entry+="------------------------------------------------------------------------------"$'\n'
        entry+=""$'\n'

        categories[$category]+="$entry"
    done

    # Output sections
    if [ -n "${categories[DISK]}" ]; then
        report_content+="=============================================================================="$'\n'
        report_content+="DISK HEALTH & SMART STATUS"$'\n'
        report_content+="=============================================================================="$'\n'
        report_content+=""$'\n'
        report_content+="${categories[DISK]}"
    fi

    if [ -n "${categories[MEMORY]}" ]; then
        report_content+="=============================================================================="$'\n'
        report_content+="MEMORY HEALTH"$'\n'
        report_content+="=============================================================================="$'\n'
        report_content+=""$'\n'
        report_content+="${categories[MEMORY]}"
    fi

    if [ -n "${categories[CPU]}" ]; then
        report_content+="=============================================================================="$'\n'
        report_content+="CPU HEALTH"$'\n'
        report_content+="=============================================================================="$'\n'
        report_content+=""$'\n'
        report_content+="${categories[CPU]}"
    fi

    if [ -n "${categories[RAID]}" ]; then
        report_content+="=============================================================================="$'\n'
        report_content+="RAID STATUS"$'\n'
        report_content+="=============================================================================="$'\n'
        report_content+=""$'\n'
        report_content+="${categories[RAID]}"
    fi

    if [ -n "${categories[OTHER]}" ]; then
        report_content+="=============================================================================="$'\n'
        report_content+="OTHER HARDWARE FINDINGS"$'\n'
        report_content+="=============================================================================="$'\n'
        report_content+=""$'\n'
        report_content+="${categories[OTHER]}"
    fi

    report_content+="=============================================================================="$'\n'
    report_content+="NEXT STEPS"$'\n'
    report_content+="=============================================================================="$'\n'
    report_content+=""$'\n'
    report_content+="Priority Actions:"$'\n'
    report_content+="  1. Address any CRITICAL issues immediately"$'\n'
    report_content+="  2. Monitor WARNING issues closely"$'\n'
    report_content+="  3. Schedule regular hardware health checks"$'\n'
    report_content+=""$'\n'
    report_content+="Additional Analysis Available:"$'\n'
    report_content+="  • System Health Check (Main Menu) for overall server health"$'\n'
    report_content+="  • Disk I/O Analyzer (Main Menu → Performance) for disk performance"$'\n'
    report_content+=""$'\n'
    report_content+="Report saved to: $REPORT_FILE"$'\n'
    report_content+=""$'\n'

    echo "$report_content"
    echo "$report_content" > "$REPORT_FILE"
}

# Main execution
#
# Runs every hardware check in order, prints the detailed report followed by
# an executive summary, then terminates the script with a severity-based
# exit status.
#
# Globals:
#   FINDINGS        (read)    entries shaped like
#                             "[SEVERITY] Title@@@SEP@@@details@@@SEP@@@recommendation"
#   MESSAGES_CACHE  (written) private snapshot of /var/log/messages
#                             (presumably consumed by the check_* functions)
#   REPORT_FILE     (read)    path shown to the user at the end
#   RED, GREEN, YELLOW, CYAN, MAGENTA, BOLD, NC (read)
# Arguments:
#   none
# Exits:
#   0 = healthy (INFO findings only), 1 = warnings detected,
#   2 = critical issues detected
main() {
    show_banner
    echo -e "${MAGENTA}${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${MAGENTA}${BOLD}║         🔧 HARDWARE HEALTH CHECK - Deep Analysis            ║${NC}"
    echo -e "${MAGENTA}${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    # Detect virtualization FIRST (affects which checks to run)
    echo -e "${CYAN}[INFO]${NC} Detecting environment (physical vs virtual)..."
    detect_virtualization
    echo ""

    echo -e "${CYAN}Performing comprehensive hardware diagnostics...${NC}"
    echo -e "${CYAN}Checks: Disks (SMART/NVMe/Age), Memory (ECC), CPU (Thermal), RAID, Filesystem, Fans, PCI, Network, Kernel${NC}"
    echo ""

    # OPTIMIZATION: Cache /var/log/messages once (avoid 32 separate grep calls)
    # Note: Using temp file instead of variable to avoid "Argument list too long" errors
    echo -e "${CYAN}[INFO]${NC} Caching system logs for analysis..."
    # mktemp gives an unpredictable, exclusively-created file; a fixed
    # "/tmp/..._$$" name could be pre-created as a symlink by another user and
    # clobbered when this tool runs with elevated privileges.
    if MESSAGES_CACHE=$(mktemp /tmp/hw_health_messages_cache_XXXXXX 2>/dev/null); then
        # Cleanup cache on exit (single quotes: path is expanded, quoted, at exit time)
        trap 'rm -f -- "$MESSAGES_CACHE"' EXIT
        if [ -f /var/log/messages ]; then
            cat /var/log/messages 2>/dev/null > "$MESSAGES_CACHE"
        fi
    else
        # No temp file could be created: fall back to an always-empty "log".
        # Deliberately no cleanup trap here - /dev/null must never be removed.
        MESSAGES_CACHE="/dev/null"
    fi

    # Run diagnostics with progress indicators
    echo -e "${YELLOW}[1/11]${NC} Analyzing disk SMART status and predictive failure indicators..."
    check_disk_smart

    echo -e "${YELLOW}[2/11]${NC} Checking memory health (ECC errors, OOM events, swap usage)..."
    check_memory_health

    echo -e "${YELLOW}[3/11]${NC} Monitoring CPU health (temperature, throttling, MCE errors)..."
    check_cpu_health

    echo -e "${YELLOW}[4/11]${NC} Scanning system hardware error logs..."
    check_hardware_errors

    echo -e "${YELLOW}[5/11]${NC} Verifying RAID array status..."
    check_raid_status

    echo -e "${YELLOW}[6/11]${NC} Analyzing disk I/O errors..."
    check_disk_io_errors

    echo -e "${YELLOW}[7/11]${NC} Checking for filesystem errors..."
    check_filesystem_errors

    echo -e "${YELLOW}[8/11]${NC} Monitoring system fans..."
    check_system_fans

    echo -e "${YELLOW}[9/11]${NC} Checking for PCI/PCIe errors..."
    check_pci_errors

    echo -e "${YELLOW}[10/11]${NC} Checking network interface errors..."
    check_network_errors

    echo -e "${YELLOW}[11/11]${NC} Validating kernel parameters..."
    check_kernel_parameters

    echo ""
    echo -e "${GREEN}[✓]${NC} Hardware diagnostics complete!"
    echo ""

    # Generate and display report
    echo -e "${CYAN}Generating detailed report...${NC}"
    echo ""
    generate_report

    # EXECUTIVE SUMMARY - Quick status overview
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║             EXECUTIVE SUMMARY - Component Status            ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    # Per-component status icons; every component starts healthy.
    # Declared only after the checks have run so that helper functions
    # assigning same-named globals cannot interfere with these locals.
    local -A comp_status=(
        [disk]="✅" [memory]="✅" [cpu]="✅" [raid]="✅" [fs]="✅"
        [fan]="✅" [pci]="✅" [network]="✅" [kernel]="✅"
    )
    local overall="HEALTHY"
    local critical_count=0
    local warning_count=0
    local finding header severity title component

    # Single pass over the findings: classify, count, and escalate.
    for finding in "${FINDINGS[@]}"; do
        # Header is everything before the first separator: "[SEVERITY] Title"
        header="${finding%%@@@SEP@@@*}"

        severity=""
        if [[ "$header" == "["*"]"* ]]; then
            severity="${header#"["}"
            severity="${severity%%"]"*}"
        fi
        title="${header#"["*"] "}"

        # Categorize by component (first matching keyword group wins).
        # "I/O" is treated as a disk finding to agree with the report grouping.
        component=""
        if [[ "$title" == *"Disk"* || "$title" == *"SMART"* || "$title" == *"DRIVE"* || "$title" == *"I/O"* ]]; then
            component="disk"
        elif [[ "$title" == *"Memory"* || "$title" == *"ECC"* || "$title" == *"RAM"* ]]; then
            component="memory"
        elif [[ "$title" == *"CPU"* || "$title" == *"thermal"* || "$title" == *"temperature"* ]]; then
            component="cpu"
        elif [[ "$title" == *"RAID"* ]]; then
            component="raid"
        elif [[ "$title" == *"Filesystem"* || "$title" == *"read-only"* ]]; then
            component="fs"
        elif [[ "$title" == *"Fan"* || "$title" == *"fan"* ]]; then
            component="fan"
        elif [[ "$title" == *"PCI"* || "$title" == *"PCIe"* ]]; then
            component="pci"
        elif [[ "$title" == *"Network"* || "$title" == *"Interface"* ]]; then
            component="network"
        elif [[ "$title" == *"Kernel"* || "$title" == *"Parameter"* ]]; then
            component="kernel"
        fi

        # Severity always feeds the overall status and the counters, even when
        # the title matched no component; otherwise an uncategorized CRITICAL
        # finding would be reported as HEALTHY with exit status 0.
        case "$severity" in
            CRITICAL)
                critical_count=$((critical_count + 1))
                overall="CRITICAL"
                if [ -n "$component" ]; then
                    comp_status[$component]="🔴"
                fi
                ;;
            WARNING)
                warning_count=$((warning_count + 1))
                if [ "$overall" = "HEALTHY" ]; then
                    overall="WARNING"
                fi
                # A warning never downgrades a component already marked critical
                if [ -n "$component" ] && [ "${comp_status[$component]}" != "🔴" ]; then
                    comp_status[$component]="🟡"
                fi
                ;;
        esac
    done

    # Display component summary
    echo -e "  Disks/Storage: ${comp_status[disk]}    Memory: ${comp_status[memory]}    CPU: ${comp_status[cpu]}    RAID: ${comp_status[raid]}"
    echo -e "  Filesystem: ${comp_status[fs]}    Fans: ${comp_status[fan]}    PCI/PCIe: ${comp_status[pci]}"
    echo -e "  Network: ${comp_status[network]}    Kernel: ${comp_status[kernel]}"
    echo ""

    # Overall status
    if [ "$overall" = "CRITICAL" ]; then
        echo -e "  ${RED}${BOLD}Overall Status: 🔴 CRITICAL - $critical_count issue(s) require IMMEDIATE action!${NC}"
    elif [ "$overall" = "WARNING" ]; then
        echo -e "  ${YELLOW}${BOLD}Overall Status: 🟡 WARNING - $warning_count issue(s) detected${NC}"
    else
        echo -e "  ${GREEN}${BOLD}Overall Status: ✅ HEALTHY - All systems operating normally${NC}"
    fi

    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""
    echo -e "${CYAN}Full report saved to:${NC} ${BOLD}$REPORT_FILE${NC}"
    echo ""

    press_enter

    # Severity-based exit codes for monitoring system integration
    # exit 0 = healthy (INFO only)
    # exit 1 = warnings detected
    # exit 2 = critical issues detected
    case "$overall" in
        CRITICAL) exit 2 ;;
        WARNING) exit 1 ;;
        *) exit 0 ;;
    esac
}

# Script entry point: run the full health check.
# main does not return; it ends the script via `exit` with status
# 0 (healthy), 1 (warnings) or 2 (critical issues).
main