#!/bin/bash # Hardware Health Check # Comprehensive hardware diagnostics including SMART, memory, CPU, and sensors # Get the script's directory SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TOOLKIT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" # Source required libraries source "$TOOLKIT_ROOT/lib/common-functions.sh" source "$TOOLKIT_ROOT/lib/system-detect.sh" source "$TOOLKIT_ROOT/lib/reference-db.sh" # Initialize system detection detect_system # Load system info from reference database if [ -f "$TOOLKIT_ROOT/.sysref" ]; then SYS_HOSTNAME=$(grep "^SYS|HOSTNAME|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3) SYS_PANEL=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3) SYS_PANEL_VER=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4) SYS_OS=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3) SYS_OS_VER=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4) fi # Color definitions RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' MAGENTA='\033[0;35m' CYAN='\033[0;36m' BOLD='\033[1m' NC='\033[0m' # Report file REPORT_FILE="/tmp/hardware_health_report_$(date +%Y%m%d_%H%M%S).txt" # Analysis results storage declare -a FINDINGS=() # Function to add finding add_finding() { [ -z "$1" ] || [ -z "$2" ] && return 1 local severity="$1" local title="$2" local details="$3" local recommendation="$4" # Use @@@SEP@@@ as separator to avoid conflicts with content FINDINGS+=("[$severity] $title@@@SEP@@@$details@@@SEP@@@$recommendation") } # Function to check if command exists command_exists() { [ -z "$1" ] && return 1 command -v "$1" &>/dev/null } # Global variables for virtualization detection IS_VIRTUAL=false VIRT_TYPE="physical" # Function to detect virtualization detect_virtualization() { IS_VIRTUAL=false VIRT_TYPE="physical" # Try systemd-detect-virt first (most reliable) if command_exists systemd-detect-virt; then local 
detected=$(systemd-detect-virt 2>/dev/null) if [ -n "$detected" ] && [ "$detected" != "none" ]; then IS_VIRTUAL=true VIRT_TYPE="$detected" fi # Fallback: check dmidecode elif command_exists dmidecode; then local product=$(dmidecode -s system-product-name 2>/dev/null) if echo "$product" | grep -qiE "kvm|qemu|vmware|virtualbox|xen|hyperv"; then IS_VIRTUAL=true VIRT_TYPE=$(echo "$product" | grep -oiE "kvm|qemu|vmware|virtualbox|xen|hyperv" | head -1) fi fi # Add finding if virtual if [ "$IS_VIRTUAL" = true ]; then add_finding "INFO" "ℹ️ Virtual Machine Detected" \ "Environment: $VIRT_TYPE Hardware checks adapted for virtual machine: • SMART disk checks: SKIPPED (VMs use virtual disks) • Fan monitoring: SKIPPED (hypervisor controls physical fans) • Some sensors: SKIPPED (not accessible in VM) • Memory/CPU/Network checks: ACTIVE (VM-compatible)" \ "This is normal for virtual machines. Hardware monitoring is limited to VM-accessible components." else add_finding "INFO" "ℹ️ Physical Server Detected" \ "Environment: Physical hardware All hardware health checks will be performed: • SMART disk monitoring • Fan speed monitoring • Temperature sensors • Memory ECC errors • CPU thermal monitoring • Network interface errors • Kernel parameters" \ "Full hardware monitoring enabled for physical server." fi } # Function to check SMART status with deep analysis check_disk_smart() { # Skip SMART checks on virtual machines (VMs use virtual disks) if [ "$IS_VIRTUAL" = true ]; then echo -e "${CYAN}[INFO]${NC} Skipping SMART checks (virtual machine - $VIRT_TYPE)" return 0 fi echo -e "${CYAN}[INFO]${NC} Checking disk SMART status..." if ! 
command_exists smartctl; then add_finding "INFO" "SMART Tools Not Installed" \ "smartmontools is not installed - cannot check disk health" \ "Install SMART tools: yum install smartmontools After installing, run: systemctl enable smartd && systemctl start smartd" return fi # Find all disks local disks=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1}') if [ -z "$disks" ]; then add_finding "WARNING" "No Disks Found" \ "Could not detect any disk devices" \ "Check system configuration: lsblk -a" return fi local disk_count=0 local healthy_count=0 local warning_count=0 local failed_count=0 local skipped_count=0 local skipped_raid=0 local skipped_virtual=0 local skipped_lvm=0 local skipped_other=0 for disk in $disks; do disk_count=$((disk_count + 1)) # Get device info to determine if SMART is applicable local device_info=$(smartctl -i "$disk" 2>&1) # COMPREHENSIVE DEVICE DETECTION - Works for ALL storage types # 1. CHECK: Device exists and smartctl can communicate if echo "$device_info" | grep -qiE "No such device|open device.*failed|Permission denied"; then echo -e "${CYAN}[INFO]${NC} Skipping $disk (device not accessible)" skipped_count=$((skipped_count + 1)) skipped_other=$((skipped_other + 1)) continue fi # 2. CHECK: SMART support availability if echo "$device_info" | grep -qiE "SMART support is:.*Unavailable|SMART support is:.*Disabled|Unable to detect device type"; then echo -e "${CYAN}[INFO]${NC} Skipping $disk (SMART not supported or disabled)" skipped_count=$((skipped_count + 1)) skipped_other=$((skipped_other + 1)) continue fi # 3. 
EXTRACT: Device type, model, vendor for intelligent detection local model=$(echo "$device_info" | grep -iE "Device Model:|Product:|Model Number:|Model Family:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs) local vendor=$(echo "$device_info" | grep -iE "Vendor:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs) local device_type=$(echo "$device_info" | grep -iE "Device type:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs) local serial=$(echo "$device_info" | grep -iE "Serial Number:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs) # Combine model and vendor for comprehensive matching local full_id="${vendor} ${model} ${device_type}" # 4. DETECT: Hardware RAID Controllers (all major brands) # These devices are RAID controller logical volumes, not physical disks if echo "$full_id" | grep -qiE "MR[0-9]{4}|LSI|MegaRAID|PERC|Avago|Broadcom.*RAID|HPE.*Smart Array|HP.*Smart Array|Dell.*RAID|IBM.*ServeRAID|Adaptec|3ware|Areca|HighPoint|Promise.*RAID"; then local raid_type="Hardware RAID Controller" local tools="Unknown RAID tools" # Identify specific RAID type and provide exact tools if echo "$full_id" | grep -qiE "MR[0-9]{4}|MegaRAID"; then raid_type="MegaRAID Controller" tools="megacli -LDInfo -Lall -aALL or storcli /c0 /vall show all" elif echo "$full_id" | grep -qiE "LSI|Avago|Broadcom"; then raid_type="LSI/Broadcom RAID Controller" tools="sas2ircu LIST or storcli show" elif echo "$full_id" | grep -qiE "PERC|Dell"; then raid_type="Dell PERC RAID Controller" tools="perccli /c0 /vall show all or omreport storage vdisk" elif echo "$full_id" | grep -qiE "HP|HPE.*Smart"; then raid_type="HP Smart Array Controller" tools="hpacucli ctrl all show config or ssacli ctrl all show config" elif echo "$full_id" | grep -qiE "Adaptec"; then raid_type="Adaptec RAID Controller" tools="arcconf getconfig 1" elif echo "$full_id" | grep -qiE "3ware"; then raid_type="3ware RAID Controller" tools="tw_cli info c0" fi echo -e "${CYAN}[INFO]${NC} Skipping $disk ($raid_type: $model)" 
skipped_count=$((skipped_count + 1)) skipped_raid=$((skipped_raid + 1)) add_finding "INFO" "ℹ️ $raid_type Detected: $disk" \ "Device: $disk Controller: $model Type: $raid_type SMART Status: Not applicable (logical volume from RAID controller) This is a logical volume presented by a hardware RAID controller. SMART data is not available for these devices - the controller manages the physical disks and presents them as a single logical volume. To monitor RAID health, use controller-specific tools: Command: $tools Physical disk health is monitored by the RAID controller itself. Check controller logs and status for drive failures." \ "Monitor RAID array health using controller tools, not SMART" continue fi # 5. DETECT: Virtual/Emulated Devices (VMs and containers) if echo "$full_id" | grep -qiE "QEMU|VirtIO|Virtual|VMware|Xen.*Virtual|Msft.*Virtual|Google.*Persistent|Amazon.*Elastic"; then local virt_type="Virtual Disk" if echo "$full_id" | grep -qiE "QEMU"; then virt_type="QEMU Virtual Disk (KVM)" elif echo "$full_id" | grep -qiE "VMware"; then virt_type="VMware Virtual Disk" elif echo "$full_id" | grep -qiE "VirtIO"; then virt_type="VirtIO Virtual Disk" elif echo "$full_id" | grep -qiE "Msft.*Virtual"; then virt_type="Hyper-V Virtual Disk" elif echo "$full_id" | grep -qiE "Xen"; then virt_type="Xen Virtual Disk" elif echo "$full_id" | grep -qiE "Google"; then virt_type="Google Persistent Disk" elif echo "$full_id" | grep -qiE "Amazon"; then virt_type="AWS EBS Volume" fi echo -e "${CYAN}[INFO]${NC} Skipping $disk ($virt_type)" skipped_count=$((skipped_count + 1)) skipped_virtual=$((skipped_virtual + 1)) # Already handled by VM detection at start of function continue fi # 6. 
DETECT: Software RAID / LVM / Device Mapper if echo "$disk" | grep -qE "md[0-9]|dm-[0-9]"; then echo -e "${CYAN}[INFO]${NC} Skipping $disk (software RAID/LVM device)" skipped_count=$((skipped_count + 1)) skipped_lvm=$((skipped_lvm + 1)) add_finding "INFO" "ℹ️ Software RAID/LVM Detected: $disk" \ "Device: $disk Type: Software RAID or LVM logical volume This is a logical device managed by the kernel (mdadm or LVM). SMART monitoring should be performed on the underlying physical disks. For software RAID (md devices): • Check RAID status: cat /proc/mdstat • Monitor physical disks: smartctl -a /dev/sd[X] For LVM (dm- devices): • Check LV status: lvdisplay • Monitor physical volumes: pvdisplay • Check underlying disks: smartctl -a /dev/sd[X]" \ "Monitor underlying physical disks, not the logical volume" continue fi # 7. DETECT: Loop devices, RAM disks, other special devices if echo "$disk" | grep -qE "loop|ram|zram|nbd"; then echo -e "${CYAN}[INFO]${NC} Skipping $disk (special device: loop/ram/network)" skipped_count=$((skipped_count + 1)) skipped_other=$((skipped_other + 1)) continue fi # 8. FINAL CHECK: Is this a real disk with SMART data? # Try to get SMART attributes - if this fails, skip if ! smartctl -A "$disk" &>/dev/null; then echo -e "${CYAN}[INFO]${NC} Skipping $disk (no SMART attributes available)" skipped_count=$((skipped_count + 1)) skipped_other=$((skipped_other + 1)) add_finding "INFO" "ℹ️ Device Without SMART: $disk" \ "Device: $disk Model: ${model:-Unknown} This device does not provide SMART attributes. 
Common reasons: • USB-connected drives (SMART data not passed through) • Some hardware RAID configurations • Older drives without SMART support • Passthrough issues in virtual environments If this is a critical disk, verify health through other means: • Check dmesg for errors: dmesg | grep -i '$disk' • Monitor I/O errors: iostat -x $disk • Check filesystem errors: mount | grep $disk" \ "Monitor through system logs and I/O statistics" continue fi # Get SMART health status local health=$(smartctl -H "$disk" 2>/dev/null | grep -i "SMART overall-health" | awk '{print $NF}') # Get disk model and serial local model=$(smartctl -i "$disk" 2>/dev/null | grep "Device Model" | sed 's/Device Model:[ ]*//') [ -z "$model" ] && model=$(smartctl -i "$disk" 2>/dev/null | grep "Product:" | sed 's/Product:[ ]*//') local serial=$(smartctl -i "$disk" 2>/dev/null | grep "Serial Number" | sed 's/Serial Number:[ ]*//') # Get ALL SMART data at once (optimize - single call instead of multiple) local smart_data=$(smartctl -A "$disk" 2>/dev/null) # Get key SMART attributes with deep parsing local reallocated=$(echo "$smart_data" | grep "Reallocated_Sector" | awk '{print $10}') local pending=$(echo "$smart_data" | grep "Current_Pending_Sector" | awk '{print $10}') local uncorrectable=$(echo "$smart_data" | grep "Offline_Uncorrectable" | awk '{print $10}') local temp=$(echo "$smart_data" | grep "Temperature_Celsius" | awk '{print $10}') local power_on=$(echo "$smart_data" | grep "Power_On_Hours" | awk '{print $10}') # Additional critical attributes for predictive failure local read_error_rate=$(echo "$smart_data" | grep "Raw_Read_Error_Rate" | awk '{print $10}') local spin_retry=$(echo "$smart_data" | grep "Spin_Retry_Count" | awk '{print $10}') local realloc_event=$(echo "$smart_data" | grep "Reallocated_Event_Count" | awk '{print $10}') local wear_leveling=$(echo "$smart_data" | grep "Wear_Leveling_Count" | awk '{print $10}') # DISK AGE ANALYSIS local disk_age_years=0 local age_warning="" if 
[ -n "$power_on" ] && [ "$power_on" -gt 0 ]; then disk_age_years=$((power_on / 8760)) # 8760 hours per year if [ "$disk_age_years" -ge 5 ]; then age_warning="⚠️ DISK AGE: $disk_age_years years old (REPLACE - expected lifespan: 3-5 years)" [ "$failure_risk" = "NONE" ] && failure_risk="MODERATE" elif [ "$disk_age_years" -ge 3 ]; then age_warning="ℹ️ DISK AGE: $disk_age_years years old (consider replacement soon)" fi fi # NVMe-SPECIFIC HEALTH (if NVMe drive) local is_nvme=false local nvme_wear="" local nvme_spare="" if [[ "$disk" == *"nvme"* ]]; then is_nvme=true # Get NVMe SMART data local nvme_smart=$(smartctl -A "$disk" 2>/dev/null) # Percentage Used (wear indicator) local percent_used=$(echo "$nvme_smart" | grep "Percentage Used" | awk '{print $3}' | tr -d '%') if [ -n "$percent_used" ] && [ "$percent_used" -gt 90 ]; then nvme_wear="⚠️ NVMe WEAR: ${percent_used}% used (CRITICAL - near end of life!)" failure_risk="HIGH" elif [ -n "$percent_used" ] && [ "$percent_used" -gt 80 ]; then nvme_wear="⚠️ NVMe WEAR: ${percent_used}% used (high wear - monitor closely)" [ "$failure_risk" = "NONE" ] && failure_risk="MODERATE" fi # Available Spare local avail_spare=$(echo "$nvme_smart" | grep "Available Spare" | awk '{print $3}' | tr -d '%') if [ -n "$avail_spare" ] && [ "$avail_spare" -lt 10 ]; then nvme_spare="⚠️ NVMe SPARE: ${avail_spare}% available spare (CRITICAL!)" failure_risk="HIGH" fi fi # Check for I/O errors in system logs (last 7 days) local disk_name=$(basename "$disk") local io_errors=$(grep -i "$disk_name.*error\|$disk_name.*failed\|ata.*$disk_name" /var/log/messages 2>/dev/null | wc -l) local recent_io_samples="" if [ "$io_errors" -gt 0 ]; then recent_io_samples=$(grep -i "$disk_name.*error\|$disk_name.*failed" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/ /') fi # PREDICTIVE FAILURE ANALYSIS - Make critical issues OBVIOUS local failure_risk="NONE" local risk_factors="" # CRITICAL: Immediate failure indicators if [ -n "$reallocated" ] && [ "$reallocated" 
-gt 50 ]; then failure_risk="IMMINENT" risk_factors+="⚠️ CRITICAL: $reallocated reallocated sectors (DRIVE FAILING SOON!)"$'\n' elif [ -n "$reallocated" ] && [ "$reallocated" -gt 10 ]; then failure_risk="HIGH" risk_factors+="⚠️ HIGH: $reallocated reallocated sectors (failure risk increasing)"$'\n' elif [ -n "$reallocated" ] && [ "$reallocated" -gt 0 ]; then failure_risk="MODERATE" risk_factors+="⚠️ MODERATE: $reallocated reallocated sectors detected"$'\n' fi if [ -n "$pending" ] && [ "$pending" -gt 10 ]; then failure_risk="IMMINENT" risk_factors+="⚠️ CRITICAL: $pending pending sectors (READ/WRITE FAILURES!)"$'\n' elif [ -n "$pending" ] && [ "$pending" -gt 0 ]; then [ "$failure_risk" = "NONE" ] && failure_risk="MODERATE" risk_factors+="⚠️ MODERATE: $pending pending sectors"$'\n' fi if [ -n "$uncorrectable" ] && [ "$uncorrectable" -gt 0 ]; then failure_risk="HIGH" risk_factors+="⚠️ HIGH: $uncorrectable uncorrectable sectors (data loss possible)"$'\n' fi # Temperature warnings if [ -n "$temp" ] && [ "$temp" -gt 55 ]; then [ "$failure_risk" = "NONE" ] && failure_risk="MODERATE" risk_factors+="⚠️ Temperature: ${temp}°C (OVERHEATING - threshold: 50°C)"$'\n' elif [ -n "$temp" ] && [ "$temp" -gt 50 ]; then risk_factors+="⚠️ Temperature: ${temp}°C (above recommended 50°C)"$'\n' fi # I/O errors from logs if [ "$io_errors" -gt 50 ]; then failure_risk="HIGH" risk_factors+="⚠️ HIGH: $io_errors I/O errors in last 7 days (hardware problem!)"$'\n' elif [ "$io_errors" -gt 10 ]; then [ "$failure_risk" = "NONE" ] && failure_risk="MODERATE" risk_factors+="⚠️ MODERATE: $io_errors I/O errors in last 7 days"$'\n' fi # Add disk age warning to risk factors [ -n "$age_warning" ] && risk_factors+="$age_warning"$'\n' # Add NVMe-specific warnings to risk factors [ -n "$nvme_wear" ] && risk_factors+="$nvme_wear"$'\n' [ -n "$nvme_spare" ] && risk_factors+="$nvme_spare"$'\n' # Determine severity and report # Be SMART about health status - only flag if explicitly FAILED if [[ "$health" =~ FAILED 
]]; then # SMART health check explicitly FAILED failed_count=$((failed_count + 1)) add_finding "CRITICAL" "🔴 DISK FAILURE: $disk - REPLACE IMMEDIATELY" \ "Device: $disk Model: $model Serial: $serial Health: FAILED ❌ SMART Status: FAILED Reallocated Sectors: ${reallocated:-N/A} Pending Sectors: ${pending:-N/A} Uncorrectable Sectors: ${uncorrectable:-N/A} Temperature: ${temp:-N/A}°C Power On Hours: ${power_on:-N/A} Recent I/O Errors (last 7 days): $io_errors ${recent_io_samples:+Recent errors from /var/log/messages: $recent_io_samples}" \ "🚨 IMMEDIATE ACTION REQUIRED - DISK FAILING: 1. BACKUP ALL DATA IMMEDIATELY (drive may fail at any moment) 2. Order replacement disk NOW 3. Plan maintenance window for replacement 4. Review SMART details: smartctl -a $disk 5. Check logs: grep -i '${disk_name}' /var/log/messages 6. If RAID: Verify array status and prepare for rebuild" elif [ "$failure_risk" = "IMMINENT" ]; then # Predictive: Drive will fail SOON failed_count=$((failed_count + 1)) add_finding "CRITICAL" "🔴 DRIVE FAILING SOON: $disk - REPLACE URGENTLY" \ "Device: $disk Model: $model Serial: $serial Health: $health (but critical attributes detected) ⚠️ FAILURE RISK: IMMINENT - Drive will likely fail within days/weeks Critical Issues: $risk_factors Power On Hours: ${power_on:-N/A} Recent I/O Errors (last 7 days): $io_errors ${recent_io_samples:+Recent errors from /var/log/messages: $recent_io_samples}" \ "🚨 URGENT - DRIVE REPLACEMENT REQUIRED: 1. Order replacement disk immediately 2. Ensure backups are current and verified 3. Plan replacement within 1-2 weeks (sooner if possible) 4. Monitor daily: smartctl -A $disk 5. Watch for increasing errors: grep -i '${disk_name}' /var/log/messages 6. 
Do NOT wait for complete failure - replace proactively" elif [ "$failure_risk" = "HIGH" ]; then # High risk of failure warning_count=$((warning_count + 1)) add_finding "WARNING" "🟡 HIGH FAILURE RISK: $disk - Plan Replacement" \ "Device: $disk Model: $model Serial: $serial Health: $health ⚠️ FAILURE RISK: HIGH - Replacement recommended Risk Factors: $risk_factors Temperature: ${temp:-N/A}°C Power On Hours: ${power_on:-N/A} Recent I/O Errors (last 7 days): $io_errors" \ "⚠️ PLAN DISK REPLACEMENT: • Order spare disk as precaution • Monitor weekly: smartctl -A $disk • Watch for deterioration in attributes • Ensure backups are current • Check logs regularly: grep -i '${disk_name}' /var/log/messages" elif [ "$failure_risk" = "MODERATE" ]; then # Moderate risk - monitor closely warning_count=$((warning_count + 1)) add_finding "WARNING" "🟡 Disk $disk: Warning Signs Detected" \ "Device: $disk Model: $model Serial: $serial Health: $health ⚠️ FAILURE RISK: MODERATE - Monitor closely Warning Signs: $risk_factors Temperature: ${temp:-N/A}°C Power On Hours: ${power_on:-N/A} Recent I/O Errors (last 7 days): $io_errors" \ "Monitor this disk closely: • Check SMART weekly: smartctl -A $disk • Watch for increasing reallocated/pending sectors • Monitor system logs: grep -i '${disk_name}' /var/log/messages • Ensure backups are current" else # Disk is healthy healthy_count=$((healthy_count + 1)) add_finding "INFO" "✅ Disk $disk: Healthy" \ "Device: $disk Model: $model Serial: $serial Health: $health ✅ SMART Attributes: Reallocated Sectors: ${reallocated:-0} Pending Sectors: ${pending:-0} Uncorrectable Sectors: ${uncorrectable:-0} Temperature: ${temp:-N/A}°C (optimal: <50°C) Power On Hours: ${power_on:-N/A} I/O Errors (7 days): $io_errors" \ "Disk is healthy - continue regular monitoring • Monthly SMART check recommended: smartctl -A $disk" fi done # Summary finding with skip breakdown local summary_details="Total devices found: $disk_count Physical disks monitored: $healthy_count 
healthy, $warning_count warning, $failed_count failed" if [ "$skipped_count" -gt 0 ]; then summary_details="${summary_details} Devices skipped (SMART not applicable): $skipped_count" if [ "$skipped_raid" -gt 0 ]; then summary_details="${summary_details} • Hardware RAID controllers: $skipped_raid (use vendor tools)" fi if [ "$skipped_lvm" -gt 0 ]; then summary_details="${summary_details} • Software RAID/LVM: $skipped_lvm (monitor underlying disks)" fi if [ "$skipped_virtual" -gt 0 ]; then summary_details="${summary_details} • Virtual/cloud disks: $skipped_virtual (managed by hypervisor)" fi if [ "$skipped_other" -gt 0 ]; then summary_details="${summary_details} • Other (USB/special): $skipped_other (see findings for details)" fi fi add_finding "INFO" "Disk Health Summary" \ "$summary_details" \ "Regular SMART monitoring recommended: smartctl -a /dev/[disk]" } # Function to check memory health with ECC error detection check_memory_health() { echo -e "${CYAN}[INFO]${NC} Checking memory health..." if ! 
command_exists dmidecode; then add_finding "INFO" "dmidecode Not Available" \ "dmidecode is not installed - cannot check memory details" \ "Install dmidecode: yum install dmidecode" return fi # Get memory information local total_slots=$(dmidecode -t memory 2>/dev/null | grep -c "Memory Device$") local populated_slots=$(dmidecode -t memory 2>/dev/null | grep -A 20 "Memory Device" | grep "Size:" | grep -cv "No Module Installed") # Get total memory local total_mem=$(free -h | grep "Mem:" | awk '{print $2}') local used_mem=$(free -h | grep "Mem:" | awk '{print $3}') local available_mem=$(free -h | grep "Mem:" | awk '{print $7}') # Check for ECC local ecc_support=$(dmidecode -t memory 2>/dev/null | grep "Error Correction Type" | head -1 | grep -v "None" | wc -l) local ecc_type=$(dmidecode -t memory 2>/dev/null | grep "Error Correction Type" | head -1 | sed 's/.*Error Correction Type:[ ]*//') # Check for memory errors in dmesg local mem_errors=$(dmesg | grep -i "memory error\|ecc error\|mcelog" | wc -l) # Check hardware errors in system log (last 7 days) local hw_mem_errors=$(grep -i "memory.*error\|ecc.*error\|edac.*error" /var/log/messages 2>/dev/null | wc -l) # Check for specific ECC error types local single_bit_errors=$(grep -i "single.*bit.*error\|correctable.*ecc" /var/log/messages 2>/dev/null | wc -l) local multi_bit_errors=$(grep -i "multi.*bit.*error\|uncorrectable.*ecc" /var/log/messages 2>/dev/null | wc -l) # Check for OOM killer events local oom_events=$(grep -i "out of memory\|oom.*kill\|invoked oom-killer" /var/log/messages 2>/dev/null | wc -l) local recent_oom="" if [ "$oom_events" -gt 0 ]; then recent_oom=$(grep -i "out of memory\|oom.*kill" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/ /') fi # Check swap usage (high swap can indicate memory pressure) local swap_total=$(free -h | grep "Swap:" | awk '{print $2}') local swap_used=$(free -h | grep "Swap:" | awk '{print $3}') local swap_pct=0 if [ "$swap_total" != "0B" ] && [ -n "$swap_total" ]; then 
swap_pct=$(free | grep "Swap:" | awk '{if ($2>0) print int($3/$2*100); else print 0}') fi # Try to identify bad memory module from ECC errors local bad_dimm="" if [ "$hw_mem_errors" -gt 0 ]; then # Look for EDAC messages that identify specific DIMMs bad_dimm=$(grep -i "edac.*dimm\|edac.*channel\|edac.*slot" /var/log/messages 2>/dev/null | tail -5 | sed 's/^/ /') if [ -z "$bad_dimm" ]; then # Try CE (Correctable Error) messages bad_dimm=$(grep -i "ce.*error.*channel\|ce.*error.*dimm" /var/log/messages 2>/dev/null | tail -5 | sed 's/^/ /') fi fi # Build memory details local mem_modules=$(dmidecode -t memory 2>/dev/null | grep -A 20 "Memory Device" | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:|Locator:" | sed 's/^[ \t]*/ /') # ANALYZE MEMORY HEALTH local mem_status="HEALTHY" local mem_risk="" # CRITICAL: Multi-bit ECC errors (uncorrectable) if [ "$multi_bit_errors" -gt 0 ]; then mem_status="CRITICAL" mem_risk+="🔴 CRITICAL: $multi_bit_errors UNCORRECTABLE ECC errors (multi-bit) - DATA CORRUPTION RISK!"$'\n' fi # HIGH: Excessive single-bit errors if [ "$single_bit_errors" -gt 100 ]; then mem_status="CRITICAL" mem_risk+="🔴 CRITICAL: $single_bit_errors correctable ECC errors (BAD DIMM - replace immediately!)"$'\n' elif [ "$single_bit_errors" -gt 20 ]; then [ "$mem_status" = "HEALTHY" ] && mem_status="WARNING" mem_risk+="🟡 WARNING: $single_bit_errors correctable ECC errors (faulty DIMM likely)"$'\n' elif [ "$single_bit_errors" -gt 0 ]; then [ "$mem_status" = "HEALTHY" ] && mem_status="INFO" mem_risk+="ℹ️ INFO: $single_bit_errors correctable ECC errors (monitor closely)"$'\n' fi # OOM killer events if [ "$oom_events" -gt 10 ]; then [ "$mem_status" = "HEALTHY" ] && mem_status="WARNING" mem_risk+="🟡 WARNING: $oom_events Out-Of-Memory events (insufficient RAM for workload!)"$'\n' elif [ "$oom_events" -gt 0 ]; then mem_risk+="ℹ️ INFO: $oom_events OOM events (consider adding RAM)"$'\n' fi # Swap thrashing if [ "$swap_pct" -gt 80 ]; then [ "$mem_status" = "HEALTHY" ] 
&& mem_status="WARNING" mem_risk+="🟡 WARNING: Swap ${swap_pct}% full (memory pressure - consider upgrade)"$'\n' elif [ "$swap_pct" -gt 50 ]; then mem_risk+="ℹ️ INFO: Swap ${swap_pct}% used (moderate memory pressure)"$'\n' fi # Generate findings based on analysis if [ "$mem_status" = "CRITICAL" ]; then local recent_errors=$(grep -i "memory.*error\|ecc.*error" /var/log/messages 2>/dev/null | tail -10 | sed 's/^/ /') add_finding "CRITICAL" "🔴 MEMORY FAILURE: Replace RAM Immediately" \ "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem) Slots: $populated_slots / $total_slots ECC Support: $([ "${ecc_support:-0}" -gt 0 ] && echo "Yes ($ecc_type)" || echo 'No') Swap Usage: $swap_used / $swap_total (${swap_pct}% used) 🔴 CRITICAL MEMORY ISSUES: $mem_risk Memory Errors Detected: • Total errors in logs: $hw_mem_errors • Single-bit (correctable): $single_bit_errors • Multi-bit (UNCORRECTABLE): $multi_bit_errors • OOM killer events: $oom_events ${bad_dimm:+Faulty Module Location: $bad_dimm } Recent errors from /var/log/messages: $recent_errors" \ "🚨 IMMEDIATE ACTION REQUIRED: 1. IDENTIFY BAD DIMM: Check logs above for slot/channel information 2. REPLACE FAULTY RAM: Order replacement immediately 3. RUN MEMTEST: Boot memtest86+ to identify bad module 4. CHECK ALL ERRORS: grep -i 'ecc\|edac' /var/log/messages | less 5. MONITOR CORRUPTION: Watch for application crashes, file corruption 6. 
If multi-bit errors: PLAN IMMEDIATE DOWNTIME for replacement Commands to identify faulty DIMM: • dmidecode -t memory (shows all slots) • grep -i edac /var/log/messages (shows which slot failing) • edac-util (if installed: yum install edac-utils)" elif [ "$mem_status" = "WARNING" ]; then local recent_errors=$(grep -i "memory.*error\|ecc.*error\|oom" /var/log/messages 2>/dev/null | tail -8 | sed 's/^/ /') add_finding "WARNING" "🟡 Memory Issues Detected - Action Required" \ "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem) Slots: $populated_slots / $total_slots ECC Support: $([ "${ecc_support:-0}" -gt 0 ] && echo "Yes ($ecc_type)" || echo 'No') Swap Usage: $swap_used / $swap_total (${swap_pct}% used) ⚠️ WARNING - Memory Issues: $mem_risk Memory Errors Detected: • Total errors in logs: $hw_mem_errors • Single-bit (correctable): $single_bit_errors • Multi-bit (UNCORRECTABLE): $multi_bit_errors • OOM killer events: $oom_events ${recent_oom:+Recent OOM Events: $recent_oom } ${bad_dimm:+Possible Faulty Module: $bad_dimm } Recent errors: $recent_errors" \ "⚠️ RECOMMENDED ACTIONS: • Monitor error rate: grep -i 'ecc\|memory error' /var/log/messages | wc -l • Check for increasing errors (run daily, compare counts) • If ECC errors increasing: Plan RAM replacement • If OOM events: Consider RAM upgrade or reduce workload • Review memory usage: free -h && top -o %MEM | head -15 For ECC errors: • Install monitoring: yum install edac-utils • Check status: edac-util -v • Identify DIMM: dmidecode -t memory | grep -A 20 'Memory Device'" else add_finding "INFO" "✅ Memory Health: No Issues Detected" \ "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem) Slots: $populated_slots / $total_slots ECC Support: $([ "${ecc_support:-0}" -gt 0 ] && echo "Yes ($ecc_type)" || echo 'No') Swap Usage: $swap_used / $swap_total (${swap_pct}% used) Memory Errors: None detected OOM Events: None detected ECC Errors: None detected Installed Modules: $mem_modules" \ 
"Memory appears healthy • Regular monitoring recommended if ECC supported • Watch for OOM events: grep -i 'oom' /var/log/messages" fi } # Function to check CPU health with thermal throttling detection check_cpu_health() { echo -e "${CYAN}[INFO]${NC} Checking CPU health..." # Get CPU info local cpu_model=$(grep "model name" /proc/cpuinfo | head -1 | cut -d':' -f2 | sed 's/^[ \t]*//') local cpu_cores=$(grep -c "^processor" /proc/cpuinfo) local cpu_threads=$(nproc) # Check for CPU errors in dmesg local cpu_errors=$(dmesg | grep -i "mce\|machine check\|cpu.*error" | wc -l) # Check system log for hardware errors local hw_cpu_errors=$(grep -iE "mce|machine check exception|cpu.*error" /var/log/messages 2>/dev/null | wc -l) # Check for thermal throttling events local throttle_events=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical|thermal.*shutdown" /var/log/messages 2>/dev/null | wc -l) local recent_throttle="" if [ "$throttle_events" -gt 0 ]; then recent_throttle=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/ /') fi # Get current CPU frequency and max frequency local cpu_freq="" local cpu_max_freq="" local freq_throttled=false if [ -f "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq" ]; then local freq_khz=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq 2>/dev/null) cpu_freq=$(awk "BEGIN {printf \"%.2f\", $freq_khz / 1000000}" 2>/dev/null)" GHz" # Check max frequency if [ -f "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq" ]; then local max_freq_khz=$(cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq 2>/dev/null) cpu_max_freq=$(awk "BEGIN {printf \"%.2f\", $max_freq_khz / 1000000}" 2>/dev/null)" GHz" # Check if significantly throttled (more than 20% below max) local throttle_pct=$(awk "BEGIN {if ($max_freq_khz > 0) print int((1 - $freq_khz/$max_freq_khz) * 100); else print 0}" 2>/dev/null) if [ "$throttle_pct" -gt 20 ]; then freq_throttled=true fi fi fi 
# Check CPU temperature with multiple methods local cpu_temp="N/A" local temp_value=0 local all_core_temps="" if command_exists sensors; then # Try to get all core temperatures all_core_temps=$(sensors 2>/dev/null | grep -E "Core [0-9]+:" | sed 's/^/ /') # Get highest core temperature cpu_temp=$(sensors 2>/dev/null | grep -E "Core [0-9]+:|temp1:" | grep -oP '\+\K[0-9.]+' | sort -n | tail -1) if [ -n "$cpu_temp" ]; then temp_value=${cpu_temp%.*} cpu_temp="${cpu_temp}°C" else cpu_temp="N/A" fi fi # Fallback: Check thermal zones if [ "$cpu_temp" = "N/A" ] && [ -d "/sys/class/thermal" ]; then for zone in /sys/class/thermal/thermal_zone*/temp; do if [ -f "$zone" ]; then local temp=$(cat "$zone" 2>/dev/null) if [ -n "$temp" ] && [ "$temp" -gt 0 ]; then temp_value=$((temp / 1000)) cpu_temp="${temp_value}°C" break fi fi done fi # Check load average local load_avg=$(uptime | awk -F'load average:' '{print $2}' | sed 's/^[ \t]*//') local load_1min=$(echo "$load_avg" | awk -F',' '{print $1}' | tr -d ' ') # Calculate load percentage local load_pct=0 if [ -n "$load_1min" ] && [ "$cpu_threads" -gt 0 ]; then load_pct=$(awk "BEGIN {printf \"%.0f\", ($load_1min / $cpu_threads) * 100}" 2>/dev/null) fi # ANALYZE CPU HEALTH local cpu_status="HEALTHY" local cpu_risk="" # CRITICAL: MCE/Hardware errors if [ "$hw_cpu_errors" -gt 0 ] || [ "$cpu_errors" -gt 0 ]; then cpu_status="CRITICAL" cpu_risk+="🔴 CRITICAL: $((cpu_errors + hw_cpu_errors)) Machine Check Exceptions (MCE) - HARDWARE FAILURE!"$'\n' fi # CRITICAL: Extreme overheating if [ "$temp_value" -gt 90 ]; then cpu_status="CRITICAL" cpu_risk+="🔴 CRITICAL: CPU temperature ${cpu_temp} - EXTREME OVERHEATING (damage risk!)"$'\n' elif [ "$temp_value" -gt 80 ]; then [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING" cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} - OVERHEATING (threshold: 80°C)"$'\n' elif [ "$temp_value" -gt 70 ]; then [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING" cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} 
- HIGH (normal: <70°C)"$'\n' fi # Thermal throttling if [ "$throttle_events" -gt 10 ]; then [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING" cpu_risk+="🟡 WARNING: $throttle_events thermal throttling events - COOLING PROBLEM!"$'\n' elif [ "$throttle_events" -gt 0 ]; then cpu_risk+="ℹ️ INFO: $throttle_events thermal throttling events detected"$'\n' fi # Frequency throttling if $freq_throttled; then [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING" cpu_risk+="🟡 WARNING: CPU frequency throttled (${cpu_freq} / ${cpu_max_freq} max) - thermal or power limiting"$'\n' fi # High sustained load if [ "$load_pct" -gt 200 ]; then cpu_risk+="ℹ️ INFO: Very high load (${load_pct}% of capacity) - server may be overloaded"$'\n' fi # Generate findings if [ "$cpu_status" = "CRITICAL" ]; then local recent_errors=$(grep -iE "mce|machine check|cpu.*error|thermal.*critical" /var/log/messages 2>/dev/null | tail -10 | sed 's/^/ /') add_finding "CRITICAL" "🔴 CPU CRITICAL: Hardware Failure or Overheating" \ "CPU Model: $cpu_model Cores: $cpu_cores (Threads: $cpu_threads) Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A}) Temperature: $cpu_temp ${temp_value:+(CRITICAL threshold: 80°C)} Load Average: $load_avg (${load_pct}% capacity) 🔴 CRITICAL CPU ISSUES: $cpu_risk Hardware Errors: • MCE/CPU errors: $((cpu_errors + hw_cpu_errors)) • Thermal throttling events: $throttle_events ${all_core_temps:+Individual Core Temperatures: $all_core_temps } ${recent_throttle:+Recent Thermal Events: $recent_throttle } Recent errors from logs: $recent_errors" \ "🚨 IMMEDIATE ACTION REQUIRED: 1. CHECK TEMPERATURE: If >90°C, shut down immediately to prevent damage! 2. COOLING SYSTEM: Check fans, heatsink, thermal paste 3. MCE ERRORS: Critical hardware failure - contact vendor/provider 4. CLEAN SYSTEM: Remove dust from fans and heatsinks 5. VERIFY AIRFLOW: Ensure proper case ventilation 6. 
MONITOR: Watch temps continuously: watch -n 2 sensors Commands: • View all temps: sensors • Check MCE details: dmesg | grep -i mce | less • Monitor throttling: grep -i thermal /var/log/messages • Check frequency: cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq" elif [ "$cpu_status" = "WARNING" ]; then add_finding "WARNING" "🟡 CPU Issues Detected - Cooling or Hardware Problem" \ "CPU Model: $cpu_model Cores: $cpu_cores (Threads: $cpu_threads) Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A}) Temperature: $cpu_temp Load Average: $load_avg (${load_pct}% capacity) ⚠️ WARNING - CPU Issues: $cpu_risk Monitoring: • Thermal throttling events: $throttle_events • Current temperature: $cpu_temp ${all_core_temps:+Individual Core Temperatures: $all_core_temps } ${recent_throttle:+Recent Thermal Events: $recent_throttle }" \ "⚠️ RECOMMENDED ACTIONS: • Clean cooling system (fans, heatsink) • Verify fan operation: sensors (check fan RPM) • Check case ventilation and airflow • Monitor temperature trends: watch -n 5 sensors • If throttling persists: Replace thermal paste or upgrade cooling • Consider reducing workload if temperature stays high Commands: • Monitor live: watch -n 2 sensors • Check throttling: grep -i thermal /var/log/messages • View frequencies: grep MHz /proc/cpuinfo" else add_finding "INFO" "✅ CPU Health: Normal Operation" \ "CPU Model: $cpu_model Cores: $cpu_cores (Threads: $cpu_threads) Current Frequency: ${cpu_freq:-N/A} ${cpu_max_freq:+(Max: ${cpu_max_freq})} Temperature: $cpu_temp ${temp_value:+(normal: <70°C)} Load Average: $load_avg (${load_pct}% capacity) Hardware Errors: None detected Thermal Throttling: None detected Frequency Throttling: None detected ${all_core_temps:+Individual Core Temperatures: $all_core_temps }" \ "CPU is operating normally • Regular temperature monitoring recommended • Monitor: sensors (if installed)" fi # Check if sensors are available for monitoring if ! 
# Print the log file the checks below should scan: the per-run copy named by
# $MESSAGES_CACHE when that file exists, otherwise /var/log/messages.
_hw_log_file() {
    if [ -n "${MESSAGES_CACHE:-}" ] && [ -f "$MESSAGES_CACHE" ]; then
        printf '%s\n' "$MESSAGES_CACHE"
    else
        printf '%s\n' "/var/log/messages"
    fi
}

# Print how many lines the string $1 holds (0 when it is empty).
_hw_count_lines() {
    if [ -z "$1" ]; then
        echo 0
    else
        grep -c '' <<< "$1"
    fi
}

# Function to check system hardware errors
# Scans the system log once for generic hardware/ATA/SCSI/I/O error lines and
# records a WARNING finding (count + last 10 lines) when any are present.
# Globals: MESSAGES_CACHE (read, optional), FINDINGS (written via add_finding)
check_hardware_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking system hardware error logs..."

    local log_file matches hw_errors recent_errors
    log_file=$(_hw_log_file)
    matches=$(grep -iE "hardware error|i/o error|ata.*error|scsi.*error" "$log_file" 2>/dev/null)
    hw_errors=$(_hw_count_lines "$matches")

    if [ "$hw_errors" -gt 0 ]; then
        recent_errors=$(tail -n 10 <<< "$matches" | sed 's/^/ /')
        add_finding "WARNING" "Hardware Errors in System Log" \
"Total hardware-related errors: $hw_errors

Recent errors (last 10):
$recent_errors" \
"Hardware errors detected in system logs:
 • Review full log: grep -iE 'hardware error|i/o error' /var/log/messages
 • Check dmesg: dmesg | grep -i error | tail -20
 • Identify failing component (disk, memory, CPU, etc.)
 • Run component-specific diagnostics
 • Contact hosting provider if persistent"
    fi
}

# Function to check RAID status
# Reports software RAID (mdstat) and MegaRAID (megacli) state.
# An md array counts as degraded when its member map contains "_"
# (e.g. [U_]) or when the array is listed as inactive.
# Globals: MDSTAT_FILE (read, optional override of /proc/mdstat)
check_raid_status() {
    echo -e "${CYAN}[INFO]${NC} Checking RAID status..."

    local mdstat="${MDSTAT_FILE:-/proc/mdstat}"
    local raid_found=false
    local raid_status degraded raid_info

    # Check for software RAID (mdadm)
    if [ -f "$mdstat" ] && grep -q "active" "$mdstat" 2>/dev/null; then
        raid_found=true
        raid_status=$(cat "$mdstat")
        # Only a member map made of U/_ characters counts; device names that
        # merely contain "_" next to [n] indices must not trigger the alarm.
        degraded=$(grep -cE '\[[U_]*_[U_]*\]|^[^ ]+ : inactive' <<< "$raid_status")

        if [ "$degraded" -gt 0 ]; then
            add_finding "CRITICAL" "Software RAID Degraded" \
"RAID array is degraded:
$raid_status" \
"RAID array degraded - immediate action required:
 • Check details: cat /proc/mdstat
 • Identify failed drive: mdadm --detail /dev/md*
 • Replace failed drive and rebuild array
 • Ensure backups are current"
        else
            add_finding "INFO" "Software RAID Status" \
                "$raid_status" \
                "Software RAID is healthy"
        fi
    fi

    # Check for hardware RAID (common controllers)
    if command_exists megacli; then
        raid_found=true
        raid_info=$(megacli -LDInfo -Lall -aALL 2>/dev/null | grep -E "State|Virtual Drive")
        add_finding "INFO" "MegaRAID Status" \
            "$raid_info" \
            "Check details: megacli -LDInfo -Lall -aALL"
    fi

    if [ "$raid_found" = false ]; then
        add_finding "INFO" "No RAID Detected" \
            "No software or hardware RAID arrays detected" \
            "System appears to use non-RAID storage"
    fi
}

# Function to check disk I/O errors
# Reads the kernel ring buffer once and records a WARNING finding when it
# contains block-layer I/O error lines.
check_disk_io_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking disk I/O errors..."

    local matches io_errors recent_io_errors
    # dmesg may be unreadable for unprivileged users; treat that as "no data".
    matches=$(dmesg 2>/dev/null | grep -iE "i/o error|blk_update_request|Buffer I/O error")
    io_errors=$(_hw_count_lines "$matches")

    if [ "$io_errors" -gt 0 ]; then
        recent_io_errors=$(tail -n 10 <<< "$matches" | sed 's/^/ /')
        add_finding "WARNING" "Disk I/O Errors Detected" \
"Total I/O errors in dmesg: $io_errors

Recent I/O errors (last 10):
$recent_io_errors" \
"Disk I/O errors detected - indicates hardware or connection issues:
 • Check SMART status (see above)
 • Review dmesg: dmesg | grep -i 'i/o error'
 • Check cables and connections (if physical server)
 • Check for disk controller issues
 • May indicate failing disk or controller"
    fi
}

# Function to check filesystem errors
# Scans the system log once for ext4/xfs/filesystem errors. Severity is
# WARNING, raised to CRITICAL when a read-only remount was logged; the title
# icon follows the severity.
# Globals: MESSAGES_CACHE (read, optional), FINDINGS (written via add_finding)
check_filesystem_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking filesystem errors..."

    local log_file matches fs_errors
    log_file=$(_hw_log_file)
    matches=$(grep -iE "ext4-fs error|xfs.*error|filesystem.*error|remounted.*read-only" "$log_file" 2>/dev/null)
    fs_errors=$(_hw_count_lines "$matches")

    if [ "$fs_errors" -gt 0 ]; then
        local recent_fs_errors ro_remounts
        local severity="WARNING"
        local icon="🟡"
        recent_fs_errors=$(tail -n 5 <<< "$matches" | sed 's/^/ /')

        # Check for read-only remounts (critical)
        ro_remounts=$(grep -ci "remounted.*read-only" <<< "$matches")
        if [ "$ro_remounts" -gt 0 ]; then
            severity="CRITICAL"
            icon="🔴"
        fi

        add_finding "$severity" "$icon Filesystem Errors Detected" \
"Total filesystem errors in logs: $fs_errors
Read-only remounts: $ro_remounts

Recent filesystem errors (last 5):
$recent_fs_errors" \
"Filesystem errors detected - may indicate disk corruption:
 • Check filesystem: fsck (requires unmounting or single-user mode)
 • Review all errors: grep -i 'filesystem.*error' /var/log/messages
 • Check disk SMART status above
 • If read-only remount: System is protecting data - investigate immediately
 • May need to boot rescue mode to repair
 • Ensure backups are current before repair attempts"
    fi
}
# Print the log file the checks below should scan: the per-run copy named by
# $MESSAGES_CACHE when that file exists, otherwise /var/log/messages.
_hw_log_file() {
    if [ -n "${MESSAGES_CACHE:-}" ] && [ -f "$MESSAGES_CACHE" ]; then
        printf '%s\n' "$MESSAGES_CACHE"
    else
        printf '%s\n' "/var/log/messages"
    fi
}

# Print how many lines the string $1 holds (0 when it is empty).
_hw_count_lines() {
    if [ -z "$1" ]; then
        echo 0
    else
        grep -c '' <<< "$1"
    fi
}

# Print the fan lines read from stdin whose reading is in state $1:
#   failed - the reading contains FAULT or is exactly 0 RPM
#   slow   - the reading is 1..799 RPM
# The reading is the text after the "label:" prefix, so labels with spaces
# ("CPU Fan:") and limits such as "(min = 0 RPM)" are not misread.
_hw_fans_in_state() {
    awk -v want="$1" '
        {
            reading = $0
            sub(/^[^:]*:[ \t]*/, "", reading)
            state = "ok"
            if (reading ~ /FAULT/ || reading ~ /^0+ RPM/) {
                state = "failed"
            } else if (reading ~ /^[0-9]+ RPM/ && reading + 0 < 800) {
                state = "slow"
            }
            if (state == want) print
        }'
}

# Sum the second field of every line of the ethtool statistics text $1 that
# matches the case-insensitive ERE $2; prints 0 when nothing matches.
_hw_sum_stat() {
    grep -iE -- "$2" <<< "$1" | awk '{ sum += $2 } END { printf "%.0f\n", sum }'
}

# Function to check system fans
# Classifies every fan line reported by `sensors` and records one finding:
# CRITICAL for a stopped/faulted fan, WARNING for a fan below 800 RPM,
# INFO otherwise. Skipped on virtual machines or when no fan data exists.
check_system_fans() {
    # Skip fan checks on virtual machines (hypervisor controls physical fans)
    if [ "$IS_VIRTUAL" = true ]; then
        echo -e "${CYAN}[INFO]${NC} Skipping fan checks (virtual machine - $VIRT_TYPE)"
        return 0
    fi

    echo -e "${CYAN}[INFO]${NC} Checking system fan status..."

    if ! command_exists sensors; then
        return 0 # Silently skip if sensors not installed
    fi

    local fan_data
    fan_data=$(sensors 2>/dev/null | grep -i "fan")
    if [ -z "$fan_data" ]; then
        return 0 # No fan data available
    fi

    local all_fans failed_fan_list slow_fan_list failed_fans slow_fans
    all_fans=$(sed 's/^/ /' <<< "$fan_data")
    failed_fan_list=$(_hw_fans_in_state failed <<< "$fan_data" | sed 's/^/ /')
    slow_fan_list=$(_hw_fans_in_state slow <<< "$fan_data" | sed 's/^/ /')
    failed_fans=$(_hw_count_lines "$failed_fan_list")
    slow_fans=$(_hw_count_lines "$slow_fan_list")

    if [ "$failed_fans" -gt 0 ]; then
        add_finding "CRITICAL" "🔴 FAILED FAN(S) DETECTED" \
"Failed fans: $failed_fans

Failed fan details:
$failed_fan_list

All fan data:
$all_fans" \
"🚨 CRITICAL - FAN FAILURE DETECTED:
 • Failed fans detected - system may overheat!
 • Check all fan data: sensors
 • Physical inspection required
 • Replace failed fan immediately
 • Monitor CPU/system temperatures closely
 • May need emergency shutdown if temps rise above 90°C"
    elif [ "$slow_fans" -gt 0 ]; then
        add_finding "WARNING" "🟡 Slow Fan(s) Detected" \
"Slow fans (< 800 RPM): $slow_fans

Slow fan details:
$slow_fan_list

All fan data:
$all_fans" \
"⚠️ WARNING - FANS RUNNING SLOW:
 • Fans running slower than normal
 • May indicate fan wear or BIOS power settings
 • Monitor temperatures closely
 • Consider fan replacement if temperatures rise
 • Check BIOS fan control settings"
    else
        add_finding "INFO" "✅ System Fans: Normal Operation" \
"All fans operating normally:
$all_fans" \
            "All system fans operating within normal parameters"
    fi
}

# Function to check network interface errors
# Sums drop/error/CRC counters from `ethtool -S` over all non-virtual
# interfaces and records one finding. Severity follows the drop and CRC
# totals (>1000/>100 WARNING, >10000/>1000 CRITICAL); RX/TX error totals are
# reported but do not influence severity.
check_network_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking network interface errors..."

    if ! command_exists ethtool; then
        return 0 # Silently skip if ethtool not installed
    fi

    # Interface names without the "@peer" suffix that `ip` appends; loopback,
    # docker, veth and br-* interfaces are excluded.
    local interfaces
    interfaces=$(ip -o link show 2>/dev/null \
        | awk -F': ' '{ sub(/@.*/, "", $2); print $2 }' \
        | grep -vE '^(lo$|docker|veth|br-)')
    if [ -z "$interfaces" ]; then
        return 0 # No interfaces found
    fi

    local total_rx_dropped=0 total_tx_dropped=0
    local total_rx_errors=0 total_tx_errors=0 total_crc_errors=0
    local problem_interfaces="" has_issues=false
    local interface stats rx_dropped tx_dropped rx_errors tx_errors crc_errors

    while IFS= read -r interface; do
        stats=$(ethtool -S "$interface" 2>/dev/null)
        [ -n "$stats" ] || continue

        # Different NICs use different counter names; every helper call
        # yields a number (0 when the counter is absent), so the arithmetic
        # below cannot fail on drivers that omit a counter.
        rx_dropped=$(_hw_sum_stat "$stats" "rx.*drop|rx_discards")
        tx_dropped=$(_hw_sum_stat "$stats" "tx.*drop|tx_discards")
        rx_errors=$(_hw_sum_stat "$stats" "^[[:space:]]*rx_errors")
        tx_errors=$(_hw_sum_stat "$stats" "^[[:space:]]*tx_errors")
        crc_errors=$(_hw_sum_stat "$stats" "crc.*error|rx_crc")

        total_rx_dropped=$((total_rx_dropped + rx_dropped))
        total_tx_dropped=$((total_tx_dropped + tx_dropped))
        total_rx_errors=$((total_rx_errors + rx_errors))
        total_tx_errors=$((total_tx_errors + tx_errors))
        total_crc_errors=$((total_crc_errors + crc_errors))

        if [ "$rx_dropped" -gt 1000 ] || [ "$tx_dropped" -gt 1000 ] || [ "$crc_errors" -gt 100 ]; then
            has_issues=true
            problem_interfaces+=" $interface:
   RX dropped: $rx_dropped
   TX dropped: $tx_dropped
   CRC errors: $crc_errors
"
        fi
    done <<< "$interfaces"

    local severity="INFO"
    if [ "$total_rx_dropped" -gt 10000 ] || [ "$total_tx_dropped" -gt 10000 ] || [ "$total_crc_errors" -gt 1000 ]; then
        severity="CRITICAL"
    elif [ "$total_rx_dropped" -gt 1000 ] || [ "$total_tx_dropped" -gt 1000 ] || [ "$total_crc_errors" -gt 100 ]; then
        severity="WARNING"
    fi

    if [ "$has_issues" = true ] || [ "$severity" != "INFO" ]; then
        local icon="🔴"
        if [ "$severity" = "WARNING" ]; then
            icon="🟡"
        fi
        add_finding "$severity" "$icon Network Interface Errors Detected" \
"Total across all interfaces:
 • RX packets dropped: $total_rx_dropped
 • TX packets dropped: $total_tx_dropped
 • RX errors: $total_rx_errors
 • TX errors: $total_tx_errors
 • CRC errors: $total_crc_errors

Problem interfaces:
$problem_interfaces" \
"Network errors detected - may indicate hardware or driver issues:
 • Check interface: ethtool eth0
 • Check dmesg: dmesg | grep -i 'eth\|network'
 • High drops may indicate:
   - Network card failure
   - Driver issues
   - Switch/cable problems
   - Bandwidth saturation
 • CRC errors indicate:
   - Bad cable
   - EMI interference
   - Faulty NIC
 • If persistent: Replace network cable first, then NIC if needed"
    else
        add_finding "INFO" "✅ Network Interfaces: Healthy" \
"All network interfaces operating normally
Total interfaces checked: $(_hw_count_lines "$interfaces")
No significant packet drops or errors detected" \
            "Network hardware is functioning properly"
    fi
}

# Function to check PCI/PCIe errors
# Counts PCI/PCIe/AER error lines in dmesg and in the system log and records
# one finding when any exist. Severity is CRITICAL when any matched line
# reports an uncorrectable/uncorrected error or when more than 50 lines
# matched, otherwise WARNING.
check_pci_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking PCI/PCIe errors..."

    local dmesg_matches log_matches pci_errors log_pci_errors total_pci_errors
    dmesg_matches=$(dmesg 2>/dev/null \
        | grep -iE "pci.*error|pcie.*error|aer.*error|correctable.*error.*pci|uncorrectable.*error.*pci")
    log_matches=$(grep -iE "pci.*error|pcie.*error|aer.*error" "$(_hw_log_file)" 2>/dev/null)

    pci_errors=$(_hw_count_lines "$dmesg_matches")
    log_pci_errors=$(_hw_count_lines "$log_matches")
    total_pci_errors=$((pci_errors + log_pci_errors))

    if [ "$total_pci_errors" -gt 0 ]; then
        local dmesg_samples="" log_samples=""
        [ -n "$dmesg_matches" ] && dmesg_samples=$(tail -n 5 <<< "$dmesg_matches" | sed 's/^/ /')
        [ -n "$log_matches" ] && log_samples=$(tail -n 3 <<< "$log_matches" | sed 's/^/ /')

        # Look at every matched line, not only the samples shown, and accept
        # both the "uncorrectable" and "Uncorrected" spellings.
        local uncorrectable=0 uncorrectable_label="No"
        if grep -qiE "uncorrect(able|ed)" <<< "$dmesg_matches"$'\n'"$log_matches"; then
            uncorrectable=1
            uncorrectable_label="YES (CRITICAL!)"
        fi

        local severity="WARNING" icon="🟡"
        if [ "$uncorrectable" -eq 1 ] || [ "$total_pci_errors" -gt 50 ]; then
            severity="CRITICAL"
            icon="🔴"
        fi

        add_finding "$severity" "$icon PCI/PCIe Errors Detected" \
"Total PCI errors: $total_pci_errors
Uncorrectable errors: $uncorrectable_label

Recent errors from dmesg:
$dmesg_samples
${log_samples:+
Recent errors from /var/log/messages:
$log_samples}" \
"PCI/PCIe errors detected - may indicate hardware problems:
 • Uncorrectable errors = serious hardware issue
 • Correctable errors = potential signal integrity problems
 • Check details: dmesg | grep -i 'pci.*error'
 • Check PCIe link status: lspci -vv | grep -A 5 'LnkSta'
 • May indicate:
   - Faulty PCIe device (network card, RAID controller, etc.)
   - Motherboard issues
   - Power supply problems
   - Improper card seating
 • If persistent: Reseat cards, check for firmware updates
 • If uncorrectable: Replace failing hardware immediately"
    fi
}

# Function to check kernel parameters
# Reviews vm.swappiness, vm.dirty_ratio, Transparent Huge Pages and (on
# physical servers) the I/O scheduler of every disk. Records a WARNING
# finding when any warning line was produced, otherwise an INFO finding
# listing the current values.
check_kernel_parameters() {
    echo -e "${CYAN}[INFO]${NC} Checking kernel parameters..."

    local warnings="" info=""
    local swappiness dirty_ratio thp_enabled

    if command_exists sysctl; then
        # Check vm.swappiness (should be 1-10 for servers)
        swappiness=$(sysctl -n vm.swappiness 2>/dev/null)
        if [[ "$swappiness" =~ ^[0-9]+$ ]]; then
            if [ "$swappiness" -gt 60 ]; then
                warnings+=" • vm.swappiness=$swappiness (HIGH - should be 1-10 for servers)"$'\n'
            elif [ "$swappiness" -gt 10 ]; then
                info+=" • vm.swappiness=$swappiness (consider lowering to 1-10 for better performance)"$'\n'
            else
                info+=" • vm.swappiness=$swappiness ✅"$'\n'
            fi
        fi

        # Check vm.dirty_ratio (should be 10-20)
        dirty_ratio=$(sysctl -n vm.dirty_ratio 2>/dev/null)
        if [[ "$dirty_ratio" =~ ^[0-9]+$ ]]; then
            if [ "$dirty_ratio" -gt 40 ]; then
                warnings+=" • vm.dirty_ratio=$dirty_ratio (HIGH - may cause stalls, recommended: 10-20)"$'\n'
            elif [ "$dirty_ratio" -lt 10 ]; then
                info+=" • vm.dirty_ratio=$dirty_ratio (low - may impact write performance)"$'\n'
            else
                info+=" • vm.dirty_ratio=$dirty_ratio ✅"$'\n'
            fi
        fi
    fi

    # Check Transparent Huge Pages (should be never or madvise for databases).
    # The active mode is the bracketed word; this read does not need sysctl.
    thp_enabled=$(sed -n 's/.*\[\([^]]*\)\].*/\1/p' /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null)
    if [ -n "$thp_enabled" ]; then
        if [ "$thp_enabled" = "always" ]; then
            warnings+=" • Transparent Huge Pages=always (can cause latency spikes for databases)
   Recommended: echo never > /sys/kernel/mm/transparent_hugepage/enabled"$'\n'
        else
            info+=" • Transparent Huge Pages=$thp_enabled ✅"$'\n'
        fi
    fi

    # Check I/O schedulers for each disk (only on physical servers)
    if [ "$IS_VIRTUAL" != true ]; then
        local disk scheduler rotational kind accepted suggestion
        while IFS= read -r disk; do
            [ -n "$disk" ] || continue
            scheduler=$(sed -n 's/.*\[\([^]]*\)\].*/\1/p' "/sys/block/$disk/queue/scheduler" 2>/dev/null)
            rotational=$(cat "/sys/block/$disk/queue/rotational" 2>/dev/null)
            if [ -z "$scheduler" ] || [ -z "$rotational" ]; then
                continue
            fi

            # Accepted schedulers and the suggested one per disk type.
            if [[ "$disk" == nvme* ]]; then
                kind="NVMe"; accepted="none"; suggestion="none"
            elif [ "$rotational" = "0" ]; then
                kind="SSD"; accepted="mq-deadline none deadline"; suggestion="mq-deadline"
            else
                kind="HDD"; accepted="mq-deadline deadline"; suggestion="mq-deadline"
            fi

            if [[ " $accepted " == *" $scheduler "* ]]; then
                info+=" • /dev/$disk ($kind): scheduler=$scheduler ✅"$'\n'
            else
                info+=" • /dev/$disk ($kind): scheduler=$scheduler (consider '$suggestion' for $kind)"$'\n'
            fi
        done < <(lsblk -nd -o NAME,TYPE 2>/dev/null | awk '$2=="disk" {print $1}')
    fi

    # Generate finding based on what we found
    if [ -n "$warnings" ]; then
        add_finding "WARNING" "⚠️ Kernel Parameters: Sub-Optimal Configuration" \
"Performance-impacting kernel parameters detected:
$warnings${info:+
Informational:
$info}" \
"Kernel parameters affect system performance and stability:
 • vm.swappiness: Controls swap usage (1-10 for servers)
   - Fix: sysctl -w vm.swappiness=10
   - Permanent: echo 'vm.swappiness=10' >> /etc/sysctl.conf
 • vm.dirty_ratio: Controls dirty page cache
   - Fix: sysctl -w vm.dirty_ratio=15
 • Transparent Huge Pages: Can cause latency for databases
   - Fix: echo never > /sys/kernel/mm/transparent_hugepage/enabled
 • I/O Scheduler: Affects disk performance
   - NVMe: echo none > /sys/block/nvme0n1/queue/scheduler
   - SSD: echo mq-deadline > /sys/block/sda/queue/scheduler"
    elif [ -n "$info" ]; then
        add_finding "INFO" "ℹ️ Kernel Parameters: Configuration Status" \
"Current kernel parameters:
$info" \
            "Kernel parameters are within acceptable ranges. Minor optimizations may be possible."
    fi
}
"$warning_count" -gt 0 ]; then report_content+=" 🟡 WARNING: $warning_count issue(s) - Review and plan action"$'\n' fi report_content+=" ℹ️ INFO: $info_count item(s) - Status information"$'\n' report_content+=""$'\n' # If critical issues, list them prominently at the top if [ "$critical_count" -gt 0 ]; then report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n' report_content+="║ 🚨 CRITICAL ISSUES REQUIRING IMMEDIATE ATTENTION ║"$'\n' report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n' report_content+=""$'\n' local critical_num=1 for finding in "${FINDINGS[@]}"; do local severity=$(echo "$finding" | sed -n 's/^\[\([^]]*\)\].*/\1/p') if [ "$severity" = "CRITICAL" ]; then local title=$(echo "$finding" | sed 's/^\[[^]]*\] //' | sed 's/@@@SEP@@@.*//') report_content+=" $critical_num. $title"$'\n' critical_num=$((critical_num + 1)) fi done report_content+=""$'\n' report_content+=" ⚠️ SEE DETAILED FINDINGS BELOW FOR SPECIFIC ACTIONS TO TAKE"$'\n' report_content+=""$'\n' fi report_content+="=============================================================================="$'\n' report_content+=""$'\n' # Group findings by category local -A categories categories["DISK"]="" categories["MEMORY"]="" categories["CPU"]="" categories["RAID"]="" categories["OTHER"]="" for finding in "${FINDINGS[@]}"; do # Split by @@@SEP@@@ delimiter local severity_title="${finding%%@@@SEP@@@*}" local temp="${finding#*@@@SEP@@@}" local details="${temp%%@@@SEP@@@*}" local recommendation="${temp#*@@@SEP@@@}" # Extract severity from [SEVERITY] Title format local severity=$(echo "$severity_title" | sed -n 's/^\[\([^]]*\)\].*/\1/p') local title=$(echo "$severity_title" | sed 's/^\[[^]]*\] //') local category="OTHER" if [[ "$title" == *"Disk"* ]] || [[ "$title" == *"SMART"* ]] || [[ "$title" == *"I/O"* ]]; then category="DISK" elif [[ "$title" == *"Memory"* ]] || [[ "$title" == *"ECC"* ]]; then 
category="MEMORY" elif [[ "$title" == *"CPU"* ]] || [[ "$title" == *"MCE"* ]]; then category="CPU" elif [[ "$title" == *"RAID"* ]]; then category="RAID" fi local entry="" entry+="[$severity] $title"$'\n' entry+="$details"$'\n' if [ -n "$recommendation" ]; then entry+="Recommendation:"$'\n' entry+="$recommendation"$'\n' fi entry+=""$'\n' entry+="------------------------------------------------------------------------------"$'\n' entry+=""$'\n' categories[$category]+="$entry" done # Output sections if [ -n "${categories[DISK]}" ]; then report_content+="=============================================================================="$'\n' report_content+="DISK HEALTH & SMART STATUS"$'\n' report_content+="=============================================================================="$'\n' report_content+=""$'\n' report_content+="${categories[DISK]}" fi if [ -n "${categories[MEMORY]}" ]; then report_content+="=============================================================================="$'\n' report_content+="MEMORY HEALTH"$'\n' report_content+="=============================================================================="$'\n' report_content+=""$'\n' report_content+="${categories[MEMORY]}" fi if [ -n "${categories[CPU]}" ]; then report_content+="=============================================================================="$'\n' report_content+="CPU HEALTH"$'\n' report_content+="=============================================================================="$'\n' report_content+=""$'\n' report_content+="${categories[CPU]}" fi if [ -n "${categories[RAID]}" ]; then report_content+="=============================================================================="$'\n' report_content+="RAID STATUS"$'\n' report_content+="=============================================================================="$'\n' report_content+=""$'\n' report_content+="${categories[RAID]}" fi if [ -n "${categories[OTHER]}" ]; then 
# Print the executive-summary component key for a finding title:
# disk, memory, cpu, raid, fs, fan, pci, network, kernel or "other".
# Patterns are case-sensitive and tried in this order; the upper-case FAN
# pattern covers the "FAILED FAN(S) DETECTED" title.
_hw_component_for_title() {
    case "$1" in
        *Disk* | *SMART* | *DRIVE*) echo "disk" ;;
        *Memory* | *ECC* | *RAM*) echo "memory" ;;
        *CPU* | *thermal* | *temperature*) echo "cpu" ;;
        *RAID*) echo "raid" ;;
        *Filesystem* | *read-only*) echo "fs" ;;
        *Fan* | *fan* | *FAN*) echo "fan" ;;
        *PCI* | *PCIe*) echo "pci" ;;
        *Network* | *Interface*) echo "network" ;;
        *Kernel* | *Parameter*) echo "kernel" ;;
        *) echo "other" ;;
    esac
}

# Main execution
# Runs every diagnostic in order, prints the report and an executive summary,
# then reports the overall severity as a status code:
#   0 = healthy, 1 = at least one WARNING, 2 = at least one CRITICAL.
# Uses `exit` when the script is executed directly and `return` when sourced.
# Globals: MESSAGES_CACHE (written), FINDINGS (read), REPORT_FILE (read)
main() {
    show_banner

    echo -e "${MAGENTA}${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${MAGENTA}${BOLD}║ 🔧 HARDWARE HEALTH CHECK - Deep Analysis ║${NC}"
    echo -e "${MAGENTA}${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    # Detect virtualization FIRST (affects which checks to run)
    echo -e "${CYAN}[INFO]${NC} Detecting environment (physical vs virtual)..."
    detect_virtualization
    echo ""

    echo -e "${CYAN}Performing comprehensive hardware diagnostics...${NC}"
    echo -e "${CYAN}Checks: Disks (SMART/NVMe/Age), Memory (ECC), CPU (Thermal), RAID, Filesystem, Fans, PCI, Network, Kernel${NC}"
    echo ""

    # Cache /var/log/messages once so the checks do not re-read the live log.
    # A file (not a variable) avoids "Argument list too long" errors; mktemp
    # gives it an unpredictable, private name.
    echo -e "${CYAN}[INFO]${NC} Caching system logs for analysis..."
    MESSAGES_CACHE=$(mktemp "${TMPDIR:-/tmp}/hw_health_messages_cache.XXXXXX" 2>/dev/null) \
        || MESSAGES_CACHE="/tmp/hw_health_messages_cache_$$.tmp"
    if [ -f /var/log/messages ]; then
        cat /var/log/messages > "$MESSAGES_CACHE" 2>/dev/null
    else
        : > "$MESSAGES_CACHE"
    fi

    # Cleanup cache on exit; single quotes defer expansion so the path is
    # quoted when the trap actually runs.
    trap 'rm -f -- "$MESSAGES_CACHE"' EXIT

    # Run diagnostics with progress indicators ("function|message" pairs)
    local -a steps=(
        "check_disk_smart|Analyzing disk SMART status and predictive failure indicators..."
        "check_memory_health|Checking memory health (ECC errors, OOM events, swap usage)..."
        "check_cpu_health|Monitoring CPU health (temperature, throttling, MCE errors)..."
        "check_hardware_errors|Scanning system hardware error logs..."
        "check_raid_status|Verifying RAID array status..."
        "check_disk_io_errors|Analyzing disk I/O errors..."
        "check_filesystem_errors|Checking for filesystem errors..."
        "check_system_fans|Monitoring system fans..."
        "check_pci_errors|Checking for PCI/PCIe errors..."
        "check_network_errors|Checking network interface errors..."
        "check_kernel_parameters|Validating kernel parameters..."
    )
    local step step_no=0
    for step in "${steps[@]}"; do
        step_no=$((step_no + 1))
        echo -e "${YELLOW}[$step_no/${#steps[@]}]${NC} ${step#*|}"
        "${step%%|*}"
    done

    echo ""
    echo -e "${GREEN}[✓]${NC} Hardware diagnostics complete!"
    echo ""

    # Generate and display report
    echo -e "${CYAN}Generating detailed report...${NC}"
    echo ""
    generate_report

    # EXECUTIVE SUMMARY - Quick status overview
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║ EXECUTIVE SUMMARY - Component Status ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    local -A status=(
        [disk]="✅" [memory]="✅" [cpu]="✅" [raid]="✅" [fs]="✅"
        [fan]="✅" [pci]="✅" [network]="✅" [kernel]="✅"
    )
    local overall="HEALTHY"
    local critical_count=0 warning_count=0
    local finding header severity title icon component

    for finding in "${FINDINGS[@]}"; do
        # Header is "[SEVERITY] Title": the text before the first separator,
        # first line only so multi-line details can never be misparsed.
        header="${finding%%@@@SEP@@@*}"
        header="${header%%$'\n'*}"
        [[ "$header" == \[*\]* ]] || continue
        severity="${header#\[}"
        severity="${severity%%\]*}"
        title="${header#*\] }"

        # Every CRITICAL/WARNING finding raises the overall status, whether
        # or not its title maps to one of the displayed components.
        case "$severity" in
            CRITICAL)
                critical_count=$((critical_count + 1))
                overall="CRITICAL"
                icon="🔴"
                ;;
            WARNING)
                warning_count=$((warning_count + 1))
                if [ "$overall" = "HEALTHY" ]; then
                    overall="WARNING"
                fi
                icon="🟡"
                ;;
            *)
                continue
                ;;
        esac

        component=$(_hw_component_for_title "$title")
        if [ "$component" = "other" ]; then
            continue
        fi
        # A component already marked critical is never downgraded.
        if [ "$icon" = "🔴" ] || [ "${status[$component]}" != "🔴" ]; then
            status[$component]="$icon"
        fi
    done

    # Display component summary
    echo -e " Disks/Storage: ${status[disk]} Memory: ${status[memory]} CPU: ${status[cpu]} RAID: ${status[raid]}"
    echo -e " Filesystem: ${status[fs]} Fans: ${status[fan]} PCI/PCIe: ${status[pci]}"
    echo -e " Network: ${status[network]} Kernel: ${status[kernel]}"
    echo ""

    # Overall status
    if [ "$overall" = "CRITICAL" ]; then
        echo -e " ${RED}${BOLD}Overall Status: 🔴 CRITICAL - $critical_count issue(s) require IMMEDIATE action!${NC}"
    elif [ "$overall" = "WARNING" ]; then
        echo -e " ${YELLOW}${BOLD}Overall Status: 🟡 WARNING - $warning_count issue(s) detected${NC}"
    else
        echo -e " ${GREEN}${BOLD}Overall Status: ✅ HEALTHY - All systems operating normally${NC}"
    fi

    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""
    echo -e "${CYAN}Full report saved to:${NC} ${BOLD}$REPORT_FILE${NC}"
    echo ""

    press_enter

    # Severity-based status for monitoring system integration.
    local status_code=0
    case "$overall" in
        CRITICAL) status_code=2 ;;
        WARNING) status_code=1 ;;
    esac

    if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
        # Script is being run directly: terminate with the status code.
        exit "$status_code"
    fi
    # Script was sourced (called from launcher): hand the status back without
    # exiting the parent shell.
    return "$status_code"
}

# Run the health check. main is called both when the script is executed
# directly and when it is sourced; main itself chooses between `exit` and
# `return` for the status code.
main