443e246bf0
Problem:
When run from the launcher menu, the hardware health check script
would exit the entire toolkit after completion instead of returning
to the menu. This was frustrating for users who wanted to run multiple
operations.
Root Cause:
The script used `exit 0/1/2` at the end to provide severity-based exit
codes for monitoring system integration. However, this caused the script
to terminate the parent shell when sourced by the launcher.
Solution:
Detect execution context and use appropriate behavior:
1. Standalone Execution (./hardware-health-check.sh):
- Use `exit` codes (0, 1, 2) for monitoring integration
- Script terminates as expected for cron/monitoring tools
2. Sourced Execution (called from launcher):
- Use `return` codes (0, 1, 2) instead of exit
- Returns control to launcher menu
- Exit codes still available via $? if launcher wants to check
Detection Method:
if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
# Script run directly → use exit
else
# Script sourced by launcher → use return
fi
Changes to modules/performance/hardware-health-check.sh:
- Lines 1840-1854: Added execution context detection
- Standalone: exit 0/1/2 (monitoring integration)
- Sourced: return 0/1/2 (back to menu)
- Lines 1857-1863: Only auto-run main if executed directly
Benefits:
✅ Returns to menu when run from launcher
✅ Still provides exit codes for monitoring tools
✅ Best of both worlds - works in all contexts
✅ No breaking changes to monitoring integration
Testing:
- Standalone: ./hardware-health-check.sh → exits with code
- From launcher: Returns to menu ✅
User Report: "when the script exists it is not built into taking back
to the menu. it just runs and exits everything once its done"
Status: ✅ FIXED - Now returns to menu properly
1864 lines
78 KiB
Bash
Executable File
1864 lines
78 KiB
Bash
Executable File
#!/bin/bash

# Hardware Health Check
# Comprehensive hardware diagnostics including SMART, memory, CPU, and sensors
#
# Top-level setup: resolves the toolkit paths, loads the shared libraries,
# runs system detection, and defines the globals (colors, report path,
# FINDINGS array) used by the check functions below.

# Get the script's directory (BASH_SOURCE is used so the path is that of this
# file even when the script is sourced rather than executed)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TOOLKIT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Source required libraries
source "$TOOLKIT_ROOT/lib/common-functions.sh"
source "$TOOLKIT_ROOT/lib/system-detect.sh"
source "$TOOLKIT_ROOT/lib/reference-db.sh"

# Initialize system detection
detect_system

# Load system info from reference database.
# Records are pipe-delimited; the grep patterns are basic regular
# expressions, so "|" matches a literal pipe character.
if [ -f "$TOOLKIT_ROOT/.sysref" ]; then
  SYS_HOSTNAME=$(grep "^SYS|HOSTNAME|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
  SYS_PANEL=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
  SYS_PANEL_VER=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4)
  SYS_OS=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
  SYS_OS_VER=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4)
fi

# Color definitions (ANSI escape sequences, rendered with `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Report file (timestamped so successive runs do not overwrite each other)
REPORT_FILE="/tmp/hardware_health_report_$(date +%Y%m%d_%H%M%S).txt"

# Analysis results storage (one "@@@SEP@@@"-delimited record per finding,
# appended by add_finding)
declare -a FINDINGS=()

#######################################
# Record one finding in the global FINDINGS array.
# The record is stored as a single string:
#   "[severity] title@@@SEP@@@details@@@SEP@@@recommendation"
# Globals:   FINDINGS (appended to)
# Arguments: $1 - severity label (required, non-empty)
#            $2 - title (required, non-empty)
#            $3 - details text (optional, may be multi-line)
#            $4 - recommendation text (optional, may be multi-line)
# Returns:   1 without recording anything when $1 or $2 is empty; 0 otherwise
#######################################
add_finding() {
  if [[ -z "$1" || -z "$2" ]]; then
    return 1
  fi

  local severity="$1"
  local title="$2"
  local details="${3:-}"
  local recommendation="${4:-}"

  # Use @@@SEP@@@ as separator to avoid conflicts with content
  FINDINGS+=("[$severity] $title@@@SEP@@@$details@@@SEP@@@$recommendation")
}

#######################################
# Check whether a command name resolves in the current shell.
# Arguments: $1 - command name
# Returns:   0 if `command -v` finds it; non-zero if it does not or if $1 is
#            empty
#######################################
command_exists() {
  [[ -n "$1" ]] || return 1
  command -v "$1" >/dev/null 2>&1
}

# Global variables for virtualization detection.
# Defaults describe a physical host; detect_virtualization overwrites both.
IS_VIRTUAL=false
VIRT_TYPE="physical"

#######################################
# Detect whether the host is a virtual machine and record an INFO finding
# describing which hardware checks apply.
# Detection uses systemd-detect-virt when that command exists; dmidecode's
# system product name is consulted only when systemd-detect-virt is absent.
# Globals:   IS_VIRTUAL (written: true/false)
#            VIRT_TYPE  (written: detected type, or "physical")
# Calls:     command_exists, add_finding
#######################################
detect_virtualization() {
  IS_VIRTUAL=false
  VIRT_TYPE="physical"

  local detected product

  # Try systemd-detect-virt first (most reliable)
  if command_exists systemd-detect-virt; then
    # Declared separately from the assignment so `local` does not mask the
    # command; the tool's exit status is intentionally ignored and only its
    # output is inspected.
    detected=$(systemd-detect-virt 2>/dev/null)
    if [ -n "$detected" ] && [ "$detected" != "none" ]; then
      IS_VIRTUAL=true
      VIRT_TYPE="$detected"
    fi
  # Fallback: check dmidecode
  elif command_exists dmidecode; then
    product=$(dmidecode -s system-product-name 2>/dev/null)
    # First hypervisor keyword found in the product name, in its original case
    detected=$(grep -oiE "kvm|qemu|vmware|virtualbox|xen|hyperv" <<< "$product" | head -1)
    if [ -n "$detected" ]; then
      IS_VIRTUAL=true
      VIRT_TYPE="$detected"
    fi
  fi

  # Add finding describing the environment
  if [ "$IS_VIRTUAL" = true ]; then
    add_finding "INFO" "ℹ️ Virtual Machine Detected" \
"Environment: $VIRT_TYPE
Hardware checks adapted for virtual machine:
• SMART disk checks: SKIPPED (VMs use virtual disks)
• Fan monitoring: SKIPPED (hypervisor controls physical fans)
• Some sensors: SKIPPED (not accessible in VM)
• Memory/CPU/Network checks: ACTIVE (VM-compatible)" \
"This is normal for virtual machines. Hardware monitoring is limited to VM-accessible components."
  else
    add_finding "INFO" "ℹ️ Physical Server Detected" \
"Environment: Physical hardware
All hardware health checks will be performed:
• SMART disk monitoring
• Fan speed monitoring
• Temperature sensors
• Memory ECC errors
• CPU thermal monitoring
• Network interface errors
• Kernel parameters" \
"Full hardware monitoring enabled for physical server."
  fi
}

#######################################
# Print one field value from `smartctl -i` output.
# Arguments: $1 - smartctl -i output
#            $2 - extended regex matching the field label(s), e.g. "Vendor:"
# Outputs:   value of the first matching line (text after the first colon,
#            surrounding whitespace trimmed); nothing if no line matches
#######################################
_smart_info_field() {
  grep -iE -- "$2" <<< "$1" | head -1 \
    | sed -E 's/^[^:]*:[[:space:]]*//; s/[[:space:]]+$//'
}

#######################################
# Print the leading integer of the RAW_VALUE column (field 10) of the first
# line of `smartctl -A` output that contains the given attribute name.
# Taking only the first match and only leading digits keeps the result safe
# for `[ -gt ]` even when the raw value looks like "12345h+30m".
# Arguments: $1 - smartctl -A output
#            $2 - attribute name (literal substring)
# Outputs:   the integer, or nothing when absent / not numeric
#######################################
_smart_attr_raw() {
  local raw
  raw=$(awk -v name="$2" 'index($0, name) { print $10; exit }' <<< "$1")
  printf '%s' "${raw%%[!0-9]*}"
}

#######################################
# Print the digits of the value on the line whose label (text before the
# first colon) equals $2 exactly. Exact matching keeps "Available Spare"
# from also matching "Available Spare Threshold".
# Arguments: $1 - smartctl -A output, $2 - exact label
# Outputs:   digits only (e.g. "95" for "95%"), or nothing when absent
#######################################
_nvme_log_percent() {
  awk -F: -v label="$2" \
    '$1 == label { gsub(/[^0-9]/, "", $2); print $2; exit }' <<< "$1"
}

#######################################
# Raise the caller's local `risk_level` to $1 when it is currently lower.
# Never lowers it, so a later, milder check cannot downgrade an earlier,
# more severe one. Relies on bash dynamic scoping: `risk_level` is the local
# declared in check_disk_smart.
# Arguments: $1 - candidate level (1=MODERATE, 2=HIGH, 3=IMMINENT)
#######################################
_raise_disk_risk() {
  if (( risk_level < $1 )); then
    risk_level=$1
  fi
}

#######################################
# Check SMART status of every disk reported by lsblk and record findings.
#
# Per disk: skips devices smartctl cannot query, hardware RAID logical
# volumes, virtual disks, md/dm devices and loop/ram/nbd devices; otherwise
# reads SMART health and attributes, counts matching lines in
# /var/log/messages, derives a failure-risk level and records one finding.
# A summary finding is recorded at the end.
#
# Risk level is initialised once per disk BEFORE any check runs and is only
# ever raised, so disk-age and NVMe wear/spare escalations are kept and do
# not leak from one disk to the next.
#
# Globals:   IS_VIRTUAL, VIRT_TYPE, CYAN, NC (read)
# Calls:     command_exists, add_finding, smartctl, lsblk
# Outputs:   progress messages on stdout
# Returns:   0 when skipped on a VM; otherwise the status of the last command
#######################################
check_disk_smart() {
  # Skip SMART checks on virtual machines (VMs use virtual disks)
  if [ "$IS_VIRTUAL" = true ]; then
    echo -e "${CYAN}[INFO]${NC} Skipping SMART checks (virtual machine - $VIRT_TYPE)"
    return 0
  fi

  echo -e "${CYAN}[INFO]${NC} Checking disk SMART status..."

  if ! command_exists smartctl; then
    add_finding "INFO" "SMART Tools Not Installed" \
"smartmontools is not installed - cannot check disk health" \
"Install SMART tools: yum install smartmontools
After installing, run: systemctl enable smartd && systemctl start smartd"
    return
  fi

  # Find all disks
  local disks
  disks=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1}')

  if [ -z "$disks" ]; then
    add_finding "WARNING" "No Disks Found" \
"Could not detect any disk devices" \
"Check system configuration: lsblk -a"
    return
  fi

  local disk_count=0
  local healthy_count=0
  local warning_count=0
  local failed_count=0
  local skipped_count=0
  local skipped_raid=0
  local skipped_virtual=0
  local skipped_lvm=0
  local skipped_other=0

  # Per-disk working variables (re-assigned on every iteration)
  local disk device_info model vendor device_type full_id
  local raid_type tools virt_type
  local health report_model serial smart_data
  local reallocated pending uncorrectable temp power_on
  local disk_age_years age_warning nvme_wear nvme_spare percent_used avail_spare
  local disk_name io_errors recent_io_samples
  local risk_level risk_factors

  # Word splitting is intentional: $disks is a newline-separated list of
  # /dev/<name> paths produced above.
  for disk in $disks; do
    disk_count=$((disk_count + 1))

    # Get device info to determine if SMART is applicable
    device_info=$(smartctl -i "$disk" 2>&1)

    # COMPREHENSIVE DEVICE DETECTION - Works for ALL storage types

    # 1. CHECK: Device exists and smartctl can communicate
    if grep -qiE "No such device|open device.*failed|Permission denied" <<< "$device_info"; then
      echo -e "${CYAN}[INFO]${NC} Skipping $disk (device not accessible)"
      skipped_count=$((skipped_count + 1))
      skipped_other=$((skipped_other + 1))
      continue
    fi

    # 2. CHECK: SMART support availability
    if grep -qiE "SMART support is:.*Unavailable|SMART support is:.*Disabled|Unable to detect device type" <<< "$device_info"; then
      echo -e "${CYAN}[INFO]${NC} Skipping $disk (SMART not supported or disabled)"
      skipped_count=$((skipped_count + 1))
      skipped_other=$((skipped_other + 1))
      continue
    fi

    # 3. EXTRACT: Device type, model, vendor for intelligent detection
    model=$(_smart_info_field "$device_info" "Device Model:|Product:|Model Number:|Model Family:")
    vendor=$(_smart_info_field "$device_info" "Vendor:")
    device_type=$(_smart_info_field "$device_info" "Device type:")
    serial=$(_smart_info_field "$device_info" "Serial Number:")

    # Combine model and vendor for comprehensive matching
    full_id="${vendor} ${model} ${device_type}"

    # 4. DETECT: Hardware RAID Controllers (all major brands)
    # These devices are RAID controller logical volumes, not physical disks
    if grep -qiE "MR[0-9]{4}|LSI|MegaRAID|PERC|Avago|Broadcom.*RAID|HPE.*Smart Array|HP.*Smart Array|Dell.*RAID|IBM.*ServeRAID|Adaptec|3ware|Areca|HighPoint|Promise.*RAID" <<< "$full_id"; then
      raid_type="Hardware RAID Controller"
      tools="Unknown RAID tools"

      # Identify specific RAID type and provide exact tools
      if grep -qiE "MR[0-9]{4}|MegaRAID" <<< "$full_id"; then
        raid_type="MegaRAID Controller"
        tools="megacli -LDInfo -Lall -aALL or storcli /c0 /vall show all"
      elif grep -qiE "LSI|Avago|Broadcom" <<< "$full_id"; then
        raid_type="LSI/Broadcom RAID Controller"
        tools="sas2ircu LIST or storcli show"
      elif grep -qiE "PERC|Dell" <<< "$full_id"; then
        raid_type="Dell PERC RAID Controller"
        tools="perccli /c0 /vall show all or omreport storage vdisk"
      elif grep -qiE "HP|HPE.*Smart" <<< "$full_id"; then
        raid_type="HP Smart Array Controller"
        tools="hpacucli ctrl all show config or ssacli ctrl all show config"
      elif grep -qiE "Adaptec" <<< "$full_id"; then
        raid_type="Adaptec RAID Controller"
        tools="arcconf getconfig 1"
      elif grep -qiE "3ware" <<< "$full_id"; then
        raid_type="3ware RAID Controller"
        tools="tw_cli info c0"
      fi

      echo -e "${CYAN}[INFO]${NC} Skipping $disk ($raid_type: $model)"
      skipped_count=$((skipped_count + 1))
      skipped_raid=$((skipped_raid + 1))
      add_finding "INFO" "ℹ️ $raid_type Detected: $disk" \
"Device: $disk
Controller: $model
Type: $raid_type
SMART Status: Not applicable (logical volume from RAID controller)

This is a logical volume presented by a hardware RAID controller.
SMART data is not available for these devices - the controller manages
the physical disks and presents them as a single logical volume.

To monitor RAID health, use controller-specific tools:
Command: $tools

Physical disk health is monitored by the RAID controller itself.
Check controller logs and status for drive failures." \
"Monitor RAID array health using controller tools, not SMART"
      continue
    fi

    # 5. DETECT: Virtual/Emulated Devices (VMs and containers)
    if grep -qiE "QEMU|VirtIO|Virtual|VMware|Xen.*Virtual|Msft.*Virtual|Google.*Persistent|Amazon.*Elastic" <<< "$full_id"; then
      virt_type="Virtual Disk"

      if grep -qiE "QEMU" <<< "$full_id"; then
        virt_type="QEMU Virtual Disk (KVM)"
      elif grep -qiE "VMware" <<< "$full_id"; then
        virt_type="VMware Virtual Disk"
      elif grep -qiE "VirtIO" <<< "$full_id"; then
        virt_type="VirtIO Virtual Disk"
      elif grep -qiE "Msft.*Virtual" <<< "$full_id"; then
        virt_type="Hyper-V Virtual Disk"
      elif grep -qiE "Xen" <<< "$full_id"; then
        virt_type="Xen Virtual Disk"
      elif grep -qiE "Google" <<< "$full_id"; then
        virt_type="Google Persistent Disk"
      elif grep -qiE "Amazon" <<< "$full_id"; then
        virt_type="AWS EBS Volume"
      fi

      echo -e "${CYAN}[INFO]${NC} Skipping $disk ($virt_type)"
      skipped_count=$((skipped_count + 1))
      skipped_virtual=$((skipped_virtual + 1))
      # No finding here: only counted in the summary
      continue
    fi

    # 6. DETECT: Software RAID / LVM / Device Mapper
    if grep -qE "md[0-9]|dm-[0-9]" <<< "$disk"; then
      echo -e "${CYAN}[INFO]${NC} Skipping $disk (software RAID/LVM device)"
      skipped_count=$((skipped_count + 1))
      skipped_lvm=$((skipped_lvm + 1))
      add_finding "INFO" "ℹ️ Software RAID/LVM Detected: $disk" \
"Device: $disk
Type: Software RAID or LVM logical volume

This is a logical device managed by the kernel (mdadm or LVM).
SMART monitoring should be performed on the underlying physical disks.

For software RAID (md devices):
• Check RAID status: cat /proc/mdstat
• Monitor physical disks: smartctl -a /dev/sd[X]

For LVM (dm- devices):
• Check LV status: lvdisplay
• Monitor physical volumes: pvdisplay
• Check underlying disks: smartctl -a /dev/sd[X]" \
"Monitor underlying physical disks, not the logical volume"
      continue
    fi

    # 7. DETECT: Loop devices, RAM disks, other special devices
    if grep -qE "loop|ram|zram|nbd" <<< "$disk"; then
      echo -e "${CYAN}[INFO]${NC} Skipping $disk (special device: loop/ram/network)"
      skipped_count=$((skipped_count + 1))
      skipped_other=$((skipped_other + 1))
      continue
    fi

    # 8. FINAL CHECK: Is this a real disk with SMART data?
    # Try to get SMART attributes - if this fails, skip
    if ! smartctl -A "$disk" &>/dev/null; then
      echo -e "${CYAN}[INFO]${NC} Skipping $disk (no SMART attributes available)"
      skipped_count=$((skipped_count + 1))
      skipped_other=$((skipped_other + 1))
      add_finding "INFO" "ℹ️ Device Without SMART: $disk" \
"Device: $disk
Model: ${model:-Unknown}

This device does not provide SMART attributes.
Common reasons:
• USB-connected drives (SMART data not passed through)
• Some hardware RAID configurations
• Older drives without SMART support
• Passthrough issues in virtual environments

If this is a critical disk, verify health through other means:
• Check dmesg for errors: dmesg | grep -i '$disk'
• Monitor I/O errors: iostat -x $disk
• Check filesystem errors: mount | grep $disk" \
"Monitor through system logs and I/O statistics"
      continue
    fi

    # Get SMART health status
    health=$(smartctl -H "$disk" 2>/dev/null | grep -i "SMART overall-health" | awk '{print $NF}')

    # Model shown in the report: prefer the ATA "Device Model", then the SCSI
    # "Product", then the NVMe "Model Number" (reuses the -i output captured
    # above instead of invoking smartctl again)
    report_model=$(_smart_info_field "$device_info" "Device Model:")
    [ -z "$report_model" ] && report_model=$(_smart_info_field "$device_info" "Product:")
    [ -z "$report_model" ] && report_model=$(_smart_info_field "$device_info" "Model Number:")
    model="$report_model"

    # Get ALL SMART data at once (single call instead of multiple)
    smart_data=$(smartctl -A "$disk" 2>/dev/null)

    # Key SMART attributes (raw values, leading integer only)
    reallocated=$(_smart_attr_raw "$smart_data" "Reallocated_Sector")
    pending=$(_smart_attr_raw "$smart_data" "Current_Pending_Sector")
    uncorrectable=$(_smart_attr_raw "$smart_data" "Offline_Uncorrectable")
    temp=$(_smart_attr_raw "$smart_data" "Temperature_Celsius")
    power_on=$(_smart_attr_raw "$smart_data" "Power_On_Hours")

    # PREDICTIVE FAILURE ANALYSIS state - reset for every disk before any
    # check can escalate it. 0=NONE 1=MODERATE 2=HIGH 3=IMMINENT
    risk_level=0
    risk_factors=""

    # DISK AGE ANALYSIS
    disk_age_years=0
    age_warning=""
    if [ -n "$power_on" ] && [ "$power_on" -gt 0 ]; then
      # 8760 hours per year; 10# forces base 10 if the value has leading zeros
      disk_age_years=$((10#$power_on / 8760))
      if [ "$disk_age_years" -ge 5 ]; then
        age_warning="⚠️ DISK AGE: $disk_age_years years old (REPLACE - expected lifespan: 3-5 years)"
        _raise_disk_risk 1
      elif [ "$disk_age_years" -ge 3 ]; then
        age_warning="ℹ️ DISK AGE: $disk_age_years years old (consider replacement soon)"
      fi
    fi

    # NVMe-SPECIFIC HEALTH (if NVMe drive)
    nvme_wear=""
    nvme_spare=""
    if [[ "$disk" == *"nvme"* ]]; then
      # Percentage Used (wear indicator)
      percent_used=$(_nvme_log_percent "$smart_data" "Percentage Used")
      if [ -n "$percent_used" ] && [ "$percent_used" -gt 90 ]; then
        nvme_wear="⚠️ NVMe WEAR: ${percent_used}% used (CRITICAL - near end of life!)"
        _raise_disk_risk 2
      elif [ -n "$percent_used" ] && [ "$percent_used" -gt 80 ]; then
        nvme_wear="⚠️ NVMe WEAR: ${percent_used}% used (high wear - monitor closely)"
        _raise_disk_risk 1
      fi

      # Available Spare
      avail_spare=$(_nvme_log_percent "$smart_data" "Available Spare")
      if [ -n "$avail_spare" ] && [ "$avail_spare" -lt 10 ]; then
        nvme_spare="⚠️ NVMe SPARE: ${avail_spare}% available spare (CRITICAL!)"
        _raise_disk_risk 2
      fi
    fi

    # Check for I/O errors in system logs.
    # NOTE(review): this counts matches in the whole of /var/log/messages; the
    # "last 7 days" wording in the report assumes weekly log rotation - confirm.
    disk_name=$(basename "$disk")
    io_errors=$(grep -i "$disk_name.*error\|$disk_name.*failed\|ata.*$disk_name" /var/log/messages 2>/dev/null | wc -l)
    recent_io_samples=""
    if [ "$io_errors" -gt 0 ]; then
      recent_io_samples=$(grep -i "$disk_name.*error\|$disk_name.*failed" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/ /')
    fi

    # CRITICAL: Immediate failure indicators
    if [ -n "$reallocated" ] && [ "$reallocated" -gt 50 ]; then
      _raise_disk_risk 3
      risk_factors+="⚠️ CRITICAL: $reallocated reallocated sectors (DRIVE FAILING SOON!)"$'\n'
    elif [ -n "$reallocated" ] && [ "$reallocated" -gt 10 ]; then
      _raise_disk_risk 2
      risk_factors+="⚠️ HIGH: $reallocated reallocated sectors (failure risk increasing)"$'\n'
    elif [ -n "$reallocated" ] && [ "$reallocated" -gt 0 ]; then
      _raise_disk_risk 1
      risk_factors+="⚠️ MODERATE: $reallocated reallocated sectors detected"$'\n'
    fi

    if [ -n "$pending" ] && [ "$pending" -gt 10 ]; then
      _raise_disk_risk 3
      risk_factors+="⚠️ CRITICAL: $pending pending sectors (READ/WRITE FAILURES!)"$'\n'
    elif [ -n "$pending" ] && [ "$pending" -gt 0 ]; then
      _raise_disk_risk 1
      risk_factors+="⚠️ MODERATE: $pending pending sectors"$'\n'
    fi

    if [ -n "$uncorrectable" ] && [ "$uncorrectable" -gt 0 ]; then
      _raise_disk_risk 2
      risk_factors+="⚠️ HIGH: $uncorrectable uncorrectable sectors (data loss possible)"$'\n'
    fi

    # Temperature warnings
    if [ -n "$temp" ] && [ "$temp" -gt 55 ]; then
      _raise_disk_risk 1
      risk_factors+="⚠️ Temperature: ${temp}°C (OVERHEATING - threshold: 50°C)"$'\n'
    elif [ -n "$temp" ] && [ "$temp" -gt 50 ]; then
      risk_factors+="⚠️ Temperature: ${temp}°C (above recommended 50°C)"$'\n'
    fi

    # I/O errors from logs
    if [ "$io_errors" -gt 50 ]; then
      _raise_disk_risk 2
      risk_factors+="⚠️ HIGH: $io_errors I/O errors in last 7 days (hardware problem!)"$'\n'
    elif [ "$io_errors" -gt 10 ]; then
      _raise_disk_risk 1
      risk_factors+="⚠️ MODERATE: $io_errors I/O errors in last 7 days"$'\n'
    fi

    # Add disk age warning to risk factors
    [ -n "$age_warning" ] && risk_factors+="$age_warning"$'\n'

    # Add NVMe-specific warnings to risk factors
    [ -n "$nvme_wear" ] && risk_factors+="$nvme_wear"$'\n'
    [ -n "$nvme_spare" ] && risk_factors+="$nvme_spare"$'\n'

    # Determine severity and report
    # Be SMART about health status - only flag if explicitly FAILED
    if [[ "$health" =~ FAILED ]]; then
      # SMART health check explicitly FAILED
      failed_count=$((failed_count + 1))
      add_finding "CRITICAL" "🔴 DISK FAILURE: $disk - REPLACE IMMEDIATELY" \
"Device: $disk
Model: $model
Serial: $serial
Health: FAILED ❌

SMART Status: FAILED
Reallocated Sectors: ${reallocated:-N/A}
Pending Sectors: ${pending:-N/A}
Uncorrectable Sectors: ${uncorrectable:-N/A}
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}

Recent I/O Errors (last 7 days): $io_errors
${recent_io_samples:+Recent errors from /var/log/messages:
$recent_io_samples}" \
"🚨 IMMEDIATE ACTION REQUIRED - DISK FAILING:
1. BACKUP ALL DATA IMMEDIATELY (drive may fail at any moment)
2. Order replacement disk NOW
3. Plan maintenance window for replacement
4. Review SMART details: smartctl -a $disk
5. Check logs: grep -i '${disk_name}' /var/log/messages
6. If RAID: Verify array status and prepare for rebuild"

    elif (( risk_level >= 3 )); then
      # Predictive: Drive will fail SOON
      failed_count=$((failed_count + 1))
      add_finding "CRITICAL" "🔴 DRIVE FAILING SOON: $disk - REPLACE URGENTLY" \
"Device: $disk
Model: $model
Serial: $serial
Health: $health (but critical attributes detected)

⚠️ FAILURE RISK: IMMINENT - Drive will likely fail within days/weeks

Critical Issues:
$risk_factors
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors
${recent_io_samples:+Recent errors from /var/log/messages:
$recent_io_samples}" \
"🚨 URGENT - DRIVE REPLACEMENT REQUIRED:
1. Order replacement disk immediately
2. Ensure backups are current and verified
3. Plan replacement within 1-2 weeks (sooner if possible)
4. Monitor daily: smartctl -A $disk
5. Watch for increasing errors: grep -i '${disk_name}' /var/log/messages
6. Do NOT wait for complete failure - replace proactively"

    elif (( risk_level == 2 )); then
      # High risk of failure
      warning_count=$((warning_count + 1))
      add_finding "WARNING" "🟡 HIGH FAILURE RISK: $disk - Plan Replacement" \
"Device: $disk
Model: $model
Serial: $serial
Health: $health

⚠️ FAILURE RISK: HIGH - Replacement recommended

Risk Factors:
$risk_factors
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors" \
"⚠️ PLAN DISK REPLACEMENT:
• Order spare disk as precaution
• Monitor weekly: smartctl -A $disk
• Watch for deterioration in attributes
• Ensure backups are current
• Check logs regularly: grep -i '${disk_name}' /var/log/messages"

    elif (( risk_level == 1 )); then
      # Moderate risk - monitor closely
      warning_count=$((warning_count + 1))
      add_finding "WARNING" "🟡 Disk $disk: Warning Signs Detected" \
"Device: $disk
Model: $model
Serial: $serial
Health: $health

⚠️ FAILURE RISK: MODERATE - Monitor closely

Warning Signs:
$risk_factors
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors" \
"Monitor this disk closely:
• Check SMART weekly: smartctl -A $disk
• Watch for increasing reallocated/pending sectors
• Monitor system logs: grep -i '${disk_name}' /var/log/messages
• Ensure backups are current"

    else
      # Disk is healthy
      healthy_count=$((healthy_count + 1))
      add_finding "INFO" "✅ Disk $disk: Healthy" \
"Device: $disk
Model: $model
Serial: $serial
Health: $health ✅

SMART Attributes:
Reallocated Sectors: ${reallocated:-0}
Pending Sectors: ${pending:-0}
Uncorrectable Sectors: ${uncorrectable:-0}
Temperature: ${temp:-N/A}°C (optimal: <50°C)
Power On Hours: ${power_on:-N/A}
I/O Errors (7 days): $io_errors" \
"Disk is healthy - continue regular monitoring
• Monthly SMART check recommended: smartctl -A $disk"
    fi
  done

  # Summary finding with skip breakdown
  local summary_details="Total devices found: $disk_count
Physical disks monitored: $healthy_count healthy, $warning_count warning, $failed_count failed"

  if [ "$skipped_count" -gt 0 ]; then
    summary_details="${summary_details}
Devices skipped (SMART not applicable): $skipped_count"
    if [ "$skipped_raid" -gt 0 ]; then
      summary_details="${summary_details}
• Hardware RAID controllers: $skipped_raid (use vendor tools)"
    fi
    if [ "$skipped_lvm" -gt 0 ]; then
      summary_details="${summary_details}
• Software RAID/LVM: $skipped_lvm (monitor underlying disks)"
    fi
    if [ "$skipped_virtual" -gt 0 ]; then
      summary_details="${summary_details}
• Virtual/cloud disks: $skipped_virtual (managed by hypervisor)"
    fi
    if [ "$skipped_other" -gt 0 ]; then
      summary_details="${summary_details}
• Other (USB/special): $skipped_other (see findings for details)"
    fi
  fi

  add_finding "INFO" "Disk Health Summary" \
"$summary_details" \
"Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
}

# Function to check memory health with ECC error detection
|
||
check_memory_health() {
|
||
echo -e "${CYAN}[INFO]${NC} Checking memory health..."
|
||
|
||
if ! command_exists dmidecode; then
|
||
add_finding "INFO" "dmidecode Not Available" \
|
||
"dmidecode is not installed - cannot check memory details" \
|
||
"Install dmidecode: yum install dmidecode"
|
||
return
|
||
fi
|
||
|
||
# Get memory information
|
||
local total_slots=$(dmidecode -t memory 2>/dev/null | grep -c "Memory Device$")
|
||
local populated_slots=$(dmidecode -t memory 2>/dev/null | grep -A 20 "Memory Device" | grep "Size:" | grep -cv "No Module Installed")
|
||
|
||
# Get total memory
|
||
local total_mem=$(free -h | grep "Mem:" | awk '{print $2}')
|
||
local used_mem=$(free -h | grep "Mem:" | awk '{print $3}')
|
||
local available_mem=$(free -h | grep "Mem:" | awk '{print $7}')
|
||
|
||
# Check for ECC
|
||
local ecc_support=$(dmidecode -t memory 2>/dev/null | grep "Error Correction Type" | head -1 | grep -v "None" | wc -l)
|
||
local ecc_type=$(dmidecode -t memory 2>/dev/null | grep "Error Correction Type" | head -1 | sed 's/.*Error Correction Type:[ ]*//')
|
||
|
||
# Check for memory errors in dmesg
|
||
local mem_errors=$(dmesg | grep -i "memory error\|ecc error\|mcelog" | wc -l)
|
||
|
||
# Check hardware errors in system log (last 7 days)
|
||
local hw_mem_errors=$(grep -i "memory.*error\|ecc.*error\|edac.*error" /var/log/messages 2>/dev/null | wc -l)
|
||
|
||
# Check for specific ECC error types
|
||
local single_bit_errors=$(grep -i "single.*bit.*error\|correctable.*ecc" /var/log/messages 2>/dev/null | wc -l)
|
||
local multi_bit_errors=$(grep -i "multi.*bit.*error\|uncorrectable.*ecc" /var/log/messages 2>/dev/null | wc -l)
|
||
|
||
# Check for OOM killer events
|
||
local oom_events=$(grep -i "out of memory\|oom.*kill\|invoked oom-killer" /var/log/messages 2>/dev/null | wc -l)
|
||
local recent_oom=""
|
||
if [ "$oom_events" -gt 0 ]; then
|
||
recent_oom=$(grep -i "out of memory\|oom.*kill" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/ /')
|
||
fi
|
||
|
||
# Check swap usage (high swap can indicate memory pressure)
|
||
local swap_total=$(free -h | grep "Swap:" | awk '{print $2}')
|
||
local swap_used=$(free -h | grep "Swap:" | awk '{print $3}')
|
||
local swap_pct=0
|
||
if [ "$swap_total" != "0B" ] && [ -n "$swap_total" ]; then
|
||
swap_pct=$(free | grep "Swap:" | awk '{if ($2>0) print int($3/$2*100); else print 0}')
|
||
fi
|
||
|
||
# Try to identify bad memory module from ECC errors
|
||
local bad_dimm=""
|
||
if [ "$hw_mem_errors" -gt 0 ]; then
|
||
# Look for EDAC messages that identify specific DIMMs
|
||
bad_dimm=$(grep -i "edac.*dimm\|edac.*channel\|edac.*slot" /var/log/messages 2>/dev/null | tail -5 | sed 's/^/ /')
|
||
if [ -z "$bad_dimm" ]; then
|
||
# Try CE (Correctable Error) messages
|
||
bad_dimm=$(grep -i "ce.*error.*channel\|ce.*error.*dimm" /var/log/messages 2>/dev/null | tail -5 | sed 's/^/ /')
|
||
fi
|
||
fi
|
||
|
||
# Build memory details
|
||
local mem_modules=$(dmidecode -t memory 2>/dev/null | grep -A 20 "Memory Device" | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:|Locator:" | sed 's/^[ \t]*/ /')
|
||
|
||
# ANALYZE MEMORY HEALTH
|
||
local mem_status="HEALTHY"
|
||
local mem_risk=""
|
||
|
||
# CRITICAL: Multi-bit ECC errors (uncorrectable)
|
||
if [ "$multi_bit_errors" -gt 0 ]; then
|
||
mem_status="CRITICAL"
|
||
mem_risk+="🔴 CRITICAL: $multi_bit_errors UNCORRECTABLE ECC errors (multi-bit) - DATA CORRUPTION RISK!"$'\n'
|
||
fi
|
||
|
||
# HIGH: Excessive single-bit errors
|
||
if [ "$single_bit_errors" -gt 100 ]; then
|
||
mem_status="CRITICAL"
|
||
mem_risk+="🔴 CRITICAL: $single_bit_errors correctable ECC errors (BAD DIMM - replace immediately!)"$'\n'
|
||
elif [ "$single_bit_errors" -gt 20 ]; then
|
||
[ "$mem_status" = "HEALTHY" ] && mem_status="WARNING"
|
||
mem_risk+="🟡 WARNING: $single_bit_errors correctable ECC errors (faulty DIMM likely)"$'\n'
|
||
elif [ "$single_bit_errors" -gt 0 ]; then
|
||
[ "$mem_status" = "HEALTHY" ] && mem_status="INFO"
|
||
mem_risk+="ℹ️ INFO: $single_bit_errors correctable ECC errors (monitor closely)"$'\n'
|
||
fi
|
||
|
||
# OOM killer events
|
||
if [ "$oom_events" -gt 10 ]; then
|
||
[ "$mem_status" = "HEALTHY" ] && mem_status="WARNING"
|
||
mem_risk+="🟡 WARNING: $oom_events Out-Of-Memory events (insufficient RAM for workload!)"$'\n'
|
||
elif [ "$oom_events" -gt 0 ]; then
|
||
mem_risk+="ℹ️ INFO: $oom_events OOM events (consider adding RAM)"$'\n'
|
||
fi
|
||
|
||
# Swap thrashing
|
||
if [ "$swap_pct" -gt 80 ]; then
|
||
[ "$mem_status" = "HEALTHY" ] && mem_status="WARNING"
|
||
mem_risk+="🟡 WARNING: Swap ${swap_pct}% full (memory pressure - consider upgrade)"$'\n'
|
||
elif [ "$swap_pct" -gt 50 ]; then
|
||
mem_risk+="ℹ️ INFO: Swap ${swap_pct}% used (moderate memory pressure)"$'\n'
|
||
fi
|
||
|
||
# Generate findings based on analysis
|
||
if [ "$mem_status" = "CRITICAL" ]; then
|
||
local recent_errors=$(grep -i "memory.*error\|ecc.*error" /var/log/messages 2>/dev/null | tail -10 | sed 's/^/ /')
|
||
|
||
add_finding "CRITICAL" "🔴 MEMORY FAILURE: Replace RAM Immediately" \
|
||
"Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
|
||
Slots: $populated_slots / $total_slots
|
||
ECC Support: $([ "${ecc_support:-0}" -gt 0 ] && echo "Yes ($ecc_type)" || echo 'No')
|
||
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)
|
||
|
||
🔴 CRITICAL MEMORY ISSUES:
|
||
$mem_risk
|
||
|
||
Memory Errors Detected:
|
||
• Total errors in logs: $hw_mem_errors
|
||
• Single-bit (correctable): $single_bit_errors
|
||
• Multi-bit (UNCORRECTABLE): $multi_bit_errors
|
||
• OOM killer events: $oom_events
|
||
|
||
${bad_dimm:+Faulty Module Location:
|
||
$bad_dimm
|
||
}
|
||
Recent errors from /var/log/messages:
|
||
$recent_errors" \
|
||
"🚨 IMMEDIATE ACTION REQUIRED:
|
||
1. IDENTIFY BAD DIMM: Check logs above for slot/channel information
|
||
2. REPLACE FAULTY RAM: Order replacement immediately
|
||
3. RUN MEMTEST: Boot memtest86+ to identify bad module
|
||
4. CHECK ALL ERRORS: grep -i 'ecc\|edac' /var/log/messages | less
|
||
5. MONITOR CORRUPTION: Watch for application crashes, file corruption
|
||
6. If multi-bit errors: PLAN IMMEDIATE DOWNTIME for replacement
|
||
|
||
Commands to identify faulty DIMM:
|
||
• dmidecode -t memory (shows all slots)
|
||
• grep -i edac /var/log/messages (shows which slot failing)
|
||
• edac-util (if installed: yum install edac-utils)"
|
||
|
||
elif [ "$mem_status" = "WARNING" ]; then
|
||
local recent_errors=$(grep -i "memory.*error\|ecc.*error\|oom" /var/log/messages 2>/dev/null | tail -8 | sed 's/^/ /')
|
||
|
||
add_finding "WARNING" "🟡 Memory Issues Detected - Action Required" \
|
||
"Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
|
||
Slots: $populated_slots / $total_slots
|
||
ECC Support: $([ "${ecc_support:-0}" -gt 0 ] && echo "Yes ($ecc_type)" || echo 'No')
|
||
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)
|
||
|
||
⚠️ WARNING - Memory Issues:
|
||
$mem_risk
|
||
|
||
Memory Errors Detected:
|
||
• Total errors in logs: $hw_mem_errors
|
||
• Single-bit (correctable): $single_bit_errors
|
||
• Multi-bit (UNCORRECTABLE): $multi_bit_errors
|
||
• OOM killer events: $oom_events
|
||
|
||
${recent_oom:+Recent OOM Events:
|
||
$recent_oom
|
||
}
|
||
${bad_dimm:+Possible Faulty Module:
|
||
$bad_dimm
|
||
}
|
||
Recent errors:
|
||
$recent_errors" \
|
||
"⚠️ RECOMMENDED ACTIONS:
|
||
• Monitor error rate: grep -i 'ecc\|memory error' /var/log/messages | wc -l
|
||
• Check for increasing errors (run daily, compare counts)
|
||
• If ECC errors increasing: Plan RAM replacement
|
||
• If OOM events: Consider RAM upgrade or reduce workload
|
||
• Review memory usage: free -h && top -o %MEM | head -15
|
||
|
||
For ECC errors:
|
||
• Install monitoring: yum install edac-utils
|
||
• Check status: edac-util -v
|
||
• Identify DIMM: dmidecode -t memory | grep -A 20 'Memory Device'"
|
||
|
||
else
|
||
add_finding "INFO" "✅ Memory Health: No Issues Detected" \
|
||
"Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
|
||
Slots: $populated_slots / $total_slots
|
||
ECC Support: $([ "${ecc_support:-0}" -gt 0 ] && echo "Yes ($ecc_type)" || echo 'No')
|
||
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)
|
||
|
||
Memory Errors: None detected
|
||
OOM Events: None detected
|
||
ECC Errors: None detected
|
||
|
||
Installed Modules:
|
||
$mem_modules" \
|
||
"Memory appears healthy
|
||
• Regular monitoring recommended if ECC supported
|
||
• Watch for OOM events: grep -i 'oom' /var/log/messages"
|
||
fi
|
||
}
|
||
|
||
# Function to check CPU health with thermal throttling detection
#
# Gathers CPU identity, machine-check and thermal log entries, cpufreq state,
# temperatures and load, then records one CRITICAL / WARNING / INFO finding
# through add_finding (plus an extra INFO finding when `sensors` is missing).
#
# Arguments:
#   $1 - system log to scan (optional, default: /var/log/messages)
# Uses (defined elsewhere in this file): add_finding, command_exists, CYAN, NC
check_cpu_health() {
    local log_file="${1:-/var/log/messages}"

    echo -e "${CYAN}[INFO]${NC} Checking CPU health..."

    # Get CPU info
    local cpu_model cpu_cores cpu_threads
    cpu_model=$(grep "model name" /proc/cpuinfo 2>/dev/null | head -1 | cut -d':' -f2 | sed 's/^[ \t]*//')
    # Counts the "processor" entries listed in /proc/cpuinfo.
    cpu_cores=$(grep -c "^processor" /proc/cpuinfo 2>/dev/null)
    cpu_threads=$(nproc 2>/dev/null)
    [[ "$cpu_threads" =~ ^[0-9]+$ ]] || cpu_threads=0

    # Banner lines that contain "mce" but do not report an error. Without this
    # filter they are counted as Machine Check Exceptions and force CRITICAL.
    # NOTE(review): list is based on common kernel boot banners - extend if
    # other informational lines show up on your hardware.
    local mce_noise='mce banks|mce decoding enabled|thermal monitoring enabled'

    # Check for CPU errors in dmesg (stderr silenced: dmesg may be restricted)
    local cpu_errors
    cpu_errors=$(dmesg 2>/dev/null | grep -iE "mce|machine check|cpu.*error" | grep -civE "$mce_noise")

    # Check system log for hardware errors
    local hw_cpu_errors
    hw_cpu_errors=$(grep -iE "mce|machine check exception|cpu.*error" "$log_file" 2>/dev/null | grep -civE "$mce_noise")

    # Check for thermal throttling events
    local throttle_events
    throttle_events=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical|thermal.*shutdown" "$log_file" 2>/dev/null | wc -l)
    local recent_throttle=""
    if [ "$throttle_events" -gt 0 ]; then
        recent_throttle=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical" "$log_file" 2>/dev/null | tail -3 | sed 's/^/ /')
    fi

    # Get current CPU frequency and max frequency
    local cpu_freq=""
    local cpu_max_freq=""
    local freq_throttled=false
    local cur_freq_file="/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"
    local max_freq_file="/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"
    local freq_khz="" max_freq_khz="" throttle_pct=0

    if [ -f "$cur_freq_file" ]; then
        freq_khz=$(cat "$cur_freq_file" 2>/dev/null)
        # Only format/compare when the sysfs value really is a number;
        # an unreadable file previously produced " GHz" and a broken test.
        if [[ "$freq_khz" =~ ^[0-9]+$ ]]; then
            cpu_freq="$(awk -v khz="$freq_khz" 'BEGIN {printf "%.2f", khz / 1000000}') GHz"

            # Check max frequency
            if [ -f "$max_freq_file" ]; then
                max_freq_khz=$(cat "$max_freq_file" 2>/dev/null)
                if [[ "$max_freq_khz" =~ ^[0-9]+$ ]] && [ "$max_freq_khz" -gt 0 ]; then
                    cpu_max_freq="$(awk -v khz="$max_freq_khz" 'BEGIN {printf "%.2f", khz / 1000000}') GHz"

                    # Check if significantly throttled (more than 20% below max)
                    # NOTE(review): an idle CPU under a power-saving governor
                    # also runs below max - confirm this should be a WARNING.
                    throttle_pct=$(( (max_freq_khz - freq_khz) * 100 / max_freq_khz ))
                    if [ "$throttle_pct" -gt 20 ]; then
                        freq_throttled=true
                    fi
                fi
            fi
        fi
    fi

    # Check CPU temperature with multiple methods
    local cpu_temp="N/A"
    local temp_value=0
    local all_core_temps=""

    if command_exists sensors; then
        local sensors_out
        sensors_out=$(sensors 2>/dev/null)

        # Try to get all core temperatures
        all_core_temps=$(printf '%s\n' "$sensors_out" | grep -E "Core [0-9]+:" | sed 's/^/ /')

        # Get highest core temperature. Only the first reading on each line is
        # used: the same line also carries "(high = +80.0°C, crit = +100.0°C)"
        # limits, which must not be mistaken for the measured temperature.
        cpu_temp=$(printf '%s\n' "$sensors_out" \
            | grep -E "Core [0-9]+:|temp1:" \
            | sed -nE 's/^[^:]*:[[:space:]]*\+([0-9]+(\.[0-9]+)?).*/\1/p' \
            | sort -n | tail -1)
        if [ -n "$cpu_temp" ]; then
            temp_value=${cpu_temp%.*}
            cpu_temp="${cpu_temp}°C"
        else
            cpu_temp="N/A"
        fi
    fi

    # Fallback: Check thermal zones (values are divided by 1000 below)
    if [ "$cpu_temp" = "N/A" ] && [ -d "/sys/class/thermal" ]; then
        local zone temp
        for zone in /sys/class/thermal/thermal_zone*/temp; do
            if [ -f "$zone" ]; then
                temp=$(cat "$zone" 2>/dev/null)
                if [[ "$temp" =~ ^[0-9]+$ ]] && [ "$temp" -gt 0 ]; then
                    temp_value=$((temp / 1000))
                    cpu_temp="${temp_value}°C"
                    break
                fi
            fi
        done
    fi
    [[ "$temp_value" =~ ^[0-9]+$ ]] || temp_value=0

    # Threshold annotations are only meaningful when a temperature was read.
    local crit_note="" normal_note=""
    if [ "$cpu_temp" != "N/A" ]; then
        crit_note=" (CRITICAL threshold: 90°C)"
        normal_note=" (normal: <70°C)"
    fi

    # Check load average
    local load_avg load_1min
    load_avg=$(uptime | awk -F'load average:' '{print $2}' | sed 's/^[ \t]*//')
    load_1min=$(echo "$load_avg" | awk -F',' '{print $1}' | tr -d ' ')

    # Calculate load percentage
    local load_pct=0
    if [ -n "$load_1min" ] && [ "$cpu_threads" -gt 0 ]; then
        load_pct=$(awk -v load="$load_1min" -v threads="$cpu_threads" \
            'BEGIN {printf "%.0f", (load / threads) * 100}' 2>/dev/null)
    fi
    [[ "$load_pct" =~ ^[0-9]+$ ]] || load_pct=0

    # ANALYZE CPU HEALTH
    local cpu_status="HEALTHY"
    local cpu_risk=""

    # CRITICAL: MCE/Hardware errors
    if [ "$hw_cpu_errors" -gt 0 ] || [ "$cpu_errors" -gt 0 ]; then
        cpu_status="CRITICAL"
        cpu_risk+="🔴 CRITICAL: $((cpu_errors + hw_cpu_errors)) Machine Check Exceptions (MCE) - HARDWARE FAILURE!"$'\n'
    fi

    # CRITICAL: Extreme overheating
    if [ "$temp_value" -gt 90 ]; then
        cpu_status="CRITICAL"
        cpu_risk+="🔴 CRITICAL: CPU temperature ${cpu_temp} - EXTREME OVERHEATING (damage risk!)"$'\n'
    elif [ "$temp_value" -gt 80 ]; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} - OVERHEATING (threshold: 80°C)"$'\n'
    elif [ "$temp_value" -gt 70 ]; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} - HIGH (normal: <70°C)"$'\n'
    fi

    # Thermal throttling
    if [ "$throttle_events" -gt 10 ]; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: $throttle_events thermal throttling events - COOLING PROBLEM!"$'\n'
    elif [ "$throttle_events" -gt 0 ]; then
        cpu_risk+="ℹ️ INFO: $throttle_events thermal throttling events detected"$'\n'
    fi

    # Frequency throttling
    if [ "$freq_throttled" = true ]; then
        [ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
        cpu_risk+="🟡 WARNING: CPU frequency throttled (${cpu_freq} / ${cpu_max_freq} max) - thermal or power limiting"$'\n'
    fi

    # High sustained load
    if [ "$load_pct" -gt 200 ]; then
        cpu_risk+="ℹ️ INFO: Very high load (${load_pct}% of capacity) - server may be overloaded"$'\n'
    fi

    # Generate findings
    if [ "$cpu_status" = "CRITICAL" ]; then
        local recent_errors
        recent_errors=$(grep -iE "mce|machine check|cpu.*error|thermal.*critical" "$log_file" 2>/dev/null \
            | grep -viE "$mce_noise" | tail -10 | sed 's/^/ /')

        add_finding "CRITICAL" "🔴 CPU CRITICAL: Hardware Failure or Overheating" \
"CPU Model: $cpu_model
Cores: $cpu_cores (Threads: $cpu_threads)
Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A})
Temperature: ${cpu_temp}${crit_note}
Load Average: $load_avg (${load_pct}% capacity)

🔴 CRITICAL CPU ISSUES:
$cpu_risk

Hardware Errors:
• MCE/CPU errors: $((cpu_errors + hw_cpu_errors))
• Thermal throttling events: $throttle_events

${all_core_temps:+Individual Core Temperatures:
$all_core_temps
}
${recent_throttle:+Recent Thermal Events:
$recent_throttle
}
Recent errors from logs:
$recent_errors" \
"🚨 IMMEDIATE ACTION REQUIRED:
1. CHECK TEMPERATURE: If >90°C, shut down immediately to prevent damage!
2. COOLING SYSTEM: Check fans, heatsink, thermal paste
3. MCE ERRORS: Critical hardware failure - contact vendor/provider
4. CLEAN SYSTEM: Remove dust from fans and heatsinks
5. VERIFY AIRFLOW: Ensure proper case ventilation
6. MONITOR: Watch temps continuously: watch -n 2 sensors

Commands:
• View all temps: sensors
• Check MCE details: dmesg | grep -i mce | less
• Monitor throttling: grep -i thermal /var/log/messages
• Check frequency: cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq"

    elif [ "$cpu_status" = "WARNING" ]; then
        add_finding "WARNING" "🟡 CPU Issues Detected - Cooling or Hardware Problem" \
"CPU Model: $cpu_model
Cores: $cpu_cores (Threads: $cpu_threads)
Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A})
Temperature: $cpu_temp
Load Average: $load_avg (${load_pct}% capacity)

⚠️ WARNING - CPU Issues:
$cpu_risk

Monitoring:
• Thermal throttling events: $throttle_events
• Current temperature: $cpu_temp

${all_core_temps:+Individual Core Temperatures:
$all_core_temps
}
${recent_throttle:+Recent Thermal Events:
$recent_throttle
}" \
"⚠️ RECOMMENDED ACTIONS:
• Clean cooling system (fans, heatsink)
• Verify fan operation: sensors (check fan RPM)
• Check case ventilation and airflow
• Monitor temperature trends: watch -n 5 sensors
• If throttling persists: Replace thermal paste or upgrade cooling
• Consider reducing workload if temperature stays high

Commands:
• Monitor live: watch -n 2 sensors
• Check throttling: grep -i thermal /var/log/messages
• View frequencies: grep MHz /proc/cpuinfo"

    else
        add_finding "INFO" "✅ CPU Health: Normal Operation" \
"CPU Model: $cpu_model
Cores: $cpu_cores (Threads: $cpu_threads)
Current Frequency: ${cpu_freq:-N/A} ${cpu_max_freq:+(Max: ${cpu_max_freq})}
Temperature: ${cpu_temp}${normal_note}
Load Average: $load_avg (${load_pct}% capacity)

Hardware Errors: None detected
Thermal Throttling: None detected
Frequency Throttling: None detected

${all_core_temps:+Individual Core Temperatures:
$all_core_temps
}" \
"CPU is operating normally
• Regular temperature monitoring recommended
• Monitor: sensors (if installed)"
    fi

    # Check if sensors are available for monitoring
    if ! command_exists sensors; then
        add_finding "INFO" "Temperature Monitoring Not Available" \
"lm_sensors is not installed - cannot monitor CPU/hardware temperatures" \
"Install sensors for temperature monitoring:
1. yum install lm_sensors
2. Run: sensors-detect (answer YES to all prompts)
3. Start service: systemctl start lm_sensors
4. View temperatures: sensors"
    fi
}
# Function to check system hardware errors
#
# Scans the system log for generic hardware / I/O / ATA / SCSI error lines and
# records a WARNING finding (count plus the last 10 matches) when any exist.
# Nothing is recorded when the log has no matches or cannot be read.
#
# Arguments:
#   $1 - system log to scan (optional, default: /var/log/messages)
# Uses (defined elsewhere in this file): add_finding, CYAN, NC
check_hardware_errors() {
    local log_file="${1:-/var/log/messages}"

    echo -e "${CYAN}[INFO]${NC} Checking system hardware error logs..."

    # Check for general hardware errors. The log is read once so the count
    # and the sample always describe the same set of lines.
    local matches
    matches=$(grep -iE "hardware error|i/o error|ata.*error|scsi.*error" "$log_file" 2>/dev/null)
    if [ -z "$matches" ]; then
        return 0
    fi

    local hw_errors recent_errors
    hw_errors=$(printf '%s\n' "$matches" | grep -c '')
    recent_errors=$(printf '%s\n' "$matches" | tail -10 | sed 's/^/ /')

    add_finding "WARNING" "Hardware Errors in System Log" \
"Total hardware-related errors: $hw_errors

Recent errors (last 10):
$recent_errors" \
"Hardware errors detected in system logs:
• Review full log: grep -iE 'hardware error|i/o error' /var/log/messages
• Check dmesg: dmesg | grep -i error | tail -20
• Identify failing component (disk, memory, CPU, etc.)
• Run component-specific diagnostics
• Contact hosting provider if persistent"
}
# Function to check RAID status
#
# Reports software RAID (md) health from the mdstat file and, when the
# `megacli` command is available, the state of MegaRAID virtual drives.
# Records an INFO "No RAID Detected" finding when neither is present.
#
# Arguments:
#   $1 - mdstat file to read (optional, default: /proc/mdstat)
# Uses (defined elsewhere in this file): add_finding, command_exists, CYAN, NC
check_raid_status() {
    local mdstat_file="${1:-/proc/mdstat}"

    echo -e "${CYAN}[INFO]${NC} Checking RAID status..."

    local raid_found=false

    # Check for software RAID (mdadm)
    if [ -f "$mdstat_file" ] && grep -q "active" "$mdstat_file" 2>/dev/null; then
        raid_found=true
        local raid_status degraded
        raid_status=$(cat "$mdstat_file")
        # A degraded array shows a member map such as [U_] or [_U]. Matching
        # only a bracket made of U/_ characters avoids false hits on lines
        # that merely contain "[", "_" and "]" somewhere (e.g. device names).
        degraded=$(printf '%s\n' "$raid_status" | grep -cE '\[[U_]*_[U_]*\]')

        if [ "$degraded" -gt 0 ]; then
            add_finding "CRITICAL" "Software RAID Degraded" \
"RAID array is degraded:

$raid_status" \
"RAID array degraded - immediate action required:
• Check details: cat /proc/mdstat
• Identify failed drive: mdadm --detail /dev/md*
• Replace failed drive and rebuild array
• Ensure backups are current"
        else
            add_finding "INFO" "Software RAID Status" \
"$raid_status" \
"Software RAID is healthy"
        fi
    fi

    # Check for hardware RAID (common controllers)
    if command_exists megacli; then
        raid_found=true
        local raid_info
        raid_info=$(megacli -LDInfo -Lall -aALL 2>/dev/null | grep -E "State|Virtual Drive")

        # Escalate when any virtual drive reports a non-optimal state instead
        # of always filing the controller output as INFO.
        # NOTE(review): state names assumed from MegaCli output - confirm.
        if printf '%s\n' "$raid_info" | grep -qiE 'State[[:space:]]*:[[:space:]]*(Partially Degraded|Degraded|Offline|Failed)'; then
            add_finding "CRITICAL" "MegaRAID Status" \
"$raid_info" \
"Virtual drive is not in Optimal state - immediate action required
Check details: megacli -LDInfo -Lall -aALL"
        else
            add_finding "INFO" "MegaRAID Status" \
"$raid_info" \
"Check details: megacli -LDInfo -Lall -aALL"
        fi
    fi

    if [ "$raid_found" != true ]; then
        add_finding "INFO" "No RAID Detected" \
"No software or hardware RAID arrays detected" \
"System appears to use non-RAID storage"
    fi
}
# Function to check disk I/O errors
#
# Looks for block-layer I/O error lines in the kernel ring buffer and records
# a WARNING finding with the total count and the last 10 matching lines.
# Nothing is recorded when there are no matches.
#
# Uses (defined elsewhere in this file): add_finding, CYAN, NC
check_disk_io_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking disk I/O errors..."

    # Check for I/O errors in dmesg. The buffer is captured once so the count
    # and the sample agree; stderr is silenced because dmesg may be restricted.
    local io_matches
    io_matches=$(dmesg 2>/dev/null | grep -iE "i/o error|blk_update_request|Buffer I/O error")
    if [ -z "$io_matches" ]; then
        return 0
    fi

    local io_errors recent_io_errors
    io_errors=$(printf '%s\n' "$io_matches" | grep -c '')
    recent_io_errors=$(printf '%s\n' "$io_matches" | tail -10 | sed 's/^/ /')

    add_finding "WARNING" "Disk I/O Errors Detected" \
"Total I/O errors in dmesg: $io_errors

Recent I/O errors (last 10):
$recent_io_errors" \
"Disk I/O errors detected - indicates hardware or connection issues:
• Check SMART status (see above)
• Review dmesg: dmesg | grep -i 'i/o error'
• Check cables and connections (if physical server)
• Check for disk controller issues
• May indicate failing disk or controller"
}
# Function to check filesystem errors
#
# Scans the system log for ext4/xfs/generic filesystem errors and read-only
# remounts. Records one finding when any are present: CRITICAL if a read-only
# remount was logged, WARNING otherwise.
#
# Arguments:
#   $1 - system log to scan (optional, default: /var/log/messages)
# Uses (defined elsewhere in this file): add_finding, CYAN, NC
check_filesystem_errors() {
    local log_file="${1:-/var/log/messages}"

    echo -e "${CYAN}[INFO]${NC} Checking filesystem errors..."

    # "remounting" is matched as well as "remounted": the kernel's own
    # message on an error-triggered remount reads "Remounting filesystem
    # read-only", which the narrower pattern never caught.
    local ro_pattern="remount(ed|ing).*read-only"

    # Check for filesystem errors in logs (single pass over the log)
    local fs_matches
    fs_matches=$(grep -iE "ext4-fs error|xfs.*error|filesystem.*error|$ro_pattern" "$log_file" 2>/dev/null)
    if [ -z "$fs_matches" ]; then
        return 0
    fi

    local fs_errors recent_fs_errors ro_remounts
    fs_errors=$(printf '%s\n' "$fs_matches" | grep -c '')
    recent_fs_errors=$(printf '%s\n' "$fs_matches" | tail -5 | sed 's/^/ /')

    # Check for read-only remounts (critical)
    ro_remounts=$(printf '%s\n' "$fs_matches" | grep -ciE "$ro_pattern")

    local severity="WARNING"
    if [ "$ro_remounts" -gt 0 ]; then
        severity="CRITICAL"
    fi

    add_finding "$severity" "🔴 Filesystem Errors Detected" \
"Total filesystem errors in logs: $fs_errors
Read-only remounts: $ro_remounts

Recent filesystem errors (last 5):
$recent_fs_errors" \
"Filesystem errors detected - may indicate disk corruption:
• Check filesystem: fsck (requires unmounting or single-user mode)
• Review all errors: grep -i 'filesystem.*error' /var/log/messages
• Check disk SMART status above
• If read-only remount: System is protecting data - investigate immediately
• May need to boot rescue mode to repair
• Ensure backups are current before repair attempts"
}
# Function to check system fans
#
# Reads fan lines from `sensors` and records one finding: CRITICAL when a fan
# reports exactly 0 RPM or FAULT, WARNING when a fan spins below 800 RPM,
# INFO otherwise. Skipped on virtual machines, when `sensors` is missing, or
# when it prints no fan lines.
#
# Uses (defined elsewhere in this file): add_finding, command_exists,
#   IS_VIRTUAL, VIRT_TYPE, CYAN, NC
check_system_fans() {
    # Skip fan checks on virtual machines (hypervisor controls physical fans)
    if [ "$IS_VIRTUAL" = true ]; then
        echo -e "${CYAN}[INFO]${NC} Skipping fan checks (virtual machine - $VIRT_TYPE)"
        return 0
    fi

    echo -e "${CYAN}[INFO]${NC} Checking system fan status..."

    if ! command_exists sensors; then
        return 0 # Silently skip if sensors not installed
    fi

    # Get fan information
    local fan_data
    fan_data=$(sensors 2>/dev/null | grep -i "fan")

    if [ -z "$fan_data" ]; then
        return 0 # No fan data available
    fi

    # Classify each fan by its actual reading: the number that follows the
    # label's colon. A plain substring search for "0 RPM" also matched
    # "1200 RPM" and "(min = 0 RPM)", flagging healthy fans as failed, and
    # field-based parsing broke on labels containing spaces ("CPU Fan:").
    local rpm_re=':[[:space:]]*([0-9]+)[[:space:]]*RPM'
    local nl=$'\n'
    local line rpm
    local failed_fans=0 slow_fans=0
    local failed_fan_list="" slow_fan_list=""

    while IFS= read -r line; do
        if [[ "$line" =~ $rpm_re ]]; then
            rpm=$((10#${BASH_REMATCH[1]}))
        else
            rpm=-1 # no RPM reading on this line
        fi

        if [ "$rpm" -eq 0 ] || [[ "$line" == *FAULT* ]]; then
            failed_fans=$((failed_fans + 1))
            failed_fan_list+="${failed_fan_list:+$nl} $line"
        elif [ "$rpm" -gt 0 ] && [ "$rpm" -lt 800 ]; then
            slow_fans=$((slow_fans + 1))
            slow_fan_list+="${slow_fan_list:+$nl} $line"
        fi
    done <<< "$fan_data"

    local all_fan_data
    all_fan_data=$(echo "$fan_data" | sed 's/^/ /')

    if [ "$failed_fans" -gt 0 ]; then
        add_finding "CRITICAL" "🔴 FAILED FAN(S) DETECTED" \
"Failed fans: $failed_fans

Failed fan details:
$failed_fan_list

All fan data:
$all_fan_data" \
"🚨 CRITICAL - FAN FAILURE DETECTED:
• Failed fans detected - system may overheat!
• Check all fan data: sensors
• Physical inspection required
• Replace failed fan immediately
• Monitor CPU/system temperatures closely
• May need emergency shutdown if temps rise above 90°C"

    elif [ "$slow_fans" -gt 0 ]; then
        add_finding "WARNING" "🟡 Slow Fan(s) Detected" \
"Slow fans (< 800 RPM): $slow_fans

Slow fan details:
$slow_fan_list

All fan data:
$all_fan_data" \
"⚠️ WARNING - FANS RUNNING SLOW:
• Fans running slower than normal
• May indicate fan wear or BIOS power settings
• Monitor temperatures closely
• Consider fan replacement if temperatures rise
• Check BIOS fan control settings"

    else
        add_finding "INFO" "✅ System Fans: Normal Operation" \
"All fans operating normally:

$all_fan_data" \
"All system fans operating within normal parameters"
    fi
}
# Function to check network interface errors
#
# Sums drop / error / CRC counters from `ethtool -S` across all interfaces
# (loopback, docker*, veth* and br-* excluded) and records one finding:
# CRITICAL or WARNING when thresholds are exceeded, INFO when healthy.
# Skipped when ethtool is missing or no interface is found.
#
# Uses (defined elsewhere in this file): add_finding, command_exists, CYAN, NC
check_network_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking network interface errors..."

    if ! command_exists ethtool; then
        return 0 # Silently skip if ethtool not installed
    fi

    # Get all active network interfaces (exclude loopback).
    # `ip -o link` prints stacked devices as "name@parent"; the "@parent"
    # suffix is stripped because ethtool expects the bare interface name.
    local interfaces
    interfaces=$(ip -o link show 2>/dev/null \
        | awk -F': ' '{print $2}' \
        | sed 's/@.*$//' \
        | grep -vE '^(lo$|docker|veth|br-)')

    if [ -z "$interfaces" ]; then
        return 0 # No interfaces found
    fi

    local total_rx_dropped=0
    local total_tx_dropped=0
    local total_rx_errors=0
    local total_tx_errors=0
    local total_crc_errors=0
    local problem_interfaces=""
    local has_issues=false
    local interface stats
    local rx_dropped tx_dropped rx_errors tx_errors crc_errors

    while IFS= read -r interface; do
        # Get statistics for this interface
        stats=$(ethtool -S "$interface" 2>/dev/null)
        [ -n "$stats" ] || continue

        # Extract key error metrics (different NICs use different naming).
        # One awk pass always yields five integers, so a driver that lacks a
        # counter (e.g. no rx_errors line) contributes 0 instead of an empty
        # operand that breaks the arithmetic below. "%.0f" keeps large sums
        # out of exponent notation.
        read -r rx_dropped tx_dropped rx_errors tx_errors crc_errors < <(
            printf '%s\n' "$stats" | awk '
                {
                    line = tolower($0)
                    value = $2 + 0
                    if (line ~ /rx.*drop|rx_discards/)  rx_drop += value
                    if (line ~ /tx.*drop|tx_discards/)  tx_drop += value
                    if (line ~ /^[ \t]*rx_errors:/)     rx_err  += value
                    if (line ~ /^[ \t]*tx_errors:/)     tx_err  += value
                    if (line ~ /crc.*error|rx_crc/)     crc     += value
                }
                END {
                    printf "%.0f %.0f %.0f %.0f %.0f\n", rx_drop, tx_drop, rx_err, tx_err, crc
                }'
        )

        # Accumulate totals
        total_rx_dropped=$((total_rx_dropped + rx_dropped))
        total_tx_dropped=$((total_tx_dropped + tx_dropped))
        total_rx_errors=$((total_rx_errors + rx_errors))
        total_tx_errors=$((total_tx_errors + tx_errors))
        total_crc_errors=$((total_crc_errors + crc_errors))

        # Check if this interface has significant issues
        if [ "$rx_dropped" -gt 1000 ] || [ "$tx_dropped" -gt 1000 ] || [ "$crc_errors" -gt 100 ]; then
            has_issues=true
            problem_interfaces+=" $interface:
RX dropped: $rx_dropped
TX dropped: $tx_dropped
CRC errors: $crc_errors
"
        fi
    done <<< "$interfaces"

    # Determine severity
    local severity="INFO"
    if [ "$total_rx_dropped" -gt 10000 ] || [ "$total_tx_dropped" -gt 10000 ] || [ "$total_crc_errors" -gt 1000 ]; then
        severity="CRITICAL"
    elif [ "$total_rx_dropped" -gt 1000 ] || [ "$total_tx_dropped" -gt 1000 ] || [ "$total_crc_errors" -gt 100 ]; then
        severity="WARNING"
    fi

    if [ "$has_issues" = true ] || [ "$severity" != "INFO" ]; then
        add_finding "$severity" "🔴 Network Interface Errors Detected" \
"Total across all interfaces:
• RX packets dropped: $total_rx_dropped
• TX packets dropped: $total_tx_dropped
• RX errors: $total_rx_errors
• TX errors: $total_tx_errors
• CRC errors: $total_crc_errors

Problem interfaces:
$problem_interfaces" \
"Network errors detected - may indicate hardware or driver issues:
• Check interface: ethtool eth0
• Check dmesg: dmesg | grep -i 'eth\|network'
• High drops may indicate:
- Network card failure
- Driver issues
- Switch/cable problems
- Bandwidth saturation
• CRC errors indicate:
- Bad cable
- EMI interference
- Faulty NIC
• If persistent: Replace network cable first, then NIC if needed"
    else
        # All healthy
        add_finding "INFO" "✅ Network Interfaces: Healthy" \
"All network interfaces operating normally
Total interfaces checked: $(echo "$interfaces" | wc -l)
No significant packet drops or errors detected" \
"Network hardware is functioning properly"
    fi
}
# Function to check PCI/PCIe errors
#
# Counts PCI / PCIe / AER error lines in dmesg and in the system log
# ($MESSAGES_CACHE when that file exists, /var/log/messages otherwise).
# Records WARNING when any exist, CRITICAL when an uncorrectable error is
# among them or the total exceeds 50. Nothing is recorded when clean.
#
# Uses (defined elsewhere in this file): add_finding, MESSAGES_CACHE, CYAN, NC
check_pci_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking PCI/PCIe errors..."

    local pci_pattern="pci.*error|pcie.*error|aer.*error"

    # Check for PCI errors in dmesg and logs
    local dmesg_matches
    dmesg_matches=$(dmesg 2>/dev/null \
        | grep -iE "$pci_pattern|correctable.*error.*pci|uncorrectable.*error.*pci")

    # Also check cached messages if available. The source is chosen once so
    # the count and the samples shown below come from the same file.
    local log_source="/var/log/messages"
    if [ -f "${MESSAGES_CACHE:-}" ]; then
        log_source="$MESSAGES_CACHE"
    fi
    local log_matches
    log_matches=$(grep -iE "$pci_pattern" "$log_source" 2>/dev/null)

    local pci_errors log_pci_errors
    pci_errors=$(printf '%s' "$dmesg_matches" | grep -c '')
    log_pci_errors=$(printf '%s' "$log_matches" | grep -c '')

    local total_pci_errors=$((pci_errors + log_pci_errors))
    if [ "$total_pci_errors" -eq 0 ]; then
        return 0
    fi

    # Get samples from both sources
    local dmesg_samples="" log_samples=""
    if [ -n "$dmesg_matches" ]; then
        dmesg_samples=$(printf '%s\n' "$dmesg_matches" | tail -5 | sed 's/^/ /')
    fi
    if [ -n "$log_matches" ]; then
        log_samples=$(printf '%s\n' "$log_matches" | tail -3 | sed 's/^/ /')
    fi

    # Check for uncorrectable errors (more serious). Every matched line is
    # inspected, not only the displayed samples, and the "Uncorrected"
    # spelling is accepted alongside "uncorrectable".
    # NOTE(review): "Uncorrected" is the wording assumed for AER reports.
    local uncorrectable=0
    if printf '%s\n%s\n' "$dmesg_matches" "$log_matches" | grep -qiE 'uncorrect(able|ed)'; then
        uncorrectable=1
    fi

    local severity="WARNING"
    local uncorrectable_label="No"
    if [ "$uncorrectable" -eq 1 ]; then
        uncorrectable_label="YES (CRITICAL!)"
    fi
    if [ "$uncorrectable" -eq 1 ] || [ "$total_pci_errors" -gt 50 ]; then
        severity="CRITICAL"
    fi

    add_finding "$severity" "🔴 PCI/PCIe Errors Detected" \
"Total PCI errors: $total_pci_errors
Uncorrectable errors: $uncorrectable_label

Recent errors from dmesg:
$dmesg_samples

${log_samples:+Recent errors from /var/log/messages:
$log_samples}" \
"PCI/PCIe errors detected - may indicate hardware problems:
• Uncorrectable errors = serious hardware issue
• Correctable errors = potential signal integrity problems
• Check details: dmesg | grep -i 'pci.*error'
• Check PCIe link status: lspci -vv | grep -A 5 'LnkSta'
• May indicate:
- Faulty PCIe device (network card, RAID controller, etc.)
- Motherboard issues
- Power supply problems
- Improper card seating
• If persistent: Reseat cards, check for firmware updates
• If uncorrectable: Replace failing hardware immediately"
}
# Function to check kernel parameters
#
# Reviews vm.swappiness, vm.dirty_ratio, Transparent Huge Pages and (when
# IS_VIRTUAL is not true) the I/O scheduler of each disk. Records a WARNING
# finding when any value is flagged, otherwise an INFO finding listing the
# values that could be read. Nothing is recorded when nothing was readable.
#
# Uses (defined elsewhere in this file): add_finding, command_exists,
#   IS_VIRTUAL, CYAN, NC
check_kernel_parameters() {
    echo -e "${CYAN}[INFO]${NC} Checking kernel parameters..."

    local warnings=""
    local info=""
    local nl=$'\n'

    if command_exists sysctl; then
        # Check vm.swappiness (should be 1-10 for servers).
        # Values are compared only when numeric so unexpected sysctl output
        # is skipped instead of breaking the integer tests.
        local swappiness
        swappiness=$(sysctl -n vm.swappiness 2>/dev/null)
        if [[ "$swappiness" =~ ^[0-9]+$ ]]; then
            if [ "$swappiness" -gt 60 ]; then
                warnings+=" • vm.swappiness=$swappiness (HIGH - should be 1-10 for servers)${nl}"
            elif [ "$swappiness" -gt 10 ]; then
                info+=" • vm.swappiness=$swappiness (consider lowering to 1-10 for better performance)${nl}"
            else
                info+=" • vm.swappiness=$swappiness ✅${nl}"
            fi
        fi

        # Check vm.dirty_ratio (should be 10-20)
        local dirty_ratio
        dirty_ratio=$(sysctl -n vm.dirty_ratio 2>/dev/null)
        if [[ "$dirty_ratio" =~ ^[0-9]+$ ]]; then
            if [ "$dirty_ratio" -gt 40 ]; then
                warnings+=" • vm.dirty_ratio=$dirty_ratio (HIGH - may cause stalls, recommended: 10-20)${nl}"
            elif [ "$dirty_ratio" -lt 10 ]; then
                info+=" • vm.dirty_ratio=$dirty_ratio (low - may impact write performance)${nl}"
            else
                info+=" • vm.dirty_ratio=$dirty_ratio ✅${nl}"
            fi
        fi

        # Check Transparent Huge Pages (should be never or madvise for databases).
        # The active mode is the bracketed word in the sysfs file.
        local thp_enabled
        thp_enabled=$(sed -n 's/.*\[\([^]]*\)\].*/\1/p' /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null)
        if [ -n "$thp_enabled" ]; then
            if [ "$thp_enabled" = "always" ]; then
                warnings+=" • Transparent Huge Pages=always (can cause latency spikes for databases)${nl}"
                warnings+="Recommended: echo never > /sys/kernel/mm/transparent_hugepage/enabled${nl}"
            else
                info+=" • Transparent Huge Pages=$thp_enabled ✅${nl}"
            fi
        fi
    fi

    # Check I/O schedulers for each disk
    if [ "$IS_VIRTUAL" != true ]; then # Only check on physical servers
        local disks
        disks=$(lsblk -nd -o NAME,TYPE 2>/dev/null | awk '$2=="disk" {print $1}')
        if [ -n "$disks" ]; then
            local disk scheduler rotational
            while IFS= read -r disk; do
                # Active scheduler is the bracketed entry in the sysfs file.
                scheduler=$(sed -n 's/.*\[\([^]]*\)\].*/\1/p' "/sys/block/$disk/queue/scheduler" 2>/dev/null)
                rotational=$(cat "/sys/block/$disk/queue/rotational" 2>/dev/null)

                if [ -z "$scheduler" ] || [ -z "$rotational" ]; then
                    continue
                fi

                # Check if scheduler is appropriate for disk type
                if [[ "$disk" == nvme* ]]; then
                    # NVMe should use 'none'
                    if [ "$scheduler" != "none" ]; then
                        info+=" • /dev/$disk (NVMe): scheduler=$scheduler (consider 'none' for NVMe)${nl}"
                    else
                        info+=" • /dev/$disk (NVMe): scheduler=$scheduler ✅${nl}"
                    fi
                elif [ "$rotational" = "0" ]; then
                    # SSD should use mq-deadline or none
                    if [ "$scheduler" != "mq-deadline" ] && [ "$scheduler" != "none" ] && [ "$scheduler" != "deadline" ]; then
                        info+=" • /dev/$disk (SSD): scheduler=$scheduler (consider 'mq-deadline' for SSD)${nl}"
                    else
                        info+=" • /dev/$disk (SSD): scheduler=$scheduler ✅${nl}"
                    fi
                else
                    # HDD should use mq-deadline or deadline
                    if [ "$scheduler" != "mq-deadline" ] && [ "$scheduler" != "deadline" ]; then
                        info+=" • /dev/$disk (HDD): scheduler=$scheduler (consider 'mq-deadline' for HDD)${nl}"
                    else
                        info+=" • /dev/$disk (HDD): scheduler=$scheduler ✅${nl}"
                    fi
                fi
            done <<< "$disks"
        fi
    fi

    # Generate finding based on what we found
    if [ -n "$warnings" ]; then
        add_finding "WARNING" "⚠️ Kernel Parameters: Sub-Optimal Configuration" \
"Performance-impacting kernel parameters detected:

$warnings
${info:+
Informational:
$info}" \
"Kernel parameters affect system performance and stability:
• vm.swappiness: Controls swap usage (1-10 for servers)
- Fix: sysctl -w vm.swappiness=10
- Permanent: echo 'vm.swappiness=10' >> /etc/sysctl.conf
• vm.dirty_ratio: Controls dirty page cache
- Fix: sysctl -w vm.dirty_ratio=15
• Transparent Huge Pages: Can cause latency for databases
- Fix: echo never > /sys/kernel/mm/transparent_hugepage/enabled
• I/O Scheduler: Affects disk performance
- NVMe: echo none > /sys/block/nvme0n1/queue/scheduler
- SSD: echo mq-deadline > /sys/block/sda/queue/scheduler"
    elif [ -n "$info" ]; then
        add_finding "INFO" "ℹ️ Kernel Parameters: Configuration Status" \
"Current kernel parameters:

$info" \
"Kernel parameters are within acceptable ranges. Minor optimizations may be possible."
    fi
}
# Function to generate report
|
||
generate_report() {
|
||
local report_content=""
|
||
|
||
# Count findings by severity
|
||
local critical_count=0
|
||
local warning_count=0
|
||
local info_count=0
|
||
|
||
for finding in "${FINDINGS[@]}"; do
|
||
local severity=$(echo "$finding" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
|
||
case "$severity" in
|
||
CRITICAL) critical_count=$((critical_count + 1)) ;;
|
||
WARNING) warning_count=$((warning_count + 1)) ;;
|
||
INFO) info_count=$((info_count + 1)) ;;
|
||
esac
|
||
done
|
||
|
||
report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
|
||
report_content+="║ HARDWARE HEALTH CHECK REPORT ║"$'\n'
|
||
report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Date: $(date '+%Y-%m-%d %H:%M:%S')"$'\n'
|
||
report_content+="System: $SYS_HOSTNAME"$'\n'
|
||
report_content+="Control Panel: $SYS_PANEL ${SYS_PANEL_VER:-unknown}"$'\n'
|
||
report_content+="OS: $SYS_OS ${SYS_OS_VER:-unknown}"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# VISUAL SEVERITY SUMMARY - Make issues OBVIOUS
|
||
report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
|
||
if [ "$critical_count" -gt 0 ]; then
|
||
report_content+="║ 🔴 CRITICAL ISSUES DETECTED - IMMEDIATE ACTION REQUIRED ║"$'\n'
|
||
elif [ "$warning_count" -gt 0 ]; then
|
||
report_content+="║ 🟡 WARNING - Hardware Issues Detected ║"$'\n'
|
||
else
|
||
report_content+="║ ✅ ALL HARDWARE CHECKS PASSED - System Healthy ║"$'\n'
|
||
fi
|
||
report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# Severity breakdown
|
||
report_content+="FINDINGS SUMMARY:"$'\n'
|
||
report_content+="──────────────────────────────────────────────────────────────────────────────"$'\n'
|
||
if [ "$critical_count" -gt 0 ]; then
|
||
report_content+=" 🔴 CRITICAL: $critical_count issue(s) - URGENT ATTENTION REQUIRED"$'\n'
|
||
fi
|
||
if [ "$warning_count" -gt 0 ]; then
|
||
report_content+=" 🟡 WARNING: $warning_count issue(s) - Review and plan action"$'\n'
|
||
fi
|
||
report_content+=" ℹ️ INFO: $info_count item(s) - Status information"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# If critical issues, list them prominently at the top
|
||
if [ "$critical_count" -gt 0 ]; then
|
||
report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
|
||
report_content+="║ 🚨 CRITICAL ISSUES REQUIRING IMMEDIATE ATTENTION ║"$'\n'
|
||
report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
local critical_num=1
|
||
for finding in "${FINDINGS[@]}"; do
|
||
local severity=$(echo "$finding" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
|
||
if [ "$severity" = "CRITICAL" ]; then
|
||
local title=$(echo "$finding" | sed 's/^\[[^]]*\] //' | sed 's/@@@SEP@@@.*//')
|
||
report_content+=" $critical_num. $title"$'\n'
|
||
critical_num=$((critical_num + 1))
|
||
fi
|
||
done
|
||
report_content+=""$'\n'
|
||
report_content+=" ⚠️ SEE DETAILED FINDINGS BELOW FOR SPECIFIC ACTIONS TO TAKE"$'\n'
|
||
report_content+=""$'\n'
|
||
fi
|
||
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# Group findings by category
|
||
local -A categories
|
||
categories["DISK"]=""
|
||
categories["MEMORY"]=""
|
||
categories["CPU"]=""
|
||
categories["RAID"]=""
|
||
categories["OTHER"]=""
|
||
|
||
for finding in "${FINDINGS[@]}"; do
|
||
# Split by @@@SEP@@@ delimiter
|
||
local severity_title="${finding%%@@@SEP@@@*}"
|
||
local temp="${finding#*@@@SEP@@@}"
|
||
local details="${temp%%@@@SEP@@@*}"
|
||
local recommendation="${temp#*@@@SEP@@@}"
|
||
|
||
# Extract severity from [SEVERITY] Title format
|
||
local severity=$(echo "$severity_title" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
|
||
local title=$(echo "$severity_title" | sed 's/^\[[^]]*\] //')
|
||
|
||
local category="OTHER"
|
||
if [[ "$title" == *"Disk"* ]] || [[ "$title" == *"SMART"* ]] || [[ "$title" == *"I/O"* ]]; then
|
||
category="DISK"
|
||
elif [[ "$title" == *"Memory"* ]] || [[ "$title" == *"ECC"* ]]; then
|
||
category="MEMORY"
|
||
elif [[ "$title" == *"CPU"* ]] || [[ "$title" == *"MCE"* ]]; then
|
||
category="CPU"
|
||
elif [[ "$title" == *"RAID"* ]]; then
|
||
category="RAID"
|
||
fi
|
||
|
||
local entry=""
|
||
entry+="[$severity] $title"$'\n'
|
||
entry+="$details"$'\n'
|
||
if [ -n "$recommendation" ]; then
|
||
entry+="Recommendation:"$'\n'
|
||
entry+="$recommendation"$'\n'
|
||
fi
|
||
entry+=""$'\n'
|
||
entry+="------------------------------------------------------------------------------"$'\n'
|
||
entry+=""$'\n'
|
||
|
||
categories[$category]+="$entry"
|
||
done
|
||
|
||
# Output sections
|
||
if [ -n "${categories[DISK]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="DISK HEALTH & SMART STATUS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[DISK]}"
|
||
fi
|
||
|
||
if [ -n "${categories[MEMORY]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="MEMORY HEALTH"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[MEMORY]}"
|
||
fi
|
||
|
||
if [ -n "${categories[CPU]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="CPU HEALTH"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[CPU]}"
|
||
fi
|
||
|
||
if [ -n "${categories[RAID]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="RAID STATUS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[RAID]}"
|
||
fi
|
||
|
||
if [ -n "${categories[OTHER]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="OTHER HARDWARE FINDINGS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[OTHER]}"
|
||
fi
|
||
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="NEXT STEPS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Priority Actions:"$'\n'
|
||
report_content+=" 1. Address any CRITICAL issues immediately"$'\n'
|
||
report_content+=" 2. Monitor WARNING issues closely"$'\n'
|
||
report_content+=" 3. Schedule regular hardware health checks"$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Additional Analysis Available:"$'\n'
|
||
report_content+=" • System Health Check (Main Menu) for overall server health"$'\n'
|
||
report_content+=" • Disk I/O Analyzer (Main Menu → Performance) for disk performance"$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Report saved to: $REPORT_FILE"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
echo "$report_content"
|
||
echo "$report_content" > "$REPORT_FILE"
|
||
}
|
||
|
||
#######################################
# Main execution: runs every hardware diagnostic, prints the detailed
# report and an executive summary, then reports the overall severity.
# Globals:
#   FINDINGS        (read)    entries shaped "[SEVERITY] Title@@@SEP@@@..."
#   MESSAGES_CACHE  (written) temp copy of /var/log/messages for the checks
#   REPORT_FILE     (read)    path shown to the user
#   MAGENTA/CYAN/YELLOW/GREEN/RED/BOLD/NC (read) colour escape sequences
# Returns:
#   0 = healthy, 1 = at least one WARNING, 2 = at least one CRITICAL.
#   When the script file is executed directly the same code is used with
#   `exit`; when it is sourced, `return` hands control back to the caller.
#######################################
main() {
    show_banner
    echo -e "${MAGENTA}${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${MAGENTA}${BOLD}║ 🔧 HARDWARE HEALTH CHECK - Deep Analysis ║${NC}"
    echo -e "${MAGENTA}${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    # Detect virtualization FIRST (affects which checks to run)
    echo -e "${CYAN}[INFO]${NC} Detecting environment (physical vs virtual)..."
    detect_virtualization
    echo ""

    echo -e "${CYAN}Performing comprehensive hardware diagnostics...${NC}"
    echo -e "${CYAN}Checks: Disks (SMART/NVMe/Age), Memory (ECC), CPU (Thermal), RAID, Filesystem, Fans, PCI, Network, Kernel${NC}"
    echo ""

    # OPTIMIZATION: cache /var/log/messages once so the checks do not each
    # re-read it. A temp file is used instead of a variable to avoid
    # "Argument list too long" errors.
    echo -e "${CYAN}[INFO]${NC} Caching system logs for analysis..."
    # mktemp gives an unpredictable, owner-only file; the PID-based name is
    # kept purely as a fallback for systems where mktemp is unavailable.
    MESSAGES_CACHE=$(mktemp /tmp/hw_health_messages_cache_XXXXXX 2>/dev/null) \
        || MESSAGES_CACHE="/tmp/hw_health_messages_cache_$$.tmp"
    # Safety net for abnormal termination. Single quotes defer expansion to
    # exit time and keep the path quoted.
    trap 'rm -f -- "$MESSAGES_CACHE"' EXIT
    if [ -f /var/log/messages ]; then
        cat /var/log/messages 2>/dev/null > "$MESSAGES_CACHE"
    else
        : > "$MESSAGES_CACHE"
    fi

    # Run diagnostics with progress indicators. Each entry is
    # "function|progress message"; the counter is derived from the table so
    # the [n/total] labels cannot drift out of sync.
    # The diag_* names are deliberately distinctive: bash locals are visible
    # to called functions, so generic names could shadow globals they use.
    local -a diag_steps=(
        "check_disk_smart|Analyzing disk SMART status and predictive failure indicators..."
        "check_memory_health|Checking memory health (ECC errors, OOM events, swap usage)..."
        "check_cpu_health|Monitoring CPU health (temperature, throttling, MCE errors)..."
        "check_hardware_errors|Scanning system hardware error logs..."
        "check_raid_status|Verifying RAID array status..."
        "check_disk_io_errors|Analyzing disk I/O errors..."
        "check_filesystem_errors|Checking for filesystem errors..."
        "check_system_fans|Monitoring system fans..."
        "check_pci_errors|Checking for PCI/PCIe errors..."
        "check_network_errors|Checking network interface errors..."
        "check_kernel_parameters|Validating kernel parameters..."
    )
    local diag_total=${#diag_steps[@]}
    local diag_index=0
    local diag_step
    for diag_step in "${diag_steps[@]}"; do
        diag_index=$((diag_index + 1))
        echo -e "${YELLOW}[${diag_index}/${diag_total}]${NC} ${diag_step#*|}"
        "${diag_step%%|*}"
    done

    echo ""
    echo -e "${GREEN}[✓]${NC} Hardware diagnostics complete!"
    echo ""

    # Generate and display report
    echo -e "${CYAN}Generating detailed report...${NC}"
    echo ""
    generate_report

    # The checks are finished, so drop the cached log copy now instead of
    # leaving it around until the (possibly long-lived) calling shell exits.
    rm -f -- "$MESSAGES_CACHE"

    # EXECUTIVE SUMMARY - Quick status overview
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║ EXECUTIVE SUMMARY - Component Status ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    # Per-component indicator; "other" collects findings whose title matches
    # none of the component keywords so they are never silently dropped.
    local -A component_status=(
        [disk]="✅" [memory]="✅" [cpu]="✅" [raid]="✅" [fs]="✅"
        [fan]="✅" [pci]="✅" [network]="✅" [kernel]="✅" [other]="✅"
    )
    local overall="HEALTHY"
    local critical_count=0
    local warning_count=0
    local finding severity_title severity title component
    local severity_re='^\[([^]]*)\]'
    local prefix_re='^\[[^]]*\] (.*)$'

    for finding in "${FINDINGS[@]}"; do
        # Header is everything before the first @@@SEP@@@ delimiter.
        severity_title="${finding%%@@@SEP@@@*}"

        severity=""
        if [[ "$severity_title" =~ $severity_re ]]; then
            severity="${BASH_REMATCH[1]}"
        fi
        title="$severity_title"
        if [[ "$severity_title" =~ $prefix_re ]]; then
            title="${BASH_REMATCH[1]}"
        fi

        # Categorize by component (first matching pattern wins)
        case "$title" in
            *Disk*|*SMART*|*DRIVE*)          component="disk" ;;
            *Memory*|*ECC*|*RAM*)            component="memory" ;;
            *CPU*|*thermal*|*temperature*)   component="cpu" ;;
            *RAID*)                          component="raid" ;;
            *Filesystem*|*read-only*)        component="fs" ;;
            *Fan*|*fan*)                     component="fan" ;;
            *PCI*|*PCIe*)                    component="pci" ;;
            *Network*|*Interface*)           component="network" ;;
            *Kernel*|*Parameter*)            component="kernel" ;;
            *)                               component="other" ;;
        esac

        # Count and escalate; a component already marked critical is never
        # downgraded by a later warning.
        case "$severity" in
            CRITICAL)
                critical_count=$((critical_count + 1))
                component_status[$component]="🔴"
                ;;
            WARNING)
                warning_count=$((warning_count + 1))
                if [ "${component_status[$component]}" != "🔴" ]; then
                    component_status[$component]="🟡"
                fi
                ;;
        esac
    done

    # Overall status follows the severity counts, so an uncategorised
    # CRITICAL/WARNING finding still affects the summary and the exit code.
    if [ "$critical_count" -gt 0 ]; then
        overall="CRITICAL"
    elif [ "$warning_count" -gt 0 ]; then
        overall="WARNING"
    fi

    # Display component summary
    echo -e " Disks/Storage: ${component_status[disk]} Memory: ${component_status[memory]} CPU: ${component_status[cpu]} RAID: ${component_status[raid]}"
    echo -e " Filesystem: ${component_status[fs]} Fans: ${component_status[fan]} PCI/PCIe: ${component_status[pci]}"
    echo -e " Network: ${component_status[network]} Kernel: ${component_status[kernel]} Other: ${component_status[other]}"
    echo ""

    # Overall status
    if [ "$overall" = "CRITICAL" ]; then
        echo -e " ${RED}${BOLD}Overall Status: 🔴 CRITICAL - $critical_count issue(s) require IMMEDIATE action!${NC}"
    elif [ "$overall" = "WARNING" ]; then
        echo -e " ${YELLOW}${BOLD}Overall Status: 🟡 WARNING - $warning_count issue(s) detected${NC}"
    else
        echo -e " ${GREEN}${BOLD}Overall Status: ✅ HEALTHY - All systems operating normally${NC}"
    fi

    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""
    echo -e "${CYAN}Full report saved to:${NC} ${BOLD}$REPORT_FILE${NC}"
    echo ""

    press_enter

    # Severity-based status code for monitoring system integration.
    local status_code=0
    case "$overall" in
        CRITICAL) status_code=2 ;;
        WARNING)  status_code=1 ;;
    esac

    # Executed directly: terminate with the code so cron/monitoring sees it.
    if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
        exit "$status_code"
    fi
    # Sourced: return instead, so the calling shell (e.g. a menu) keeps
    # running and can still inspect $?.
    return "$status_code"
}
|
||
|
||
# Entry point: main is invoked whether this file is executed directly or
# sourced by another script. No context check is needed here because main
# itself chooses between `exit` (direct execution) and `return` (sourced)
# when it reports the severity code.
main
|