e8fae7f7ae
Added validation checks for potentially empty variables before use to prevent errors and unsafe operations. WordPress Cron Manager (5 fixes): - Added site_path validation after dirname operations - Prevents using empty paths in cd commands and file operations - Pattern: Check [ -z "$site_path" ] before use Bot Analyzer: - Quoted TEMP_DIR in trap command for safety Hardware Health Check: - Quoted MESSAGES_CACHE in trap command for safety Note: 5 issues flagged in toolkit-qa-check.sh were false positives (echo statements demonstrating bad patterns, not actual code issues) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1864 lines
78 KiB
Bash
Executable File
1864 lines
78 KiB
Bash
Executable File
#!/bin/bash

# Hardware Health Check
# Comprehensive hardware diagnostics including SMART, memory, CPU, and sensors

# Get the script's directory
# TOOLKIT_ROOT is resolved as the directory two levels above this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TOOLKIT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Source required libraries
source "$TOOLKIT_ROOT/lib/common-functions.sh"
source "$TOOLKIT_ROOT/lib/system-detect.sh"
source "$TOOLKIT_ROOT/lib/reference-db.sh"

# Initialize system detection
# NOTE(review): detect_system is not defined in this file; presumably it comes
# from lib/system-detect.sh - confirm.
detect_system

# Load system info from reference database
# Records are pipe-delimited lines of the form SYS|<KEY>|<field3>|<field4>.
# The SYS_* variables are only assigned here when the .sysref file exists.
if [ -f "$TOOLKIT_ROOT/.sysref" ]; then
    SYS_HOSTNAME=$(grep "^SYS|HOSTNAME|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
    SYS_PANEL=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
    SYS_PANEL_VER=$(grep "^SYS|CONTROL_PANEL|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4)
    SYS_OS=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f3)
    SYS_OS_VER=$(grep "^SYS|OS|" "$TOOLKIT_ROOT/.sysref" 2>/dev/null | cut -d'|' -f4)
fi

# Color definitions
# Stored as literal backslash escapes; they are interpreted by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Report file
# The name carries a timestamp taken when this line runs.
REPORT_FILE="/tmp/hardware_health_report_$(date +%Y%m%d_%H%M%S).txt"

# Analysis results storage
# Each element is one record appended by add_finding.
declare -a FINDINGS=()

# Function to add finding
#
# Appends one record to the global FINDINGS array in the form
#   "[severity] title@@@SEP@@@details@@@SEP@@@recommendation"
#
# Arguments:
#   $1 - severity label (required, non-empty)
#   $2 - title (required, non-empty)
#   $3 - details text (optional, may span several lines)
#   $4 - recommendation text (optional, may span several lines)
# Returns:
#   1 (FINDINGS untouched) when severity or title is empty, 0 otherwise.
add_finding() {
    # Explicit `if` instead of chaining `||` and `&&`, which have equal
    # precedence in the shell and are easy to misread.
    if [ -z "$1" ] || [ -z "$2" ]; then
        return 1
    fi

    local severity="$1"
    local title="$2"
    local details="$3"
    local recommendation="$4"

    # Use @@@SEP@@@ as separator to avoid conflicts with content
    FINDINGS+=("[$severity] $title@@@SEP@@@$details@@@SEP@@@$recommendation")
}

# Function to check if command exists
#
# Arguments:
#   $1 - command name to look up
# Returns:
#   0 when `command -v` resolves the name, non-zero otherwise
#   (1 when the name is empty). Produces no output.
command_exists() {
    if [ -z "$1" ]; then
        return 1
    fi
    command -v "$1" &>/dev/null
}

# Global variables for virtualization detection
# Defaults describe a physical host; detect_virtualization overwrites both.
IS_VIRTUAL=false
VIRT_TYPE="physical"

# Function to detect virtualization
#
# Sets the globals IS_VIRTUAL (true/false) and VIRT_TYPE, then records one
# INFO finding describing which hardware checks apply.
#
# Detection order:
#   1. systemd-detect-virt - any non-empty answer is authoritative
#      ("none" means physical).
#   2. dmidecode system-product-name - used when systemd-detect-virt is
#      missing or printed nothing.
# Globals:   IS_VIRTUAL, VIRT_TYPE (written)
# Calls:     command_exists, add_finding
detect_virtualization() {
    IS_VIRTUAL=false
    VIRT_TYPE="physical"

    local detected=""
    local product=""
    local vm_pattern="kvm|qemu|vmware|virtualbox|xen|hyperv"

    # Try systemd-detect-virt first (most reliable)
    if command_exists systemd-detect-virt; then
        # Its exit status is non-zero for "none", so only the text is used.
        detected=$(systemd-detect-virt 2>/dev/null)
    fi

    if [ -n "$detected" ]; then
        if [ "$detected" != "none" ]; then
            IS_VIRTUAL=true
            VIRT_TYPE="$detected"
        fi
    # Fallback: check dmidecode
    elif command_exists dmidecode; then
        product=$(dmidecode -s system-product-name 2>/dev/null)
        if echo "$product" | grep -qiE "$vm_pattern"; then
            IS_VIRTUAL=true
            VIRT_TYPE=$(echo "$product" | grep -oiE "$vm_pattern" | head -1)
        fi
    fi

    # Add finding if virtual
    if [ "$IS_VIRTUAL" = true ]; then
        add_finding "INFO" "ℹ️ Virtual Machine Detected" \
            "Environment: $VIRT_TYPE
Hardware checks adapted for virtual machine:
• SMART disk checks: SKIPPED (VMs use virtual disks)
• Fan monitoring: SKIPPED (hypervisor controls physical fans)
• Some sensors: SKIPPED (not accessible in VM)
• Memory/CPU/Network checks: ACTIVE (VM-compatible)" \
            "This is normal for virtual machines. Hardware monitoring is limited to VM-accessible components."
    else
        add_finding "INFO" "ℹ️ Physical Server Detected" \
            "Environment: Physical hardware
All hardware health checks will be performed:
• SMART disk monitoring
• Fan speed monitoring
• Temperature sensors
• Memory ECC errors
• CPU thermal monitoring
• Network interface errors
• Kernel parameters" \
            "Full hardware monitoring enabled for physical server."
    fi
}

# Function to check SMART status with deep analysis

# Helper: print the value of the first "<label>: value" line in the
# `smartctl -i` text ($1) whose line matches the case-insensitive ERE ($2).
# Whitespace is trimmed and collapsed; prints nothing when no line matches.
_smart_info_field() {
    printf '%s\n' "$1" | grep -iE "$2" | head -1 \
        | sed -E 's/^[^:]*:[[:space:]]*//; s/[[:space:]]+/ /g; s/ $//'
}

# Helper: print the leading integer of RAW_VALUE (column 10) from the first
# line of the `smartctl -A` text ($1) matching the pattern ($2). Prints
# nothing when the attribute is absent or its raw value is not numeric, so
# callers can safely feed the result to `[ -gt ]`.
_smart_attr_raw() {
    local raw
    raw=$(printf '%s\n' "$1" | awk -v pat="$2" '$0 ~ pat { print $10; exit }')
    # Raw values such as "35 (Min/Max 20/45)" or "1234h+05m" keep only digits.
    printf '%s' "${raw%%[!0-9]*}"
}

# Helper: print the digits of an NVMe health-log percentage. $1 is the
# `smartctl -A` text, $2 the exact label before the colon (exact match keeps
# "Available Spare" from also picking up "Available Spare Threshold").
_nvme_health_pct() {
    printf '%s\n' "$1" \
        | awk -F: -v label="$2" '$1 == label { gsub(/[^0-9]/, "", $2); print $2; exit }'
}

# Helper: print the more severe of two risk levels ($1 current, $2 candidate).
# Order: NONE < MODERATE < HIGH < IMMINENT. Used so that a later check can
# never downgrade a risk raised by an earlier one.
_disk_risk_max() {
    local level rank_current=0 rank_candidate=0 idx=0
    for level in NONE MODERATE HIGH IMMINENT; do
        [ "$1" = "$level" ] && rank_current=$idx
        [ "$2" = "$level" ] && rank_candidate=$idx
        idx=$((idx + 1))
    done
    if [ "$rank_candidate" -gt "$rank_current" ]; then
        printf '%s' "$2"
    else
        printf '%s' "$1"
    fi
}

# check_disk_smart
#
# Walks every block device lsblk reports with TYPE "disk", skips devices
# for which SMART is not meaningful (inaccessible, hardware RAID volumes,
# virtual disks, md/dm devices, loop/ram devices, no attribute table), and
# for the rest records one finding per disk plus a summary finding.
#
# Globals read:  IS_VIRTUAL, VIRT_TYPE, CYAN, NC
#                HW_HEALTH_SYSLOG (optional; log file scanned for I/O
#                errors, default /var/log/messages)
# Calls:         command_exists, add_finding, smartctl, lsblk
# Returns:       0 when skipped on a virtual machine; otherwise the status
#                of the last command executed.
check_disk_smart() {
    # Skip SMART checks on virtual machines (VMs use virtual disks)
    if [ "$IS_VIRTUAL" = true ]; then
        echo -e "${CYAN}[INFO]${NC} Skipping SMART checks (virtual machine - $VIRT_TYPE)"
        return 0
    fi

    echo -e "${CYAN}[INFO]${NC} Checking disk SMART status..."

    if ! command_exists smartctl; then
        add_finding "INFO" "SMART Tools Not Installed" \
            "smartmontools is not installed - cannot check disk health" \
            "Install SMART tools: yum install smartmontools
After installing, run: systemctl enable smartd && systemctl start smartd"
        return
    fi

    # System log scanned for per-disk I/O errors (overridable for other
    # distributions or for testing).
    local syslog="${HW_HEALTH_SYSLOG:-/var/log/messages}"

    # Find all disks
    local disks
    disks=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1}')

    if [ -z "$disks" ]; then
        add_finding "WARNING" "No Disks Found" \
            "Could not detect any disk devices" \
            "Check system configuration: lsblk -a"
        return
    fi

    local disk_count=0
    local healthy_count=0
    local warning_count=0
    local failed_count=0
    local skipped_count=0
    local skipped_raid=0
    local skipped_virtual=0
    local skipped_lvm=0
    local skipped_other=0

    local disk device_info model vendor device_type serial full_id
    local raid_type tools virt_type specific_model
    local health smart_data reallocated pending uncorrectable temp power_on
    local disk_age_years age_warning nvme_wear nvme_spare percent_used avail_spare
    local disk_name io_errors recent_io_samples failure_risk risk_factors

    # Device paths from lsblk contain no whitespace, so word-splitting is safe.
    for disk in $disks; do
        disk_count=$((disk_count + 1))

        # Reset all per-disk state before any check can touch it, so nothing
        # leaks from the previous disk and early escalations are not lost.
        failure_risk="NONE"
        risk_factors=""
        age_warning=""
        nvme_wear=""
        nvme_spare=""
        recent_io_samples=""
        disk_age_years=0

        # Get device info to determine if SMART is applicable
        device_info=$(smartctl -i "$disk" 2>&1)

        # COMPREHENSIVE DEVICE DETECTION - Works for ALL storage types

        # 1. CHECK: Device exists and smartctl can communicate
        if echo "$device_info" | grep -qiE "No such device|open device.*failed|Permission denied"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (device not accessible)"
            skipped_count=$((skipped_count + 1))
            skipped_other=$((skipped_other + 1))
            continue
        fi

        # 2. CHECK: SMART support availability
        if echo "$device_info" | grep -qiE "SMART support is:.*Unavailable|SMART support is:.*Disabled|Unable to detect device type"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (SMART not supported or disabled)"
            skipped_count=$((skipped_count + 1))
            skipped_other=$((skipped_other + 1))
            continue
        fi

        # 3. EXTRACT: Device type, model, vendor for intelligent detection
        model=$(_smart_info_field "$device_info" "Device Model:|Product:|Model Number:|Model Family:")
        vendor=$(_smart_info_field "$device_info" "Vendor:")
        device_type=$(_smart_info_field "$device_info" "Device type:")
        serial=$(_smart_info_field "$device_info" "Serial Number:")

        # Combine model and vendor for comprehensive matching
        full_id="${vendor} ${model} ${device_type}"

        # 4. DETECT: Hardware RAID Controllers (all major brands)
        # These devices are RAID controller logical volumes, not physical disks
        if echo "$full_id" | grep -qiE "MR[0-9]{4}|LSI|MegaRAID|PERC|Avago|Broadcom.*RAID|HPE.*Smart Array|HP.*Smart Array|Dell.*RAID|IBM.*ServeRAID|Adaptec|3ware|Areca|HighPoint|Promise.*RAID"; then
            raid_type="Hardware RAID Controller"
            tools="Unknown RAID tools"

            # Identify specific RAID type and provide exact tools
            if echo "$full_id" | grep -qiE "MR[0-9]{4}|MegaRAID"; then
                raid_type="MegaRAID Controller"
                tools="megacli -LDInfo -Lall -aALL or storcli /c0 /vall show all"
            elif echo "$full_id" | grep -qiE "LSI|Avago|Broadcom"; then
                raid_type="LSI/Broadcom RAID Controller"
                tools="sas2ircu LIST or storcli show"
            elif echo "$full_id" | grep -qiE "PERC|Dell"; then
                raid_type="Dell PERC RAID Controller"
                tools="perccli /c0 /vall show all or omreport storage vdisk"
            elif echo "$full_id" | grep -qiE "HP|HPE.*Smart"; then
                raid_type="HP Smart Array Controller"
                tools="hpacucli ctrl all show config or ssacli ctrl all show config"
            elif echo "$full_id" | grep -qiE "Adaptec"; then
                raid_type="Adaptec RAID Controller"
                tools="arcconf getconfig 1"
            elif echo "$full_id" | grep -qiE "3ware"; then
                raid_type="3ware RAID Controller"
                tools="tw_cli info c0"
            fi

            echo -e "${CYAN}[INFO]${NC} Skipping $disk ($raid_type: $model)"
            skipped_count=$((skipped_count + 1))
            skipped_raid=$((skipped_raid + 1))
            add_finding "INFO" "ℹ️ $raid_type Detected: $disk" \
                "Device: $disk
Controller: $model
Type: $raid_type
SMART Status: Not applicable (logical volume from RAID controller)

This is a logical volume presented by a hardware RAID controller.
SMART data is not available for these devices - the controller manages
the physical disks and presents them as a single logical volume.

To monitor RAID health, use controller-specific tools:
Command: $tools

Physical disk health is monitored by the RAID controller itself.
Check controller logs and status for drive failures." \
                "Monitor RAID array health using controller tools, not SMART"
            continue
        fi

        # 5. DETECT: Virtual/Emulated Devices (VMs and containers)
        if echo "$full_id" | grep -qiE "QEMU|VirtIO|Virtual|VMware|Xen.*Virtual|Msft.*Virtual|Google.*Persistent|Amazon.*Elastic"; then
            virt_type="Virtual Disk"

            if echo "$full_id" | grep -qiE "QEMU"; then
                virt_type="QEMU Virtual Disk (KVM)"
            elif echo "$full_id" | grep -qiE "VMware"; then
                virt_type="VMware Virtual Disk"
            elif echo "$full_id" | grep -qiE "VirtIO"; then
                virt_type="VirtIO Virtual Disk"
            elif echo "$full_id" | grep -qiE "Msft.*Virtual"; then
                virt_type="Hyper-V Virtual Disk"
            elif echo "$full_id" | grep -qiE "Xen"; then
                virt_type="Xen Virtual Disk"
            elif echo "$full_id" | grep -qiE "Google"; then
                virt_type="Google Persistent Disk"
            elif echo "$full_id" | grep -qiE "Amazon"; then
                virt_type="AWS EBS Volume"
            fi

            echo -e "${CYAN}[INFO]${NC} Skipping $disk ($virt_type)"
            skipped_count=$((skipped_count + 1))
            skipped_virtual=$((skipped_virtual + 1))
            # Already handled by VM detection at start of function
            continue
        fi

        # 6. DETECT: Software RAID / LVM / Device Mapper
        if echo "$disk" | grep -qE "md[0-9]|dm-[0-9]"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (software RAID/LVM device)"
            skipped_count=$((skipped_count + 1))
            skipped_lvm=$((skipped_lvm + 1))
            add_finding "INFO" "ℹ️ Software RAID/LVM Detected: $disk" \
                "Device: $disk
Type: Software RAID or LVM logical volume

This is a logical device managed by the kernel (mdadm or LVM).
SMART monitoring should be performed on the underlying physical disks.

For software RAID (md devices):
• Check RAID status: cat /proc/mdstat
• Monitor physical disks: smartctl -a /dev/sd[X]

For LVM (dm- devices):
• Check LV status: lvdisplay
• Monitor physical volumes: pvdisplay
• Check underlying disks: smartctl -a /dev/sd[X]" \
                "Monitor underlying physical disks, not the logical volume"
            continue
        fi

        # 7. DETECT: Loop devices, RAM disks, other special devices
        if echo "$disk" | grep -qE "loop|ram|zram|nbd"; then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (special device: loop/ram/network)"
            skipped_count=$((skipped_count + 1))
            skipped_other=$((skipped_other + 1))
            continue
        fi

        # 8. FINAL CHECK: Is this a real disk with SMART data?
        # Try to get SMART attributes - if this fails, skip.
        # The output is kept so the attribute table is fetched only once.
        if ! smart_data=$(smartctl -A "$disk" 2>/dev/null); then
            echo -e "${CYAN}[INFO]${NC} Skipping $disk (no SMART attributes available)"
            skipped_count=$((skipped_count + 1))
            skipped_other=$((skipped_other + 1))
            add_finding "INFO" "ℹ️ Device Without SMART: $disk" \
                "Device: $disk
Model: ${model:-Unknown}

This device does not provide SMART attributes.
Common reasons:
• USB-connected drives (SMART data not passed through)
• Some hardware RAID configurations
• Older drives without SMART support
• Passthrough issues in virtual environments

If this is a critical disk, verify health through other means:
• Check dmesg for errors: dmesg | grep -i '$disk'
• Monitor I/O errors: iostat -x $disk
• Check filesystem errors: mount | grep $disk" \
                "Monitor through system logs and I/O statistics"
            continue
        fi

        # Get SMART health status
        health=$(smartctl -H "$disk" 2>/dev/null | grep -i "SMART overall-health" | awk '{print $NF}')

        # Get disk model for the report: prefer the exact device model, then
        # the SCSI product string; otherwise keep the value parsed in step 3
        # (which also covers the NVMe "Model Number" line).
        specific_model=$(_smart_info_field "$device_info" "Device Model:")
        [ -z "$specific_model" ] && specific_model=$(_smart_info_field "$device_info" "Product:")
        [ -n "$specific_model" ] && model="$specific_model"

        # Get key SMART attributes with deep parsing
        reallocated=$(_smart_attr_raw "$smart_data" "Reallocated_Sector")
        pending=$(_smart_attr_raw "$smart_data" "Current_Pending_Sector")
        uncorrectable=$(_smart_attr_raw "$smart_data" "Offline_Uncorrectable")
        temp=$(_smart_attr_raw "$smart_data" "Temperature_Celsius")
        power_on=$(_smart_attr_raw "$smart_data" "Power_On_Hours")

        # DISK AGE ANALYSIS
        if [ -n "$power_on" ] && [ "$power_on" -gt 0 ]; then
            disk_age_years=$((power_on / 8760)) # 8760 hours per year
            if [ "$disk_age_years" -ge 5 ]; then
                age_warning="⚠️ DISK AGE: $disk_age_years years old (REPLACE - expected lifespan: 3-5 years)"
                failure_risk=$(_disk_risk_max "$failure_risk" "MODERATE")
            elif [ "$disk_age_years" -ge 3 ]; then
                age_warning="ℹ️ DISK AGE: $disk_age_years years old (consider replacement soon)"
            fi
        fi

        # NVMe-SPECIFIC HEALTH (if NVMe drive)
        if [[ "$disk" == *"nvme"* ]]; then
            # Percentage Used (wear indicator)
            percent_used=$(_nvme_health_pct "$smart_data" "Percentage Used")
            if [ -n "$percent_used" ] && [ "$percent_used" -gt 90 ]; then
                nvme_wear="⚠️ NVMe WEAR: ${percent_used}% used (CRITICAL - near end of life!)"
                failure_risk=$(_disk_risk_max "$failure_risk" "HIGH")
            elif [ -n "$percent_used" ] && [ "$percent_used" -gt 80 ]; then
                nvme_wear="⚠️ NVMe WEAR: ${percent_used}% used (high wear - monitor closely)"
                failure_risk=$(_disk_risk_max "$failure_risk" "MODERATE")
            fi

            # Available Spare
            avail_spare=$(_nvme_health_pct "$smart_data" "Available Spare")
            if [ -n "$avail_spare" ] && [ "$avail_spare" -lt 10 ]; then
                nvme_spare="⚠️ NVMe SPARE: ${avail_spare}% available spare (CRITICAL!)"
                failure_risk=$(_disk_risk_max "$failure_risk" "HIGH")
            fi
        fi

        # Check for I/O errors in system logs (last 7 days)
        # NOTE(review): the whole log file is scanned; the 7-day window holds
        # only if the log is rotated weekly - confirm.
        disk_name=$(basename "$disk")
        io_errors=$(grep -i "$disk_name.*error\|$disk_name.*failed\|ata.*$disk_name" "$syslog" 2>/dev/null | wc -l)
        if [ "$io_errors" -gt 0 ]; then
            recent_io_samples=$(grep -i "$disk_name.*error\|$disk_name.*failed" "$syslog" 2>/dev/null | tail -3 | sed 's/^/ /')
        fi

        # PREDICTIVE FAILURE ANALYSIS - Make critical issues OBVIOUS

        # CRITICAL: Immediate failure indicators
        if [ -n "$reallocated" ] && [ "$reallocated" -gt 50 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "IMMINENT")
            risk_factors+="⚠️ CRITICAL: $reallocated reallocated sectors (DRIVE FAILING SOON!)"$'\n'
        elif [ -n "$reallocated" ] && [ "$reallocated" -gt 10 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "HIGH")
            risk_factors+="⚠️ HIGH: $reallocated reallocated sectors (failure risk increasing)"$'\n'
        elif [ -n "$reallocated" ] && [ "$reallocated" -gt 0 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "MODERATE")
            risk_factors+="⚠️ MODERATE: $reallocated reallocated sectors detected"$'\n'
        fi

        if [ -n "$pending" ] && [ "$pending" -gt 10 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "IMMINENT")
            risk_factors+="⚠️ CRITICAL: $pending pending sectors (READ/WRITE FAILURES!)"$'\n'
        elif [ -n "$pending" ] && [ "$pending" -gt 0 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "MODERATE")
            risk_factors+="⚠️ MODERATE: $pending pending sectors"$'\n'
        fi

        if [ -n "$uncorrectable" ] && [ "$uncorrectable" -gt 0 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "HIGH")
            risk_factors+="⚠️ HIGH: $uncorrectable uncorrectable sectors (data loss possible)"$'\n'
        fi

        # Temperature warnings
        if [ -n "$temp" ] && [ "$temp" -gt 55 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "MODERATE")
            risk_factors+="⚠️ Temperature: ${temp}°C (OVERHEATING - threshold: 50°C)"$'\n'
        elif [ -n "$temp" ] && [ "$temp" -gt 50 ]; then
            risk_factors+="⚠️ Temperature: ${temp}°C (above recommended 50°C)"$'\n'
        fi

        # I/O errors from logs
        if [ "$io_errors" -gt 50 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "HIGH")
            risk_factors+="⚠️ HIGH: $io_errors I/O errors in last 7 days (hardware problem!)"$'\n'
        elif [ "$io_errors" -gt 10 ]; then
            failure_risk=$(_disk_risk_max "$failure_risk" "MODERATE")
            risk_factors+="⚠️ MODERATE: $io_errors I/O errors in last 7 days"$'\n'
        fi

        # Add disk age warning to risk factors
        [ -n "$age_warning" ] && risk_factors+="$age_warning"$'\n'

        # Add NVMe-specific warnings to risk factors
        [ -n "$nvme_wear" ] && risk_factors+="$nvme_wear"$'\n'
        [ -n "$nvme_spare" ] && risk_factors+="$nvme_spare"$'\n'

        # Determine severity and report
        # Be SMART about health status - only flag if explicitly FAILED
        if [[ "$health" =~ FAILED ]]; then
            # SMART health check explicitly FAILED
            failed_count=$((failed_count + 1))
            add_finding "CRITICAL" "🔴 DISK FAILURE: $disk - REPLACE IMMEDIATELY" \
                "Device: $disk
Model: $model
Serial: $serial
Health: FAILED ❌

SMART Status: FAILED
Reallocated Sectors: ${reallocated:-N/A}
Pending Sectors: ${pending:-N/A}
Uncorrectable Sectors: ${uncorrectable:-N/A}
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}

Recent I/O Errors (last 7 days): $io_errors
${recent_io_samples:+Recent errors from $syslog:
$recent_io_samples}" \
                "🚨 IMMEDIATE ACTION REQUIRED - DISK FAILING:
1. BACKUP ALL DATA IMMEDIATELY (drive may fail at any moment)
2. Order replacement disk NOW
3. Plan maintenance window for replacement
4. Review SMART details: smartctl -a $disk
5. Check logs: grep -i '${disk_name}' $syslog
6. If RAID: Verify array status and prepare for rebuild"

        elif [ "$failure_risk" = "IMMINENT" ]; then
            # Predictive: Drive will fail SOON
            failed_count=$((failed_count + 1))
            add_finding "CRITICAL" "🔴 DRIVE FAILING SOON: $disk - REPLACE URGENTLY" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health (but critical attributes detected)

⚠️ FAILURE RISK: IMMINENT - Drive will likely fail within days/weeks

Critical Issues:
$risk_factors
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors
${recent_io_samples:+Recent errors from $syslog:
$recent_io_samples}" \
                "🚨 URGENT - DRIVE REPLACEMENT REQUIRED:
1. Order replacement disk immediately
2. Ensure backups are current and verified
3. Plan replacement within 1-2 weeks (sooner if possible)
4. Monitor daily: smartctl -A $disk
5. Watch for increasing errors: grep -i '${disk_name}' $syslog
6. Do NOT wait for complete failure - replace proactively"

        elif [ "$failure_risk" = "HIGH" ]; then
            # High risk of failure
            warning_count=$((warning_count + 1))
            add_finding "WARNING" "🟡 HIGH FAILURE RISK: $disk - Plan Replacement" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health

⚠️ FAILURE RISK: HIGH - Replacement recommended

Risk Factors:
$risk_factors
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors" \
                "⚠️ PLAN DISK REPLACEMENT:
• Order spare disk as precaution
• Monitor weekly: smartctl -A $disk
• Watch for deterioration in attributes
• Ensure backups are current
• Check logs regularly: grep -i '${disk_name}' $syslog"

        elif [ "$failure_risk" = "MODERATE" ]; then
            # Moderate risk - monitor closely
            warning_count=$((warning_count + 1))
            add_finding "WARNING" "🟡 Disk $disk: Warning Signs Detected" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health

⚠️ FAILURE RISK: MODERATE - Monitor closely

Warning Signs:
$risk_factors
Temperature: ${temp:-N/A}°C
Power On Hours: ${power_on:-N/A}
Recent I/O Errors (last 7 days): $io_errors" \
                "Monitor this disk closely:
• Check SMART weekly: smartctl -A $disk
• Watch for increasing reallocated/pending sectors
• Monitor system logs: grep -i '${disk_name}' $syslog
• Ensure backups are current"

        else
            # Disk is healthy
            healthy_count=$((healthy_count + 1))
            add_finding "INFO" "✅ Disk $disk: Healthy" \
                "Device: $disk
Model: $model
Serial: $serial
Health: $health ✅

SMART Attributes:
Reallocated Sectors: ${reallocated:-0}
Pending Sectors: ${pending:-0}
Uncorrectable Sectors: ${uncorrectable:-0}
Temperature: ${temp:-N/A}°C (optimal: <50°C)
Power On Hours: ${power_on:-N/A}
I/O Errors (7 days): $io_errors" \
                "Disk is healthy - continue regular monitoring
• Monthly SMART check recommended: smartctl -A $disk"
        fi
    done

    # Summary finding with skip breakdown
    local summary_details="Total devices found: $disk_count
Physical disks monitored: $healthy_count healthy, $warning_count warning, $failed_count failed"

    if [ "$skipped_count" -gt 0 ]; then
        summary_details="${summary_details}
Devices skipped (SMART not applicable): $skipped_count"
        if [ "$skipped_raid" -gt 0 ]; then
            summary_details="${summary_details}
• Hardware RAID controllers: $skipped_raid (use vendor tools)"
        fi
        if [ "$skipped_lvm" -gt 0 ]; then
            summary_details="${summary_details}
• Software RAID/LVM: $skipped_lvm (monitor underlying disks)"
        fi
        if [ "$skipped_virtual" -gt 0 ]; then
            summary_details="${summary_details}
• Virtual/cloud disks: $skipped_virtual (managed by hypervisor)"
        fi
        if [ "$skipped_other" -gt 0 ]; then
            summary_details="${summary_details}
• Other (USB/special): $skipped_other (see findings for details)"
        fi
    fi

    add_finding "INFO" "Disk Health Summary" \
        "$summary_details" \
        "Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
}

# Function to check memory health with ECC error detection
#
# Collects DIMM inventory from dmidecode, usage from free, and error/OOM
# counts from the system log, then records exactly one finding:
#   CRITICAL - uncorrectable ECC errors, or more than 100 correctable ones
#   WARNING  - more than 20 correctable errors, more than 10 OOM events,
#              or swap more than 80% used
#   INFO     - minor issues noted (some events counted, below the WARNING
#              thresholds) or no issues at all
#
# Globals read:  CYAN, NC
#                HW_HEALTH_SYSLOG (optional; log file to scan,
#                default /var/log/messages)
# Calls:         command_exists, add_finding, dmidecode, free
check_memory_health() {
    echo -e "${CYAN}[INFO]${NC} Checking memory health..."

    if ! command_exists dmidecode; then
        add_finding "INFO" "dmidecode Not Available" \
            "dmidecode is not installed - cannot check memory details" \
            "Install dmidecode: yum install dmidecode"
        return
    fi

    # System log scanned for memory errors (overridable for other
    # distributions or for testing).
    local syslog="${HW_HEALTH_SYSLOG:-/var/log/messages}"

    # Get memory information (dmidecode is queried once and reused)
    local dmi_memory
    dmi_memory=$(dmidecode -t memory 2>/dev/null)

    local total_slots populated_slots
    total_slots=$(printf '%s\n' "$dmi_memory" | grep -c "Memory Device$")
    # Anchor on the bare "Size:" field so related fields such as
    # "Volatile Size:" are not counted as extra modules.
    populated_slots=$(printf '%s\n' "$dmi_memory" | grep -E "^[[:space:]]*Size:" | grep -cv "No Module Installed")

    # Get total memory
    local free_human
    free_human=$(free -h)
    local total_mem used_mem available_mem
    total_mem=$(printf '%s\n' "$free_human" | grep "Mem:" | awk '{print $2}')
    used_mem=$(printf '%s\n' "$free_human" | grep "Mem:" | awk '{print $3}')
    available_mem=$(printf '%s\n' "$free_human" | grep "Mem:" | awk '{print $7}')

    # Check for ECC
    local ecc_line ecc_type ecc_label="No"
    ecc_line=$(printf '%s\n' "$dmi_memory" | grep "Error Correction Type" | head -1)
    ecc_type=$(printf '%s\n' "$ecc_line" | sed 's/.*Error Correction Type:[ ]*//')
    if [ -n "$ecc_line" ] && [[ "$ecc_line" != *None* ]]; then
        ecc_label="Yes ($ecc_type)"
    fi

    # Check hardware errors in system log (last 7 days)
    # NOTE(review): the whole log file is scanned; the 7-day window holds
    # only if the log is rotated weekly - confirm.
    local hw_mem_errors
    hw_mem_errors=$(grep -i "memory.*error\|ecc.*error\|edac.*error" "$syslog" 2>/dev/null | wc -l)

    # Check for specific ECC error types
    # "uncorrectable ... ecc" also contains "correctable ... ecc", so those
    # lines are excluded from the correctable count.
    local single_bit_errors multi_bit_errors
    single_bit_errors=$(grep -i "single.*bit.*error\|correctable.*ecc" "$syslog" 2>/dev/null | grep -vi "uncorrectable" | wc -l)
    multi_bit_errors=$(grep -i "multi.*bit.*error\|uncorrectable.*ecc" "$syslog" 2>/dev/null | wc -l)

    # Check for OOM killer events
    local oom_events recent_oom=""
    oom_events=$(grep -i "out of memory\|oom.*kill\|invoked oom-killer" "$syslog" 2>/dev/null | wc -l)
    if [ "$oom_events" -gt 0 ]; then
        recent_oom=$(grep -i "out of memory\|oom.*kill" "$syslog" 2>/dev/null | tail -3 | sed 's/^/ /')
    fi

    # Check swap usage (high swap can indicate memory pressure)
    local swap_total swap_used swap_pct=0
    swap_total=$(printf '%s\n' "$free_human" | grep "Swap:" | awk '{print $2}')
    swap_used=$(printf '%s\n' "$free_human" | grep "Swap:" | awk '{print $3}')
    if [ "$swap_total" != "0B" ] && [ -n "$swap_total" ]; then
        swap_pct=$(free | grep "Swap:" | awk '{if ($2>0) print int($3/$2*100); else print 0}')
        # Keep the later integer comparisons valid if free printed no Swap line.
        swap_pct=${swap_pct:-0}
    fi

    # Try to identify bad memory module from ECC errors
    local bad_dimm=""
    if [ "$hw_mem_errors" -gt 0 ]; then
        # Look for EDAC messages that identify specific DIMMs
        bad_dimm=$(grep -i "edac.*dimm\|edac.*channel\|edac.*slot" "$syslog" 2>/dev/null | tail -5 | sed 's/^/ /')
        if [ -z "$bad_dimm" ]; then
            # Try CE (Correctable Error) messages
            bad_dimm=$(grep -i "ce.*error.*channel\|ce.*error.*dimm" "$syslog" 2>/dev/null | tail -5 | sed 's/^/ /')
        fi
    fi

    # Build memory details
    local mem_modules
    mem_modules=$(printf '%s\n' "$dmi_memory" | grep -A 20 "Memory Device" | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:|Locator:" | sed 's/^[ \t]*/ /')

    # ANALYZE MEMORY HEALTH
    local mem_status="HEALTHY"
    local mem_risk=""

    # CRITICAL: Multi-bit ECC errors (uncorrectable)
    if [ "$multi_bit_errors" -gt 0 ]; then
        mem_status="CRITICAL"
        mem_risk+="🔴 CRITICAL: $multi_bit_errors UNCORRECTABLE ECC errors (multi-bit) - DATA CORRUPTION RISK!"$'\n'
    fi

    # HIGH: Excessive single-bit errors
    if [ "$single_bit_errors" -gt 100 ]; then
        mem_status="CRITICAL"
        mem_risk+="🔴 CRITICAL: $single_bit_errors correctable ECC errors (BAD DIMM - replace immediately!)"$'\n'
    elif [ "$single_bit_errors" -gt 20 ]; then
        [ "$mem_status" = "HEALTHY" ] && mem_status="WARNING"
        mem_risk+="🟡 WARNING: $single_bit_errors correctable ECC errors (faulty DIMM likely)"$'\n'
    elif [ "$single_bit_errors" -gt 0 ]; then
        [ "$mem_status" = "HEALTHY" ] && mem_status="INFO"
        mem_risk+="ℹ️ INFO: $single_bit_errors correctable ECC errors (monitor closely)"$'\n'
    fi

    # OOM killer events
    # A status of INFO (minor ECC notes) may still be raised to WARNING here.
    if [ "$oom_events" -gt 10 ]; then
        if [ "$mem_status" = "HEALTHY" ] || [ "$mem_status" = "INFO" ]; then
            mem_status="WARNING"
        fi
        mem_risk+="🟡 WARNING: $oom_events Out-Of-Memory events (insufficient RAM for workload!)"$'\n'
    elif [ "$oom_events" -gt 0 ]; then
        mem_risk+="ℹ️ INFO: $oom_events OOM events (consider adding RAM)"$'\n'
    fi

    # Swap thrashing
    if [ "$swap_pct" -gt 80 ]; then
        if [ "$mem_status" = "HEALTHY" ] || [ "$mem_status" = "INFO" ]; then
            mem_status="WARNING"
        fi
        mem_risk+="🟡 WARNING: Swap ${swap_pct}% full (memory pressure - consider upgrade)"$'\n'
    elif [ "$swap_pct" -gt 50 ]; then
        mem_risk+="ℹ️ INFO: Swap ${swap_pct}% used (moderate memory pressure)"$'\n'
    fi

    # Generate findings based on analysis
    local recent_errors
    if [ "$mem_status" = "CRITICAL" ]; then
        recent_errors=$(grep -i "memory.*error\|ecc.*error" "$syslog" 2>/dev/null | tail -10 | sed 's/^/ /')

        add_finding "CRITICAL" "🔴 MEMORY FAILURE: Replace RAM Immediately" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

🔴 CRITICAL MEMORY ISSUES:
$mem_risk

Memory Errors Detected:
• Total errors in logs: $hw_mem_errors
• Single-bit (correctable): $single_bit_errors
• Multi-bit (UNCORRECTABLE): $multi_bit_errors
• OOM killer events: $oom_events

${bad_dimm:+Faulty Module Location:
$bad_dimm
}
Recent errors from $syslog:
$recent_errors" \
            "🚨 IMMEDIATE ACTION REQUIRED:
1. IDENTIFY BAD DIMM: Check logs above for slot/channel information
2. REPLACE FAULTY RAM: Order replacement immediately
3. RUN MEMTEST: Boot memtest86+ to identify bad module
4. CHECK ALL ERRORS: grep -i 'ecc\|edac' $syslog | less
5. MONITOR CORRUPTION: Watch for application crashes, file corruption
6. If multi-bit errors: PLAN IMMEDIATE DOWNTIME for replacement

Commands to identify faulty DIMM:
• dmidecode -t memory (shows all slots)
• grep -i edac $syslog (shows which slot failing)
• edac-util (if installed: yum install edac-utils)"

    elif [ "$mem_status" = "WARNING" ]; then
        recent_errors=$(grep -i "memory.*error\|ecc.*error\|oom" "$syslog" 2>/dev/null | tail -8 | sed 's/^/ /')

        add_finding "WARNING" "🟡 Memory Issues Detected - Action Required" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

⚠️ WARNING - Memory Issues:
$mem_risk

Memory Errors Detected:
• Total errors in logs: $hw_mem_errors
• Single-bit (correctable): $single_bit_errors
• Multi-bit (UNCORRECTABLE): $multi_bit_errors
• OOM killer events: $oom_events

${recent_oom:+Recent OOM Events:
$recent_oom
}
${bad_dimm:+Possible Faulty Module:
$bad_dimm
}
Recent errors:
$recent_errors" \
            "⚠️ RECOMMENDED ACTIONS:
• Monitor error rate: grep -i 'ecc\|memory error' $syslog | wc -l
• Check for increasing errors (run daily, compare counts)
• If ECC errors increasing: Plan RAM replacement
• If OOM events: Consider RAM upgrade or reduce workload
• Review memory usage: free -h && top -o %MEM | head -15

For ECC errors:
• Install monitoring: yum install edac-utils
• Check status: edac-util -v
• Identify DIMM: dmidecode -t memory | grep -A 20 'Memory Device'"

    elif [ -n "$mem_risk" ] || [ "$hw_mem_errors" -gt 0 ]; then
        # Below the WARNING thresholds, but events were counted: report the
        # real numbers instead of claiming that nothing was detected.
        add_finding "INFO" "ℹ️ Memory Health: Minor Issues Noted" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

ℹ️ Notes:
$mem_risk
Memory Errors Detected:
• Total errors in logs: $hw_mem_errors
• Single-bit (correctable): $single_bit_errors
• Multi-bit (UNCORRECTABLE): $multi_bit_errors
• OOM killer events: $oom_events

${recent_oom:+Recent OOM Events:
$recent_oom
}
Installed Modules:
$mem_modules" \
            "No immediate action required - keep monitoring
• Compare error counts over time: grep -i 'ecc\|memory error' $syslog | wc -l
• Watch for OOM events: grep -i 'oom' $syslog"

    else
        add_finding "INFO" "✅ Memory Health: No Issues Detected" \
            "Total Memory: $total_mem (Used: $used_mem, Available: $available_mem)
Slots: $populated_slots / $total_slots
ECC Support: $ecc_label
Swap Usage: $swap_used / $swap_total (${swap_pct}% used)

Memory Errors: None detected
OOM Events: None detected
ECC Errors: None detected

Installed Modules:
$mem_modules" \
            "Memory appears healthy
• Regular monitoring recommended if ECC supported
• Watch for OOM events: grep -i 'oom' $syslog"
    fi
}

# Function to check CPU health with thermal throttling detection
|
||
check_cpu_health() {
|
||
echo -e "${CYAN}[INFO]${NC} Checking CPU health..."
|
||
|
||
# Get CPU info
|
||
local cpu_model=$(grep "model name" /proc/cpuinfo | head -1 | cut -d':' -f2 | sed 's/^[ \t]*//')
|
||
local cpu_cores=$(grep -c "^processor" /proc/cpuinfo)
|
||
local cpu_threads=$(nproc)
|
||
|
||
# Check for CPU errors in dmesg
|
||
local cpu_errors=$(dmesg | grep -i "mce\|machine check\|cpu.*error" | wc -l)
|
||
|
||
# Check system log for hardware errors
|
||
local hw_cpu_errors=$(grep -iE "mce|machine check exception|cpu.*error" /var/log/messages 2>/dev/null | wc -l)
|
||
|
||
# Check for thermal throttling events
|
||
local throttle_events=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical|thermal.*shutdown" /var/log/messages 2>/dev/null | wc -l)
|
||
local recent_throttle=""
|
||
if [ "$throttle_events" -gt 0 ]; then
|
||
recent_throttle=$(grep -iE "thermal.*throttl|cpu.*overheat|temperature.*critical" /var/log/messages 2>/dev/null | tail -3 | sed 's/^/ /')
|
||
fi
|
||
|
||
# Get current CPU frequency and max frequency
|
||
local cpu_freq=""
|
||
local cpu_max_freq=""
|
||
local freq_throttled=false
|
||
if [ -f "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq" ]; then
|
||
local freq_khz=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq 2>/dev/null)
|
||
cpu_freq=$(awk "BEGIN {printf \"%.2f\", $freq_khz / 1000000}" 2>/dev/null)" GHz"
|
||
|
||
# Check max frequency
|
||
if [ -f "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq" ]; then
|
||
local max_freq_khz=$(cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq 2>/dev/null)
|
||
cpu_max_freq=$(awk "BEGIN {printf \"%.2f\", $max_freq_khz / 1000000}" 2>/dev/null)" GHz"
|
||
|
||
# Check if significantly throttled (more than 20% below max)
|
||
local throttle_pct=$(awk "BEGIN {if ($max_freq_khz > 0) print int((1 - $freq_khz/$max_freq_khz) * 100); else print 0}" 2>/dev/null)
|
||
if [ "$throttle_pct" -gt 20 ]; then
|
||
freq_throttled=true
|
||
fi
|
||
fi
|
||
fi
|
||
|
||
# Check CPU temperature with multiple methods
|
||
local cpu_temp="N/A"
|
||
local temp_value=0
|
||
local all_core_temps=""
|
||
|
||
if command_exists sensors; then
|
||
# Try to get all core temperatures
|
||
all_core_temps=$(sensors 2>/dev/null | grep -E "Core [0-9]+:" | sed 's/^/ /')
|
||
|
||
# Get highest core temperature
|
||
cpu_temp=$(sensors 2>/dev/null | grep -E "Core [0-9]+:|temp1:" | grep -oP '\+\K[0-9.]+' | sort -n | tail -1)
|
||
if [ -n "$cpu_temp" ]; then
|
||
temp_value=${cpu_temp%.*}
|
||
cpu_temp="${cpu_temp}°C"
|
||
else
|
||
cpu_temp="N/A"
|
||
fi
|
||
fi
|
||
|
||
# Fallback: Check thermal zones
|
||
if [ "$cpu_temp" = "N/A" ] && [ -d "/sys/class/thermal" ]; then
|
||
for zone in /sys/class/thermal/thermal_zone*/temp; do
|
||
if [ -f "$zone" ]; then
|
||
local temp=$(cat "$zone" 2>/dev/null)
|
||
if [ -n "$temp" ] && [ "$temp" -gt 0 ]; then
|
||
temp_value=$((temp / 1000))
|
||
cpu_temp="${temp_value}°C"
|
||
break
|
||
fi
|
||
fi
|
||
done
|
||
fi
|
||
|
||
# Check load average
|
||
local load_avg=$(uptime | awk -F'load average:' '{print $2}' | sed 's/^[ \t]*//')
|
||
local load_1min=$(echo "$load_avg" | awk -F',' '{print $1}' | tr -d ' ')
|
||
|
||
# Calculate load percentage
|
||
local load_pct=0
|
||
if [ -n "$load_1min" ] && [ "$cpu_threads" -gt 0 ]; then
|
||
load_pct=$(awk "BEGIN {printf \"%.0f\", ($load_1min / $cpu_threads) * 100}" 2>/dev/null)
|
||
fi
|
||
|
||
# ANALYZE CPU HEALTH
|
||
local cpu_status="HEALTHY"
|
||
local cpu_risk=""
|
||
|
||
# CRITICAL: MCE/Hardware errors
|
||
if [ "$hw_cpu_errors" -gt 0 ] || [ "$cpu_errors" -gt 0 ]; then
|
||
cpu_status="CRITICAL"
|
||
cpu_risk+="🔴 CRITICAL: $((cpu_errors + hw_cpu_errors)) Machine Check Exceptions (MCE) - HARDWARE FAILURE!"$'\n'
|
||
fi
|
||
|
||
# CRITICAL: Extreme overheating
|
||
if [ "$temp_value" -gt 90 ]; then
|
||
cpu_status="CRITICAL"
|
||
cpu_risk+="🔴 CRITICAL: CPU temperature ${cpu_temp} - EXTREME OVERHEATING (damage risk!)"$'\n'
|
||
elif [ "$temp_value" -gt 80 ]; then
|
||
[ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
|
||
cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} - OVERHEATING (threshold: 80°C)"$'\n'
|
||
elif [ "$temp_value" -gt 70 ]; then
|
||
[ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
|
||
cpu_risk+="🟡 WARNING: CPU temperature ${cpu_temp} - HIGH (normal: <70°C)"$'\n'
|
||
fi
|
||
|
||
# Thermal throttling
|
||
if [ "$throttle_events" -gt 10 ]; then
|
||
[ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
|
||
cpu_risk+="🟡 WARNING: $throttle_events thermal throttling events - COOLING PROBLEM!"$'\n'
|
||
elif [ "$throttle_events" -gt 0 ]; then
|
||
cpu_risk+="ℹ️ INFO: $throttle_events thermal throttling events detected"$'\n'
|
||
fi
|
||
|
||
# Frequency throttling
|
||
if $freq_throttled; then
|
||
[ "$cpu_status" = "HEALTHY" ] && cpu_status="WARNING"
|
||
cpu_risk+="🟡 WARNING: CPU frequency throttled (${cpu_freq} / ${cpu_max_freq} max) - thermal or power limiting"$'\n'
|
||
fi
|
||
|
||
# High sustained load
|
||
if [ "$load_pct" -gt 200 ]; then
|
||
cpu_risk+="ℹ️ INFO: Very high load (${load_pct}% of capacity) - server may be overloaded"$'\n'
|
||
fi
|
||
|
||
# Generate findings
|
||
if [ "$cpu_status" = "CRITICAL" ]; then
|
||
local recent_errors=$(grep -iE "mce|machine check|cpu.*error|thermal.*critical" /var/log/messages 2>/dev/null | tail -10 | sed 's/^/ /')
|
||
|
||
add_finding "CRITICAL" "🔴 CPU CRITICAL: Hardware Failure or Overheating" \
|
||
"CPU Model: $cpu_model
|
||
Cores: $cpu_cores (Threads: $cpu_threads)
|
||
Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A})
|
||
Temperature: $cpu_temp ${temp_value:+(CRITICAL threshold: 80°C)}
|
||
Load Average: $load_avg (${load_pct}% capacity)
|
||
|
||
🔴 CRITICAL CPU ISSUES:
|
||
$cpu_risk
|
||
|
||
Hardware Errors:
|
||
• MCE/CPU errors: $((cpu_errors + hw_cpu_errors))
|
||
• Thermal throttling events: $throttle_events
|
||
|
||
${all_core_temps:+Individual Core Temperatures:
|
||
$all_core_temps
|
||
}
|
||
${recent_throttle:+Recent Thermal Events:
|
||
$recent_throttle
|
||
}
|
||
Recent errors from logs:
|
||
$recent_errors" \
|
||
"🚨 IMMEDIATE ACTION REQUIRED:
|
||
1. CHECK TEMPERATURE: If >90°C, shut down immediately to prevent damage!
|
||
2. COOLING SYSTEM: Check fans, heatsink, thermal paste
|
||
3. MCE ERRORS: Critical hardware failure - contact vendor/provider
|
||
4. CLEAN SYSTEM: Remove dust from fans and heatsinks
|
||
5. VERIFY AIRFLOW: Ensure proper case ventilation
|
||
6. MONITOR: Watch temps continuously: watch -n 2 sensors
|
||
|
||
Commands:
|
||
• View all temps: sensors
|
||
• Check MCE details: dmesg | grep -i mce | less
|
||
• Monitor throttling: grep -i thermal /var/log/messages
|
||
• Check frequency: cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq"
|
||
|
||
elif [ "$cpu_status" = "WARNING" ]; then
|
||
add_finding "WARNING" "🟡 CPU Issues Detected - Cooling or Hardware Problem" \
|
||
"CPU Model: $cpu_model
|
||
Cores: $cpu_cores (Threads: $cpu_threads)
|
||
Current Frequency: ${cpu_freq:-N/A} (Max: ${cpu_max_freq:-N/A})
|
||
Temperature: $cpu_temp
|
||
Load Average: $load_avg (${load_pct}% capacity)
|
||
|
||
⚠️ WARNING - CPU Issues:
|
||
$cpu_risk
|
||
|
||
Monitoring:
|
||
• Thermal throttling events: $throttle_events
|
||
• Current temperature: $cpu_temp
|
||
|
||
${all_core_temps:+Individual Core Temperatures:
|
||
$all_core_temps
|
||
}
|
||
${recent_throttle:+Recent Thermal Events:
|
||
$recent_throttle
|
||
}" \
|
||
"⚠️ RECOMMENDED ACTIONS:
|
||
• Clean cooling system (fans, heatsink)
|
||
• Verify fan operation: sensors (check fan RPM)
|
||
• Check case ventilation and airflow
|
||
• Monitor temperature trends: watch -n 5 sensors
|
||
• If throttling persists: Replace thermal paste or upgrade cooling
|
||
• Consider reducing workload if temperature stays high
|
||
|
||
Commands:
|
||
• Monitor live: watch -n 2 sensors
|
||
• Check throttling: grep -i thermal /var/log/messages
|
||
• View frequencies: grep MHz /proc/cpuinfo"
|
||
|
||
else
|
||
add_finding "INFO" "✅ CPU Health: Normal Operation" \
|
||
"CPU Model: $cpu_model
|
||
Cores: $cpu_cores (Threads: $cpu_threads)
|
||
Current Frequency: ${cpu_freq:-N/A} ${cpu_max_freq:+(Max: ${cpu_max_freq})}
|
||
Temperature: $cpu_temp ${temp_value:+(normal: <70°C)}
|
||
Load Average: $load_avg (${load_pct}% capacity)
|
||
|
||
Hardware Errors: None detected
|
||
Thermal Throttling: None detected
|
||
Frequency Throttling: None detected
|
||
|
||
${all_core_temps:+Individual Core Temperatures:
|
||
$all_core_temps
|
||
}" \
|
||
"CPU is operating normally
|
||
• Regular temperature monitoring recommended
|
||
• Monitor: sensors (if installed)"
|
||
fi
|
||
|
||
# Check if sensors are available for monitoring
|
||
if ! command_exists sensors; then
|
||
add_finding "INFO" "Temperature Monitoring Not Available" \
|
||
"lm_sensors is not installed - cannot monitor CPU/hardware temperatures" \
|
||
"Install sensors for temperature monitoring:
|
||
1. yum install lm_sensors
|
||
2. Run: sensors-detect (answer YES to all prompts)
|
||
3. Start service: systemctl start lm_sensors
|
||
4. View temperatures: sensors"
|
||
fi
|
||
}
|
||
|
||
# Function to check system hardware errors
#
# Scans /var/log/messages for generic hardware, I/O, ATA and SCSI error lines
# and records a WARNING finding (total count plus the last 10 matches) when
# any are present. Nothing is recorded when the log is clean or unreadable.
#
# Globals:   CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Calls:     add_finding (defined outside this function)
check_hardware_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking system hardware error logs..."

    # Check for general hardware errors
    local error_regex="hardware error|i/o error|ata.*error|scsi.*error"
    local matched_lines
    matched_lines=$(grep -iE "$error_regex" /var/log/messages 2>/dev/null)

    # Nothing matched (or the log could not be read): no finding to report.
    [ -n "$matched_lines" ] || return 0

    local hw_errors recent_errors
    hw_errors=$(wc -l <<< "$matched_lines")
    recent_errors=$(tail -10 <<< "$matched_lines" | sed 's/^/ /')

    add_finding "WARNING" "Hardware Errors in System Log" \
"Total hardware-related errors: $hw_errors

Recent errors (last 10):
$recent_errors" \
"Hardware errors detected in system logs:
• Review full log: grep -iE 'hardware error|i/o error' /var/log/messages
• Check dmesg: dmesg | grep -i error | tail -20
• Identify failing component (disk, memory, CPU, etc.)
• Run component-specific diagnostics
• Contact hosting provider if persistent"
}

# Function to check RAID status
#
# Inspects Linux software RAID (md) via an mdstat file and, when the `megacli`
# command is available, lists MegaRAID virtual-drive states. Records:
#   * CRITICAL when any md array is degraded (a "_" in its [UU_] member map)
#     or is listed as "inactive";
#   * INFO with the raw mdstat text when all arrays look healthy;
#   * INFO with the megacli State lines when megacli exists;
#   * INFO "No RAID Detected" when neither source reported anything.
#
# Globals:   CYAN, NC (read)
# Arguments: $1 - optional path to an mdstat-format file
#                 (default: /proc/mdstat)
# Outputs:   one progress line on stdout
# Calls:     command_exists, add_finding (both defined outside this function)
check_raid_status() {
    local mdstat_file="${1:-/proc/mdstat}"

    echo -e "${CYAN}[INFO]${NC} Checking RAID status..."

    local raid_found=false

    # Check for software RAID (mdadm). "active" also matches "inactive", so
    # this gate admits both states; they are told apart below.
    if [ -f "$mdstat_file" ] && grep -q "active" "$mdstat_file" 2>/dev/null; then
        raid_found=true
        local raid_status degraded inactive
        raid_status=$(cat "$mdstat_file")

        # Degraded arrays show a "_" inside the member map, e.g. "[U_]".
        # Restricting the bracket contents to U/_ keeps unrelated bracketed
        # text on the same line from being mistaken for a member map.
        degraded=$(grep -cE '\[[U_]*_[U_]*\]' <<< "$raid_status")

        # An inactive array never prints a member map at all, so it has to be
        # detected from its "mdN : inactive ..." header line.
        inactive=$(grep -cE '^md[^[:space:]]*[[:space:]]*:[[:space:]]*inactive' <<< "$raid_status")

        if [ "${degraded:-0}" -gt 0 ] || [ "${inactive:-0}" -gt 0 ]; then
            add_finding "CRITICAL" "Software RAID Degraded" \
"RAID array is degraded:

$raid_status" \
"RAID array degraded - immediate action required:
• Check details: cat /proc/mdstat
• Identify failed drive: mdadm --detail /dev/md*
• Replace failed drive and rebuild array
• Ensure backups are current"
        else
            add_finding "INFO" "Software RAID Status" \
"$raid_status" \
"Software RAID is healthy"
        fi
    fi

    # Check for hardware RAID (common controllers)
    if command_exists megacli; then
        raid_found=true
        local raid_info
        raid_info=$(megacli -LDInfo -Lall -aALL 2>/dev/null | grep -E "State|Virtual Drive")
        add_finding "INFO" "MegaRAID Status" \
"$raid_info" \
"Check details: megacli -LDInfo -Lall -aALL"
    fi

    if ! $raid_found; then
        add_finding "INFO" "No RAID Detected" \
"No software or hardware RAID arrays detected" \
"System appears to use non-RAID storage"
    fi
}

# Function to check disk I/O errors
#
# Looks through the kernel ring buffer (dmesg) for block-layer I/O error
# messages and records a WARNING finding carrying the total count and the
# last 10 matching lines. No finding is recorded when nothing matches.
#
# Globals:   CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Calls:     add_finding (defined outside this function)
check_disk_io_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking disk I/O errors..."

    # Check for I/O errors in dmesg
    local io_regex="i/o error|blk_update_request|Buffer I/O error"
    local io_lines
    io_lines=$(dmesg | grep -iE "$io_regex")

    # An empty capture means zero matching lines: nothing to report.
    if [ -z "$io_lines" ]; then
        return 0
    fi

    local io_errors recent_io_errors
    io_errors=$(wc -l <<< "$io_lines")
    recent_io_errors=$(tail -10 <<< "$io_lines" | sed 's/^/ /')

    add_finding "WARNING" "Disk I/O Errors Detected" \
"Total I/O errors in dmesg: $io_errors

Recent I/O errors (last 10):
$recent_io_errors" \
"Disk I/O errors detected - indicates hardware or connection issues:
• Check SMART status (see above)
• Review dmesg: dmesg | grep -i 'i/o error'
• Check cables and connections (if physical server)
• Check for disk controller issues
• May indicate failing disk or controller"
}

# Function to check filesystem errors
#
# Scans /var/log/messages for ext4/XFS/generic filesystem errors and for
# read-only remounts. Records one finding when anything matches: CRITICAL if
# at least one read-only remount is present, WARNING otherwise. The finding
# carries the total count, the remount count and the last 5 matching lines.
#
# Globals:   CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Calls:     add_finding (defined outside this function)
check_filesystem_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking filesystem errors..."

    local log_file="/var/log/messages"

    # The kernel reports an emergency remount as "Remounting filesystem
    # read-only" (ext4); "remounted ... read-only" is kept for other sources.
    # Matching both tenses is what lets the CRITICAL escalation below fire.
    local ro_regex="remount(ed|ing).*read-only"
    local fs_regex="ext4-fs error|xfs.*error|filesystem.*error|${ro_regex}"

    # Check for filesystem errors in logs (single pass over the log)
    local fs_error_lines
    fs_error_lines=$(grep -iE "$fs_regex" "$log_file" 2>/dev/null)
    if [ -z "$fs_error_lines" ]; then
        return 0
    fi

    local fs_errors recent_fs_errors ro_remounts
    fs_errors=$(grep -c '' <<< "$fs_error_lines")
    recent_fs_errors=$(tail -5 <<< "$fs_error_lines" | sed 's/^/ /')

    # Check for read-only remounts (critical)
    ro_remounts=$(grep -ciE "$ro_regex" <<< "$fs_error_lines")
    ro_remounts=${ro_remounts:-0}

    local severity="WARNING"
    if [ "$ro_remounts" -gt 0 ]; then
        severity="CRITICAL"
    fi

    add_finding "$severity" "🔴 Filesystem Errors Detected" \
"Total filesystem errors in logs: $fs_errors
Read-only remounts: $ro_remounts

Recent filesystem errors (last 5):
$recent_fs_errors" \
"Filesystem errors detected - may indicate disk corruption:
• Check filesystem: fsck (requires unmounting or single-user mode)
• Review all errors: grep -i 'filesystem.*error' /var/log/messages
• Check disk SMART status above
• If read-only remount: System is protecting data - investigate immediately
• May need to boot rescue mode to repair
• Ensure backups are current before repair attempts"
}

# Function to check system fans
#
# Reads fan lines from `sensors` and records one finding:
#   * CRITICAL when any fan reads exactly 0 RPM or reports FAULT;
#   * WARNING  when any fan spins above 0 but below 800 RPM;
#   * INFO     when every fan looks normal.
# Returns without a finding on virtual machines, when `sensors` is not
# installed, or when it prints no fan lines.
#
# Globals:   IS_VIRTUAL, VIRT_TYPE, CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Calls:     command_exists, add_finding (both defined outside this function)
check_system_fans() {
    # Skip fan checks on virtual machines (hypervisor controls physical fans)
    if [ "$IS_VIRTUAL" = true ]; then
        echo -e "${CYAN}[INFO]${NC} Skipping fan checks (virtual machine - $VIRT_TYPE)"
        return 0
    fi

    echo -e "${CYAN}[INFO]${NC} Checking system fan status..."

    if ! command_exists sensors; then
        return 0 # Silently skip if sensors not installed
    fi

    # Get fan information
    local fan_data
    fan_data=$(sensors 2>/dev/null | grep -i "fan")

    if [ -z "$fan_data" ]; then
        return 0 # No fan data available
    fi

    # Classify each line by its ACTUAL reading: the first "<n> RPM" after the
    # "label:" prefix. A plain substring search for "0 RPM" would also hit
    # healthy readings such as "1200 RPM" and limits such as "(min = 0 RPM)".
    local fan_filter='
        {
            reading = $0
            sub(/^[^:]*:/, "", reading)
            has_rpm = match(reading, /[0-9]+ +RPM/)
            rpm = has_rpm ? substr(reading, RSTART, RLENGTH) + 0 : -1
            if (mode == "failed" && ($0 ~ /FAULT/ || (has_rpm && rpm == 0))) print
            if (mode == "slow" && has_rpm && rpm > 0 && rpm < 800) print
        }'

    # Check for failed fans (0 RPM or missing)
    local failed_fan_list slow_fan_list
    failed_fan_list=$(awk -v mode=failed "$fan_filter" <<< "$fan_data")
    slow_fan_list=$(awk -v mode=slow "$fan_filter" <<< "$fan_data")

    local failed_fans=0 slow_fans=0
    if [ -n "$failed_fan_list" ]; then
        failed_fans=$(grep -c '' <<< "$failed_fan_list")
        failed_fan_list=$(sed 's/^/ /' <<< "$failed_fan_list")
    fi
    if [ -n "$slow_fan_list" ]; then
        slow_fans=$(grep -c '' <<< "$slow_fan_list")
        slow_fan_list=$(sed 's/^/ /' <<< "$slow_fan_list")
    fi

    if [ "$failed_fans" -gt 0 ]; then
        add_finding "CRITICAL" "🔴 FAILED FAN(S) DETECTED" \
"Failed fans: $failed_fans

Failed fan details:
$failed_fan_list

All fan data:
$(echo "$fan_data" | sed 's/^/ /')" \
"🚨 CRITICAL - FAN FAILURE DETECTED:
• Failed fans detected - system may overheat!
• Check all fan data: sensors
• Physical inspection required
• Replace failed fan immediately
• Monitor CPU/system temperatures closely
• May need emergency shutdown if temps rise above 90°C"

    elif [ "$slow_fans" -gt 0 ]; then
        add_finding "WARNING" "🟡 Slow Fan(s) Detected" \
"Slow fans (< 800 RPM): $slow_fans

Slow fan details:
$slow_fan_list

All fan data:
$(echo "$fan_data" | sed 's/^/ /')" \
"⚠️ WARNING - FANS RUNNING SLOW:
• Fans running slower than normal
• May indicate fan wear or BIOS power settings
• Monitor temperatures closely
• Consider fan replacement if temperatures rise
• Check BIOS fan control settings"
    else
        add_finding "INFO" "✅ System Fans: Normal Operation" \
"All fans operating normally:

$(echo "$fan_data" | sed 's/^/ /')" \
"All system fans operating within normal parameters"
    fi
}

# Sum the values of every `ethtool -S` counter whose line matches a regex.
# Reads the statistics text on stdin and always prints one integer (0 when
# nothing matches), so the result is safe inside $(( )) and [ -gt ].
# Arguments: $1 - case-insensitive extended regex selecting counter lines
_network_stat_sum() {
    grep -iE -- "$1" | awk '{ total += $2 } END { printf "%.0f\n", total + 0 }'
}

# Function to check network interface errors
#
# Collects drop/error/CRC counters from `ethtool -S` for every interface
# except loopback, docker*, veth* and br-*, then records one finding:
# CRITICAL or WARNING when the totals (or any single interface) cross the
# thresholds below, INFO otherwise. Returns without a finding when ethtool
# is not installed or no interface is left after filtering.
#
# Globals:   CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Calls:     command_exists, add_finding (both defined outside this function)
check_network_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking network interface errors..."

    if ! command_exists ethtool; then
        return 0 # Silently skip if ethtool not installed
    fi

    # Get all active network interfaces (exclude loopback). `ip -o link`
    # prints stacked devices as "name@parent"; ethtool only accepts "name",
    # so the "@parent" suffix is removed.
    local interfaces
    interfaces=$(ip -o link show 2>/dev/null \
        | awk -F': ' '{print $2}' \
        | sed 's/@.*//' \
        | grep -vE '^(lo$|docker|veth|br-)')

    if [ -z "$interfaces" ]; then
        return 0 # No interfaces found
    fi

    local total_rx_dropped=0
    local total_tx_dropped=0
    local total_rx_errors=0
    local total_tx_errors=0
    local total_crc_errors=0
    local problem_interfaces=""
    local has_issues=false

    local interface stats
    local rx_dropped tx_dropped rx_errors tx_errors crc_errors
    while IFS= read -r interface; do
        # Get statistics for this interface
        stats=$(ethtool -S "$interface" 2>/dev/null)
        [ -n "$stats" ] || continue

        # Extract key error metrics (different NICs use different naming).
        # Every metric goes through the summing helper so a counter the
        # driver does not expose yields 0 instead of an empty string.
        rx_dropped=$(_network_stat_sum "rx.*drop|rx_discards" <<< "$stats")
        tx_dropped=$(_network_stat_sum "tx.*drop|tx_discards" <<< "$stats")
        rx_errors=$(_network_stat_sum "^[[:space:]]*rx_errors" <<< "$stats")
        tx_errors=$(_network_stat_sum "^[[:space:]]*tx_errors" <<< "$stats")
        crc_errors=$(_network_stat_sum "crc.*error|rx_crc" <<< "$stats")

        # Accumulate totals
        total_rx_dropped=$((total_rx_dropped + rx_dropped))
        total_tx_dropped=$((total_tx_dropped + tx_dropped))
        total_rx_errors=$((total_rx_errors + rx_errors))
        total_tx_errors=$((total_tx_errors + tx_errors))
        total_crc_errors=$((total_crc_errors + crc_errors))

        # Check if this interface has significant issues
        if [ "$rx_dropped" -gt 1000 ] || [ "$tx_dropped" -gt 1000 ] || [ "$crc_errors" -gt 100 ]; then
            has_issues=true
            problem_interfaces+=" $interface:
RX dropped: $rx_dropped
TX dropped: $tx_dropped
CRC errors: $crc_errors
"
        fi
    done <<< "$interfaces"

    # Determine severity
    local severity="INFO"
    if [ "$total_rx_dropped" -gt 10000 ] || [ "$total_tx_dropped" -gt 10000 ] || [ "$total_crc_errors" -gt 1000 ]; then
        severity="CRITICAL"
    elif [ "$total_rx_dropped" -gt 1000 ] || [ "$total_tx_dropped" -gt 1000 ] || [ "$total_crc_errors" -gt 100 ]; then
        severity="WARNING"
    fi

    if [ "$has_issues" = true ] || [ "$severity" != "INFO" ]; then
        add_finding "$severity" "🔴 Network Interface Errors Detected" \
"Total across all interfaces:
• RX packets dropped: $total_rx_dropped
• TX packets dropped: $total_tx_dropped
• RX errors: $total_rx_errors
• TX errors: $total_tx_errors
• CRC errors: $total_crc_errors

Problem interfaces:
$problem_interfaces" \
"Network errors detected - may indicate hardware or driver issues:
• Check interface: ethtool eth0
• Check dmesg: dmesg | grep -i 'eth\|network'
• High drops may indicate:
- Network card failure
- Driver issues
- Switch/cable problems
- Bandwidth saturation
• CRC errors indicate:
- Bad cable
- EMI interference
- Faulty NIC
• If persistent: Replace network cable first, then NIC if needed"
    else
        # All healthy
        add_finding "INFO" "✅ Network Interfaces: Healthy" \
"All network interfaces operating normally
Total interfaces checked: $(echo "$interfaces" | wc -l)
No significant packet drops or errors detected" \
"Network hardware is functioning properly"
    fi
}

# Function to check PCI/PCIe errors
#
# Counts PCI/PCIe/AER error lines in dmesg and in the system log (the file
# named by MESSAGES_CACHE when it exists, otherwise /var/log/messages) and,
# when any are found, records one finding: CRITICAL if any matched line
# reports an uncorrectable/uncorrected error or the total exceeds 50,
# WARNING otherwise.
#
# Globals:   MESSAGES_CACHE, CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Calls:     add_finding (defined outside this function)
check_pci_errors() {
    echo -e "${CYAN}[INFO]${NC} Checking PCI/PCIe errors..."

    local dmesg_regex="pci.*error|pcie.*error|aer.*error|correctable.*error.*pci|uncorrectable.*error.*pci"
    local log_regex="pci.*error|pcie.*error|aer.*error"

    # Also check cached messages if available
    local log_source="/var/log/messages"
    if [ -f "$MESSAGES_CACHE" ]; then
        log_source="$MESSAGES_CACHE"
    fi

    # Check for PCI errors in dmesg and logs. Each source is scanned once and
    # the matched lines are kept so the count, the samples and the severity
    # test all look at the same data.
    local dmesg_hits log_hits
    dmesg_hits=$(dmesg 2>/dev/null | grep -iE "$dmesg_regex")
    log_hits=$(grep -iE "$log_regex" "$log_source" 2>/dev/null)

    local pci_errors=0 log_pci_errors=0
    if [ -n "$dmesg_hits" ]; then
        pci_errors=$(grep -c '' <<< "$dmesg_hits")
    fi
    if [ -n "$log_hits" ]; then
        log_pci_errors=$(grep -c '' <<< "$log_hits")
    fi

    local total_pci_errors=$((pci_errors + log_pci_errors))
    if [ "$total_pci_errors" -eq 0 ]; then
        return 0
    fi

    # Get samples from both sources
    local dmesg_samples="" log_samples=""
    if [ -n "$dmesg_hits" ]; then
        dmesg_samples=$(tail -5 <<< "$dmesg_hits" | sed 's/^/ /')
    fi
    if [ -n "$log_hits" ]; then
        log_samples=$(tail -3 <<< "$log_hits" | sed 's/^/ /')
    fi

    # Check for uncorrectable errors (more serious). Every matched line is
    # inspected, not just the displayed samples, and both spellings are
    # accepted because kernel AER messages say "Uncorrected".
    local uncorrectable=0
    if grep -qiE "uncorrect(able|ed)" <<< "${dmesg_hits}"$'\n'"${log_hits}"; then
        uncorrectable=1
    fi

    local severity="WARNING"
    if [ "$uncorrectable" -eq 1 ] || [ "$total_pci_errors" -gt 50 ]; then
        severity="CRITICAL"
    fi

    add_finding "$severity" "🔴 PCI/PCIe Errors Detected" \
"Total PCI errors: $total_pci_errors
Uncorrectable errors: $([ "$uncorrectable" -eq 1 ] && echo 'YES (CRITICAL!)' || echo 'No')

Recent errors from dmesg:
$dmesg_samples

${log_samples:+Recent errors from /var/log/messages:
$log_samples}" \
"PCI/PCIe errors detected - may indicate hardware problems:
• Uncorrectable errors = serious hardware issue
• Correctable errors = potential signal integrity problems
• Check details: dmesg | grep -i 'pci.*error'
• Check PCIe link status: lspci -vv | grep -A 5 'LnkSta'
• May indicate:
- Faulty PCIe device (network card, RAID controller, etc.)
- Motherboard issues
- Power supply problems
- Improper card seating
• If persistent: Reseat cards, check for firmware updates
• If uncorrectable: Replace failing hardware immediately"
}

# Function to check kernel parameters
#
# Reviews vm.swappiness, vm.dirty_ratio and the Transparent Huge Pages mode
# (only when `sysctl` is available) and, on non-virtual hosts, the active I/O
# scheduler of every disk reported by lsblk. Records a WARNING finding when
# any value is flagged as performance-impacting, otherwise an INFO finding
# listing what was inspected; records nothing when no data was collected.
#
# Globals:   IS_VIRTUAL, CYAN, NC (read)
# Arguments: none
# Outputs:   one progress line on stdout
# Calls:     command_exists, add_finding (both defined outside this function)
check_kernel_parameters() {
    echo -e "${CYAN}[INFO]${NC} Checking kernel parameters..."

    local nl=$'\n'
    local warnings=""
    local info=""

    if command_exists sysctl; then
        # Check vm.swappiness (should be 1-10 for servers)
        local swappiness
        swappiness=$(sysctl -n vm.swappiness 2>/dev/null)
        if [ -n "$swappiness" ]; then
            if [ "$swappiness" -gt 60 ]; then
                warnings+=" • vm.swappiness=$swappiness (HIGH - should be 1-10 for servers)${nl}"
            elif [ "$swappiness" -gt 10 ]; then
                info+=" • vm.swappiness=$swappiness (consider lowering to 1-10 for better performance)${nl}"
            else
                info+=" • vm.swappiness=$swappiness ✅${nl}"
            fi
        fi

        # Check vm.dirty_ratio (should be 10-20)
        local dirty_ratio
        dirty_ratio=$(sysctl -n vm.dirty_ratio 2>/dev/null)
        if [ -n "$dirty_ratio" ]; then
            if [ "$dirty_ratio" -gt 40 ]; then
                warnings+=" • vm.dirty_ratio=$dirty_ratio (HIGH - may cause stalls, recommended: 10-20)${nl}"
            elif [ "$dirty_ratio" -lt 10 ]; then
                info+=" • vm.dirty_ratio=$dirty_ratio (low - may impact write performance)${nl}"
            else
                info+=" • vm.dirty_ratio=$dirty_ratio ✅${nl}"
            fi
        fi

        # Check Transparent Huge Pages (should be never or madvise for databases).
        # The active mode is the word shown in [brackets].
        local thp_enabled
        thp_enabled=$(sed -n 's/.*\[\([^]]*\)\].*/\1/p' /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null)
        case "$thp_enabled" in
            "")
                ;;
            always)
                warnings+=" • Transparent Huge Pages=always (can cause latency spikes for databases)${nl}"
                warnings+="Recommended: echo never > /sys/kernel/mm/transparent_hugepage/enabled${nl}"
                ;;
            *)
                info+=" • Transparent Huge Pages=$thp_enabled ✅${nl}"
                ;;
        esac
    fi

    # Check I/O schedulers for each disk
    if [ "$IS_VIRTUAL" != true ]; then # Only check on physical servers
        local disk_names=()
        mapfile -t disk_names < <(lsblk -nd -o NAME,TYPE 2>/dev/null | awk '$2=="disk" {print $1}')

        local disk scheduler rotational
        local disk_kind suggested accepted
        for disk in "${disk_names[@]}"; do
            [ -n "$disk" ] || continue
            scheduler=$(sed -n 's/.*\[\([^]]*\)\].*/\1/p' "/sys/block/$disk/queue/scheduler" 2>/dev/null)
            rotational=$(cat "/sys/block/$disk/queue/rotational" 2>/dev/null)

            # Both values are needed to judge the disk; skip it otherwise.
            if [ -z "$scheduler" ] || [ -z "$rotational" ]; then
                continue
            fi

            # Pick the label, the suggested scheduler and the set of
            # schedulers treated as fine for this class of disk.
            if [[ "$disk" == nvme* ]]; then
                # NVMe should use 'none'
                disk_kind="NVMe"; suggested="none"; accepted=" none "
            elif [ "$rotational" = "0" ]; then
                # SSD should use mq-deadline or none
                disk_kind="SSD"; suggested="mq-deadline"; accepted=" mq-deadline none deadline "
            else
                # HDD should use mq-deadline or deadline
                disk_kind="HDD"; suggested="mq-deadline"; accepted=" mq-deadline deadline "
            fi

            case "$accepted" in
                *" $scheduler "*)
                    info+=" • /dev/$disk ($disk_kind): scheduler=$scheduler ✅${nl}"
                    ;;
                *)
                    info+=" • /dev/$disk ($disk_kind): scheduler=$scheduler (consider '$suggested' for $disk_kind)${nl}"
                    ;;
            esac
        done
    fi

    # Generate finding based on what we found
    if [ -n "$warnings" ]; then
        add_finding "WARNING" "⚠️ Kernel Parameters: Sub-Optimal Configuration" \
"Performance-impacting kernel parameters detected:

$warnings
${info:+
Informational:
$info}" \
"Kernel parameters affect system performance and stability:
• vm.swappiness: Controls swap usage (1-10 for servers)
- Fix: sysctl -w vm.swappiness=10
- Permanent: echo 'vm.swappiness=10' >> /etc/sysctl.conf
• vm.dirty_ratio: Controls dirty page cache
- Fix: sysctl -w vm.dirty_ratio=15
• Transparent Huge Pages: Can cause latency for databases
- Fix: echo never > /sys/kernel/mm/transparent_hugepage/enabled
• I/O Scheduler: Affects disk performance
- NVMe: echo none > /sys/block/nvme0n1/queue/scheduler
- SSD: echo mq-deadline > /sys/block/sda/queue/scheduler"
    elif [ -n "$info" ]; then
        add_finding "INFO" "ℹ️ Kernel Parameters: Configuration Status" \
"Current kernel parameters:

$info" \
"Kernel parameters are within acceptable ranges. Minor optimizations may be possible."
    fi
}

# Function to generate report
|
||
generate_report() {
|
||
local report_content=""
|
||
|
||
# Count findings by severity
|
||
local critical_count=0
|
||
local warning_count=0
|
||
local info_count=0
|
||
|
||
for finding in "${FINDINGS[@]}"; do
|
||
local severity=$(echo "$finding" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
|
||
case "$severity" in
|
||
CRITICAL) critical_count=$((critical_count + 1)) ;;
|
||
WARNING) warning_count=$((warning_count + 1)) ;;
|
||
INFO) info_count=$((info_count + 1)) ;;
|
||
esac
|
||
done
|
||
|
||
report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
|
||
report_content+="║ HARDWARE HEALTH CHECK REPORT ║"$'\n'
|
||
report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Date: $(date '+%Y-%m-%d %H:%M:%S')"$'\n'
|
||
report_content+="System: $SYS_HOSTNAME"$'\n'
|
||
report_content+="Control Panel: $SYS_PANEL ${SYS_PANEL_VER:-unknown}"$'\n'
|
||
report_content+="OS: $SYS_OS ${SYS_OS_VER:-unknown}"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# VISUAL SEVERITY SUMMARY - Make issues OBVIOUS
|
||
report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
|
||
if [ "$critical_count" -gt 0 ]; then
|
||
report_content+="║ 🔴 CRITICAL ISSUES DETECTED - IMMEDIATE ACTION REQUIRED ║"$'\n'
|
||
elif [ "$warning_count" -gt 0 ]; then
|
||
report_content+="║ 🟡 WARNING - Hardware Issues Detected ║"$'\n'
|
||
else
|
||
report_content+="║ ✅ ALL HARDWARE CHECKS PASSED - System Healthy ║"$'\n'
|
||
fi
|
||
report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# Severity breakdown
|
||
report_content+="FINDINGS SUMMARY:"$'\n'
|
||
report_content+="──────────────────────────────────────────────────────────────────────────────"$'\n'
|
||
if [ "$critical_count" -gt 0 ]; then
|
||
report_content+=" 🔴 CRITICAL: $critical_count issue(s) - URGENT ATTENTION REQUIRED"$'\n'
|
||
fi
|
||
if [ "$warning_count" -gt 0 ]; then
|
||
report_content+=" 🟡 WARNING: $warning_count issue(s) - Review and plan action"$'\n'
|
||
fi
|
||
report_content+=" ℹ️ INFO: $info_count item(s) - Status information"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# If critical issues, list them prominently at the top
|
||
if [ "$critical_count" -gt 0 ]; then
|
||
report_content+="╔══════════════════════════════════════════════════════════════════════════════╗"$'\n'
|
||
report_content+="║ 🚨 CRITICAL ISSUES REQUIRING IMMEDIATE ATTENTION ║"$'\n'
|
||
report_content+="╚══════════════════════════════════════════════════════════════════════════════╝"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
local critical_num=1
|
||
for finding in "${FINDINGS[@]}"; do
|
||
local severity=$(echo "$finding" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
|
||
if [ "$severity" = "CRITICAL" ]; then
|
||
local title=$(echo "$finding" | sed 's/^\[[^]]*\] //' | sed 's/@@@SEP@@@.*//')
|
||
report_content+=" $critical_num. $title"$'\n'
|
||
critical_num=$((critical_num + 1))
|
||
fi
|
||
done
|
||
report_content+=""$'\n'
|
||
report_content+=" ⚠️ SEE DETAILED FINDINGS BELOW FOR SPECIFIC ACTIONS TO TAKE"$'\n'
|
||
report_content+=""$'\n'
|
||
fi
|
||
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
# Group findings by category
|
||
local -A categories
|
||
categories["DISK"]=""
|
||
categories["MEMORY"]=""
|
||
categories["CPU"]=""
|
||
categories["RAID"]=""
|
||
categories["OTHER"]=""
|
||
|
||
for finding in "${FINDINGS[@]}"; do
|
||
# Split by @@@SEP@@@ delimiter
|
||
local severity_title="${finding%%@@@SEP@@@*}"
|
||
local temp="${finding#*@@@SEP@@@}"
|
||
local details="${temp%%@@@SEP@@@*}"
|
||
local recommendation="${temp#*@@@SEP@@@}"
|
||
|
||
# Extract severity from [SEVERITY] Title format
|
||
local severity=$(echo "$severity_title" | sed -n 's/^\[\([^]]*\)\].*/\1/p')
|
||
local title=$(echo "$severity_title" | sed 's/^\[[^]]*\] //')
|
||
|
||
local category="OTHER"
|
||
if [[ "$title" == *"Disk"* ]] || [[ "$title" == *"SMART"* ]] || [[ "$title" == *"I/O"* ]]; then
|
||
category="DISK"
|
||
elif [[ "$title" == *"Memory"* ]] || [[ "$title" == *"ECC"* ]]; then
|
||
category="MEMORY"
|
||
elif [[ "$title" == *"CPU"* ]] || [[ "$title" == *"MCE"* ]]; then
|
||
category="CPU"
|
||
elif [[ "$title" == *"RAID"* ]]; then
|
||
category="RAID"
|
||
fi
|
||
|
||
local entry=""
|
||
entry+="[$severity] $title"$'\n'
|
||
entry+="$details"$'\n'
|
||
if [ -n "$recommendation" ]; then
|
||
entry+="Recommendation:"$'\n'
|
||
entry+="$recommendation"$'\n'
|
||
fi
|
||
entry+=""$'\n'
|
||
entry+="------------------------------------------------------------------------------"$'\n'
|
||
entry+=""$'\n'
|
||
|
||
categories[$category]+="$entry"
|
||
done
|
||
|
||
# Output sections
|
||
if [ -n "${categories[DISK]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="DISK HEALTH & SMART STATUS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[DISK]}"
|
||
fi
|
||
|
||
if [ -n "${categories[MEMORY]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="MEMORY HEALTH"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[MEMORY]}"
|
||
fi
|
||
|
||
if [ -n "${categories[CPU]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="CPU HEALTH"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[CPU]}"
|
||
fi
|
||
|
||
if [ -n "${categories[RAID]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="RAID STATUS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[RAID]}"
|
||
fi
|
||
|
||
if [ -n "${categories[OTHER]}" ]; then
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="OTHER HARDWARE FINDINGS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="${categories[OTHER]}"
|
||
fi
|
||
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+="NEXT STEPS"$'\n'
|
||
report_content+="=============================================================================="$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Priority Actions:"$'\n'
|
||
report_content+=" 1. Address any CRITICAL issues immediately"$'\n'
|
||
report_content+=" 2. Monitor WARNING issues closely"$'\n'
|
||
report_content+=" 3. Schedule regular hardware health checks"$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Additional Analysis Available:"$'\n'
|
||
report_content+=" • System Health Check (Main Menu) for overall server health"$'\n'
|
||
report_content+=" • Disk I/O Analyzer (Main Menu → Performance) for disk performance"$'\n'
|
||
report_content+=""$'\n'
|
||
report_content+="Report saved to: $REPORT_FILE"$'\n'
|
||
report_content+=""$'\n'
|
||
|
||
echo "$report_content"
|
||
echo "$report_content" > "$REPORT_FILE"
|
||
}
|
||
|
||
# Main execution
#
# Runs the eleven hardware checks, generates the report and prints an
# executive summary with one status icon per component.
#
# Globals:
#   FINDINGS        (read)    entries of the form
#                             "[SEVERITY] Title@@@SEP@@@details@@@SEP@@@recommendation"
#   REPORT_FILE     (read)    path shown to the user at the end
#   MESSAGES_CACHE  (written) private copy of /var/log/messages for the checks
#   colour variables (MAGENTA, BOLD, CYAN, YELLOW, GREEN, RED, NC)
# Returns (or exits with, when the script is executed directly):
#   0 - no CRITICAL or WARNING finding
#   1 - at least one WARNING finding, no CRITICAL
#   2 - at least one CRITICAL finding
#   3 - the temporary log cache could not be created (returned before any
#       check is run)
main() {
    show_banner
    echo -e "${MAGENTA}${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${MAGENTA}${BOLD}║ 🔧 HARDWARE HEALTH CHECK - Deep Analysis ║${NC}"
    echo -e "${MAGENTA}${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    # Detect virtualization FIRST (affects which checks to run)
    echo -e "${CYAN}[INFO]${NC} Detecting environment (physical vs virtual)..."
    detect_virtualization
    echo ""

    echo -e "${CYAN}Performing comprehensive hardware diagnostics...${NC}"
    echo -e "${CYAN}Checks: Disks (SMART/NVMe/Age), Memory (ECC), CPU (Thermal), RAID, Filesystem, Fans, PCI, Network, Kernel${NC}"
    echo ""

    # OPTIMIZATION: Cache /var/log/messages once (avoid 32 separate grep calls)
    # Note: Using temp file instead of variable to avoid "Argument list too long" errors
    echo -e "${CYAN}[INFO]${NC} Caching system logs for analysis..."
    # mktemp creates the file atomically with an unpredictable name, so a
    # pre-planted symlink in /tmp cannot redirect the write below.
    if ! MESSAGES_CACHE=$(mktemp "/tmp/hw_health_messages_cache_$$.XXXXXX"); then
        echo -e "${RED}[ERROR]${NC} Unable to create temporary log cache in /tmp" >&2
        return 3
    fi
    # Cleanup cache on exit (path is expanded now, so later changes to the
    # variable cannot redirect the rm)
    trap "rm -f -- \"$MESSAGES_CACHE\"" EXIT
    if [ -f /var/log/messages ]; then
        # Best effort: an unreadable log simply leaves the cache empty.
        cat /var/log/messages > "$MESSAGES_CACHE" 2>/dev/null
    fi

    # Run diagnostics with progress indicators
    echo -e "${YELLOW}[1/11]${NC} Analyzing disk SMART status and predictive failure indicators..."
    check_disk_smart

    echo -e "${YELLOW}[2/11]${NC} Checking memory health (ECC errors, OOM events, swap usage)..."
    check_memory_health

    echo -e "${YELLOW}[3/11]${NC} Monitoring CPU health (temperature, throttling, MCE errors)..."
    check_cpu_health

    echo -e "${YELLOW}[4/11]${NC} Scanning system hardware error logs..."
    check_hardware_errors

    echo -e "${YELLOW}[5/11]${NC} Verifying RAID array status..."
    check_raid_status

    echo -e "${YELLOW}[6/11]${NC} Analyzing disk I/O errors..."
    check_disk_io_errors

    echo -e "${YELLOW}[7/11]${NC} Checking for filesystem errors..."
    check_filesystem_errors

    echo -e "${YELLOW}[8/11]${NC} Monitoring system fans..."
    check_system_fans

    echo -e "${YELLOW}[9/11]${NC} Checking for PCI/PCIe errors..."
    check_pci_errors

    echo -e "${YELLOW}[10/11]${NC} Checking network interface errors..."
    check_network_errors

    echo -e "${YELLOW}[11/11]${NC} Validating kernel parameters..."
    check_kernel_parameters

    echo ""
    echo -e "${GREEN}[✓]${NC} Hardware diagnostics complete!"
    echo ""

    # Generate and display report
    echo -e "${CYAN}Generating detailed report...${NC}"
    echo ""
    generate_report

    # EXECUTIVE SUMMARY - Quick status overview
    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║ EXECUTIVE SUMMARY - Component Status ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""

    # Locals are declared only now, after the checks have run, so they cannot
    # shadow a global that one of the check functions assigns.
    local -A component_status=(
        [disk]="✅" [memory]="✅" [cpu]="✅" [raid]="✅" [fs]="✅"
        [fan]="✅" [pci]="✅" [network]="✅" [kernel]="✅"
    )
    local overall="HEALTHY"
    local critical_count=0
    local warning_count=0
    local finding header severity title component
    # Group 1: text inside the leading [...]; group 2: everything after it.
    local header_re='^\[([^]]*)\](.*)$'

    # Single pass over the findings: count severities, derive the overall
    # status and colour the matching component.
    for finding in "${FINDINGS[@]}"; do
        # Only the "[SEVERITY] Title" header is inspected; details and
        # recommendation follow the first @@@SEP@@@ delimiter.
        header="${finding%%@@@SEP@@@*}"
        if [[ "$header" =~ $header_re ]]; then
            severity="${BASH_REMATCH[1]}"
            title="${BASH_REMATCH[2]# }"
        else
            severity=""
            title="$header"
        fi

        # Overall status and counters depend on severity alone, so a finding
        # whose title matches no component still affects the exit status.
        case "$severity" in
            CRITICAL)
                critical_count=$((critical_count + 1))
                overall="CRITICAL"
                ;;
            WARNING)
                warning_count=$((warning_count + 1))
                if [ "$overall" = "HEALTHY" ]; then
                    overall="WARNING"
                fi
                ;;
            *)
                # INFO and unrecognised severities never change a status icon.
                continue
                ;;
        esac

        # Categorize by component (first matching arm wins). "I/O" and "MCE"
        # mirror the DISK/CPU grouping used by generate_report.
        case "$title" in
            *Disk*|*SMART*|*DRIVE*|*I/O*)          component="disk" ;;
            *Memory*|*ECC*|*RAM*)                  component="memory" ;;
            *CPU*|*MCE*|*thermal*|*temperature*)   component="cpu" ;;
            *RAID*)                                component="raid" ;;
            *Filesystem*|*read-only*)              component="fs" ;;
            *Fan*|*fan*)                           component="fan" ;;
            *PCI*)                                 component="pci" ;;
            *Network*|*Interface*)                 component="network" ;;
            *Kernel*|*Parameter*)                  component="kernel" ;;
            *)                                     continue ;;
        esac

        # A red icon is sticky: a later WARNING must not downgrade it.
        if [ "$severity" = "CRITICAL" ]; then
            component_status[$component]="🔴"
        elif [ "${component_status[$component]}" != "🔴" ]; then
            component_status[$component]="🟡"
        fi
    done

    # Display component summary
    echo -e " Disks/Storage: ${component_status[disk]} Memory: ${component_status[memory]} CPU: ${component_status[cpu]} RAID: ${component_status[raid]}"
    echo -e " Filesystem: ${component_status[fs]} Fans: ${component_status[fan]} PCI/PCIe: ${component_status[pci]}"
    echo -e " Network: ${component_status[network]} Kernel: ${component_status[kernel]}"
    echo ""

    # Overall status
    if [ "$overall" = "CRITICAL" ]; then
        echo -e " ${RED}${BOLD}Overall Status: 🔴 CRITICAL - $critical_count issue(s) require IMMEDIATE action!${NC}"
    elif [ "$overall" = "WARNING" ]; then
        echo -e " ${YELLOW}${BOLD}Overall Status: 🟡 WARNING - $warning_count issue(s) detected${NC}"
    else
        echo -e " ${GREEN}${BOLD}Overall Status: ✅ HEALTHY - All systems operating normally${NC}"
    fi

    echo -e "${BOLD}╚══════════════════════════════════════════════════════════════╝${NC}"
    echo ""
    echo -e "${CYAN}Full report saved to:${NC} ${BOLD}$REPORT_FILE${NC}"
    echo ""

    press_enter

    # Severity-based exit codes for monitoring system integration
    local rc=0
    case "$overall" in
        CRITICAL) rc=2 ;;
        WARNING)  rc=1 ;;
    esac

    # Only exit when the script is run standalone. When sourced (called from
    # the launcher) return instead, so the parent shell is not terminated.
    if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
        exit "$rc"
    fi
    return "$rc"
}
|
||
|
||
# Entry point. main is invoked the same way whether this file is executed
# directly or sourced (e.g. by the launcher); main itself compares
# BASH_SOURCE[0] with $0 to choose between exit and return for its status.
main
|