Linux-Server-Management-Too…/modules/diagnostics/system-health-check.sh

#!/bin/bash

################################################################################
# System Health Check - Universal Diagnostics
################################################################################
# Purpose: Comprehensive server health analysis with severity-based reporting
# Supports: cPanel, Plesk, InterWorx, CloudLinux, AlmaLinux
# Author: Server Toolkit
# Version: 1.0.0
################################################################################

# Load common functions
# SCRIPT_DIR is the absolute path two directory levels above the directory
# that holds this script (dirname of BASH_SOURCE, then ../.., then pwd).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# NOTE(review): print_info and SYS_CONTROL_PANEL are used below but not defined
# in this file; presumably they come from these two libraries - confirm.
source "$SCRIPT_DIR/lib/common-functions.sh"
source "$SCRIPT_DIR/lib/system-detect.sh"

# Output files
# Report path carries a timestamp (date +%Y%m%d_%H%M%S) so runs do not collide
# unless started within the same second.
REPORT_FILE="/tmp/system_health_report_$(date +%Y%m%d_%H%M%S).txt"
# Scratch directory created by mktemp; the analyze_* functions drop snapshot
# files (e.g. top_memory.txt, top_cpu.txt) here.
# NOTE(review): no cleanup of this directory is visible in this part of the
# file - confirm it is removed on exit.
TEMP_DIR=$(mktemp -d /tmp/health-check-XXXXXX)

# Issue tracking
# One array per severity level; add_issue appends formatted entries to them.
declare -a CRITICAL_ISSUES=()
declare -a HIGH_ISSUES=()
declare -a MEDIUM_ISSUES=()
declare -a LOW_ISSUES=()

################################################################################
# Helper Functions
################################################################################

add_issue() {
    # Record one finding in the array that matches its severity.
    #
    # Arguments:
    #   $1 - severity bucket: CRITICAL, HIGH, MEDIUM or LOW
    #   $2 - short title
    #   $3 - detail text (may span several lines)
    #   $4 - recommendation text
    #   $5 - numeric severity score shown as a percentage
    # Globals:   CRITICAL_ISSUES, HIGH_ISSUES, MEDIUM_ISSUES, LOW_ISSUES (appended)
    # Any other severity value is ignored.
    local level=$1 heading=$2 body=$3 advice=$4 pct=$5
    local entry

    # Same layout as a multi-line literal: header, details, recommendation,
    # each terminated by a newline.
    printf -v entry '[SEVERITY: %s%%] %s\n%s\nRecommendation: %s\n' \
        "$pct" "$heading" "$body" "$advice"

    case $level in
        CRITICAL) CRITICAL_ISSUES+=("$entry") ;;
        HIGH)     HIGH_ISSUES+=("$entry") ;;
        MEDIUM)   MEDIUM_ISSUES+=("$entry") ;;
        LOW)      LOW_ISSUES+=("$entry") ;;
    esac
}

################################################################################
# Phase 1: Memory Analysis
################################################################################

analyze_memory() {
    # Phase 1: RAM / swap pressure and OOM-killer activity.
    #
    # Globals:   TEMP_DIR (read) - top_memory.txt is written there when it exists
    # Arguments: none
    # Outputs:   findings are recorded through add_issue
    print_info "Analyzing memory usage..."

    # Take ONE snapshot of `free` so every figure in a report line is
    # consistent. LC_ALL=C keeps the "Mem:"/"Swap:" labels stable, and the
    # "+ 0" in awk turns missing fields into 0 instead of empty strings.
    local mem_total mem_used mem_available swap_total swap_used
    read -r mem_total mem_used mem_available swap_total swap_used < <(
        LC_ALL=C free -m 2>/dev/null | awk '
            /^Mem:/  { t = $2; u = $3; a = $7 }
            /^Swap:/ { st = $2; su = $3 }
            END      { print t + 0, u + 0, a + 0, st + 0, su + 0 }'
    )

    # Guard both divisions: an unparsable `free` leaves the totals at 0.
    local mem_percent=0 swap_percent=0
    if (( mem_total > 0 )); then
        mem_percent=$(( mem_used * 100 / mem_total ))
    fi
    if (( swap_total > 0 )); then
        swap_percent=$(( swap_used * 100 / swap_total ))
    fi

    # Check for critical memory issues
    if (( mem_percent > 90 )); then
        add_issue "CRITICAL" "MEMORY - Critical memory usage" \
            "Memory usage: ${mem_used}MB / ${mem_total}MB (${mem_percent}%)
Available: ${mem_available}MB" \
            "Free up memory immediately. Check top memory consumers." \
            95
    elif (( mem_percent > 80 )); then
        add_issue "HIGH" "MEMORY - High memory usage" \
            "Memory usage: ${mem_used}MB / ${mem_total}MB (${mem_percent}%)
Available: ${mem_available}MB" \
            "Monitor memory usage. Consider adding RAM or optimizing services." \
            75
    fi

    # Check swap usage
    if (( swap_used > 0 && swap_percent > 50 )); then
        add_issue "HIGH" "MEMORY - High swap usage" \
            "Swap usage: ${swap_used}MB / ${swap_total}MB (${swap_percent}%)
System is swapping - performance degradation likely" \
            "Identify memory-heavy processes. Reduce memory usage or add RAM." \
            80
    fi

    # Check for OOM events. The kernel log is read once so the count and the
    # sample lines come from the same data.
    local kernel_log oom_count recent_oom
    kernel_log=$(dmesg 2>/dev/null)
    oom_count=$(grep -ciE 'killed process|out of memory' <<< "$kernel_log")
    oom_count=${oom_count:-0}
    if (( oom_count > 0 )); then
        recent_oom=$(grep -i 'killed process' <<< "$kernel_log" | tail -3 | sed 's/^/  • /')
        # Only "out of memory" lines matched: fall back to a placeholder.
        recent_oom="${recent_oom:-  • Details not available}"
        add_issue "CRITICAL" "MEMORY - OOM Killer Active" \
            "OOM killer invoked ${oom_count} times since boot
Recent events:
${recent_oom}" \
            "Critical memory pressure. Reduce MySQL/Apache memory limits or add RAM.
Check: dmesg | grep -i 'killed process'" \
            95
    fi

    # Get top memory consumers: skip the ps header (line 1), keep the next 5.
    local top_mem
    top_mem=$(ps aux --sort=-%mem 2>/dev/null \
        | awk 'NR >= 2 && NR <= 6 {printf "  • %-15s %6s %s\n", $1, $4"%", $11}')
    if [ -d "${TEMP_DIR:-}" ]; then
        printf '%s\n' "$top_mem" > "$TEMP_DIR/top_memory.txt"
    fi
}

################################################################################
# Phase 1.5: Memory Configuration Analysis
################################################################################

analyze_memory_config() {
    # Phase 1.5: review swap presence and a few vm tunables.
    #
    # Arguments: none
    # Outputs:   findings are recorded through add_issue
    print_info "Analyzing memory configuration..."

    # One `free` snapshot; "+ 0" maps missing fields to 0 so the integer
    # tests below never see an empty string.
    local mem_total_mb swap_total
    read -r mem_total_mb swap_total < <(
        LC_ALL=C free -m 2>/dev/null | awk '
            /^Mem:/  { m = $2 }
            /^Swap:/ { s = $2 }
            END      { print m + 0, s + 0 }'
    )
    # Integer GB (truncated), as shown in the report text.
    local mem_total_gb=$(( mem_total_mb / 1024 ))

    # Check if swap exists (skipped when `free` could not be parsed at all)
    if (( mem_total_mb > 0 && swap_total == 0 )); then
        if (( mem_total_gb < 4 )); then
            add_issue "HIGH" "MEMORY CONFIG - No swap configured" \
                "System has no swap space
Total RAM: ${mem_total_gb}GB
For servers with <4GB RAM, swap is recommended" \
                "Create swap file:
  • dd if=/dev/zero of=/swapfile bs=1G count=2
  • chmod 600 /swapfile
  • mkswap /swapfile
  • swapon /swapfile
  • Add to /etc/fstab: /swapfile none swap sw 0 0" \
                78
        else
            add_issue "MEDIUM" "MEMORY CONFIG - No swap configured" \
                "System has no swap space (${mem_total_gb}GB RAM)
Swap provides safety net for memory pressure" \
                "Consider adding swap even with sufficient RAM." \
                55
        fi
    fi

    # Check swappiness value; fall back to 60 when the file is unreadable
    # or does not hold a plain integer.
    local swappiness
    swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null)
    [[ "$swappiness" =~ ^[0-9]+$ ]] || swappiness=60
    if (( swappiness > 30 && swap_total > 0 )); then
        add_issue "MEDIUM" "MEMORY CONFIG - Swappiness too high" \
            "Current swappiness: ${swappiness}
Recommended for servers: 10-30
High swappiness causes aggressive swapping, degrading performance" \
            "Reduce swappiness:
  • Temporary: sysctl vm.swappiness=10
  • Permanent: echo 'vm.swappiness=10' >> /etc/sysctl.conf
  • Apply: sysctl -p" \
            65
    fi

    # Check the overcommit policy (2 = strict accounting)
    local overcommit_memory
    overcommit_memory=$(cat /proc/sys/vm/overcommit_memory 2>/dev/null)
    if [[ "$overcommit_memory" == "2" ]]; then
        add_issue "LOW" "MEMORY CONFIG - Overcommit disabled" \
            "vm.overcommit_memory = 2 (disabled)
OOM killer may not activate, risking system lockup" \
            "For most servers, vm.overcommit_memory=0 (heuristic) is recommended.
Only use strict accounting (2) if you understand the implications." \
            45
    fi

    # Check for huge pages (can cause memory fragmentation).
    # The sysfs file lists all modes with the active one in brackets, e.g.
    # "always madvise [never]"; a bash regex avoids needing PCRE grep (-P).
    local thp_raw transparent_hugepage="unknown"
    local thp_re='\[([^]]+)\]'
    thp_raw=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null)
    if [[ "$thp_raw" =~ $thp_re ]]; then
        transparent_hugepage=${BASH_REMATCH[1]}
    fi
    if [ "$transparent_hugepage" = "always" ]; then
        add_issue "LOW" "MEMORY CONFIG - Transparent Huge Pages enabled" \
            "THP can cause memory fragmentation and latency spikes
Recommended: madvise or never for database servers" \
            "Disable THP:
  • echo never > /sys/kernel/mm/transparent_hugepage/enabled
  • Make permanent in /etc/rc.local or systemd" \
            48
    fi
}

################################################################################
# Phase 2: Disk Analysis
################################################################################

analyze_disk() {
    # Phase 2: filesystem space, inode usage and kernel-reported disk errors.
    #
    # Arguments: none
    # Outputs:   findings are recorded through add_issue
    #
    # df is run with -P so every filesystem occupies exactly one line (long
    # device names are otherwise wrapped), and the mount point is read as the
    # LAST field so paths containing spaces stay intact.
    print_info "Analyzing disk usage..."

    local filesystem size used avail percent mount root
    local top_dirs problem_areas analysis
    local large_logs queue_count backup_size old_backups core_dumps

    # Check all mounted filesystems
    while read -r filesystem size used avail percent mount; do
        # Skip tmpfs and other virtual filesystems
        [[ "$filesystem" =~ ^(tmpfs|devtmpfs|none) ]] && continue

        # Strip the trailing '%' and ignore rows without a numeric value.
        percent=${percent%\%}
        [[ "$percent" =~ ^[0-9]+$ ]] || continue

        # Mount point without trailing slash, so "/" + "/var/log" is
        # "/var/log" rather than "//var/log".
        root=${mount%/}

        if (( percent > 95 )); then
            # Top level directories
            top_dirs=$(du -sh "$root"/* 2>/dev/null | sort -rh | head -5 | sed 's/^/  • /')

            # Common problem areas - check each
            problem_areas=""

            # Check for large log files
            if [ -d "$root/var/log" ]; then
                large_logs=$(find "$root/var/log" -type f -size +100M 2>/dev/null \
                    | head -3 | sed 's/^/  • /')
                if [ -n "$large_logs" ]; then
                    problem_areas="${problem_areas}Large log files found:\n${large_logs}\n\n"
                fi
            fi

            # Check for email queue (common issue)
            if [ -d "$root/var/spool/exim" ]; then
                queue_count=$(find "$root/var/spool/exim/input" -type f 2>/dev/null | wc -l)
                if (( queue_count > 1000 )); then
                    problem_areas="${problem_areas}Large email queue: ${queue_count} messages\n\n"
                fi
            fi

            # Check for backups in home directories
            if [ -d "$root/home" ]; then
                backup_size=$(find "$root/home" -type f \
                    \( -name "*.tar.gz" -o -name "*.zip" -o -name "*.sql.gz" -o -name "backup-*" \) \
                    -size +100M 2>/dev/null | wc -l)
                if (( backup_size > 0 )); then
                    problem_areas="${problem_areas}Found ${backup_size} large backup/archive files in /home\n\n"
                fi
            fi

            # Check for old backups in /backup
            if [ -d "$root/backup" ]; then
                old_backups=$(find "$root/backup" -type f -mtime +30 -size +100M 2>/dev/null | wc -l)
                if (( old_backups > 0 )); then
                    problem_areas="${problem_areas}Found ${old_backups} old backups (>30 days, >100MB) in /backup\n\n"
                fi
            fi

            # Check for core dumps: regular files only, and stay on this
            # filesystem so /proc, /sys and /dev entries named "core" are
            # not counted.
            core_dumps=$(find "${root:-/}" -xdev -maxdepth 3 -type f \
                \( -name "core.*" -o -name "core" \) 2>/dev/null | wc -l)
            if (( core_dumps > 5 )); then
                problem_areas="${problem_areas}Found ${core_dumps} core dump files\n\n"
            fi

            analysis="Top directories by size:\n${top_dirs}\n\n${problem_areas}"

            add_issue "CRITICAL" "DISK - ${mount} critically full" \
                "Filesystem: ${filesystem}
Usage: ${used} / ${size} (${percent}%)
Available: ${avail}

${analysis}" \
                "Free up space immediately:
  • Review top consumers above
  • Clean old logs: find /var/log -type f -mtime +30 -delete
  • Clean package cache: yum clean all
  • Clean email queue: exim -bp | wc -l (check queue)
  • Remove old backups: find /backup -mtime +30 -delete
  • Find large files: find ${mount} -type f -size +100M -exec ls -lh {} \\;" \
                98
        elif (( percent > 85 )); then
            # Quick analysis for high usage
            top_dirs=$(du -sh "$root"/* 2>/dev/null | sort -rh | head -3 | sed 's/^/  • /')

            add_issue "HIGH" "DISK - ${mount} high usage" \
                "Filesystem: ${filesystem}
Usage: ${used} / ${size} (${percent}%)
Available: ${avail}

Top directories:
${top_dirs}" \
                "Investigate disk usage:
  • du -sh ${mount}/* | sort -rh | head -10
  • Clean up unnecessary files or expand partition
  • Check logs: ls -lh /var/log/*.log
  • Check email queue: exim -bpc (if using Exim)" \
                75
        elif (( percent > 75 )); then
            add_issue "MEDIUM" "DISK - ${mount} approaching capacity" \
                "Usage: ${used} / ${size} (${percent}%)" \
                "Monitor disk usage. Plan for expansion." \
                65
        fi
    done < <(LC_ALL=C df -hP 2>/dev/null | tail -n +2)

    # Check inode usage. Filesystems without inode accounting report "-"
    # in the percentage column; those rows are skipped.
    while read -r filesystem _ _ _ percent mount; do
        [[ "$filesystem" =~ ^(tmpfs|devtmpfs|none) ]] && continue
        percent=${percent%\%}
        [[ "$percent" =~ ^[0-9]+$ ]] || continue

        if (( percent > 90 )); then
            add_issue "HIGH" "DISK - ${mount} inode exhaustion" \
                "Inode usage: ${percent}%
Filesystem: ${filesystem}" \
                "Find directories with many small files:
  • find ${mount} -xdev -type d -exec sh -c 'echo \$(ls -A {} | wc -l) {}' \\; | sort -rn | head -10" \
                85
        fi
    done < <(LC_ALL=C df -iP 2>/dev/null | tail -n +2)

    # Check for disk errors. The kernel log is read once so the count and
    # the sample come from the same data.
    # NOTE(review): "sector" and "SMART" are broad keywords and may also match
    # benign boot messages - consider tightening the pattern.
    local kernel_log disk_errors error_sample
    kernel_log=$(dmesg 2>/dev/null)
    disk_errors=$(grep -ciE 'I/O error|sector|SMART' <<< "$kernel_log")
    disk_errors=${disk_errors:-0}
    if (( disk_errors > 0 )); then
        error_sample=$(grep -iE 'I/O error|sector|SMART' <<< "$kernel_log" | tail -3 | sed 's/^/  • /')
        error_sample="${error_sample:-  • Check dmesg}"
        add_issue "CRITICAL" "DISK - Hardware errors detected" \
            "Found ${disk_errors} disk error messages
Recent errors:
${error_sample}" \
            "Check SMART status: smartctl -a /dev/sda
Backup data immediately if errors persist." \
            92
    fi
}

################################################################################
# Phase 3: CPU Analysis
################################################################################

analyze_cpu() {
    # Phase 3: compare the load average against per-core thresholds.
    #
    # Globals:   TEMP_DIR (read) - top_cpu.txt is written there when it exists
    # Arguments: none
    # Outputs:   findings are recorded through add_issue
    #
    # Floating-point math is done in awk, so `bc` is not required.
    print_info "Analyzing CPU usage..."

    # Get top CPU consumers (ps header skipped, next 10 rows kept).
    if [ -d "${TEMP_DIR:-}" ]; then
        ps aux --sort=-%cpu 2>/dev/null | awk 'NR >= 2 && NR <= 11' > "$TEMP_DIR/top_cpu.txt"
    fi

    local cpu_cores
    cpu_cores=$(nproc 2>/dev/null)
    [[ "$cpu_cores" =~ ^[1-9][0-9]*$ ]] || cpu_cores=1

    # One uptime call so the three averages belong together. LC_ALL=C forces
    # '.' as decimal separator; the list separator ',' is turned into spaces.
    local load_1min load_5min load_15min
    read -r load_1min load_5min load_15min < <(
        LC_ALL=C uptime 2>/dev/null \
            | awk -F'load average:' '{ gsub(/,/, " ", $2); print $2 }'
    )

    # Nothing sensible to evaluate when the load could not be parsed.
    local num_re='^[0-9]+([.][0-9]+)?$'
    if ! [[ "$load_1min" =~ $num_re && "$load_5min" =~ $num_re ]]; then
        return 0
    fi

    # Thresholds: healthy < 0.7/core, warning at 1.0/core, critical at 2.0/core.
    # Comparisons use the rounded (one decimal) thresholds that are also shown
    # in the report. The trend label is last because it may contain a space.
    local load_per_core healthy_load warning_load critical_load level load_trend
    read -r load_per_core healthy_load warning_load critical_load level load_trend < <(
        LC_ALL=C awk -v l1="$load_1min" -v l5="$load_5min" -v n="$cpu_cores" 'BEGIN {
            l1 += 0; l5 += 0
            healthy  = sprintf("%.1f", n * 0.7)
            warning  = sprintf("%.1f", n * 1.0)
            critical = sprintf("%.1f", n * 2.0)

            if      (l1 > l5 * 1.2) trend = "increasing rapidly"
            else if (l1 > l5)       trend = "increasing"
            else if (l1 < l5 * 0.8) trend = "decreasing"
            else                    trend = "stable"

            if      (l1 > critical + 0) level = "critical"
            else if (l1 > warning + 0)  level = "warning"
            else if (l1 > healthy + 0)  level = "elevated"
            else                        level = "ok"

            printf "%.2f %s %s %s %s %s\n", l1 / n, healthy, warning, critical, level, trend
        }'
    )

    local top_cpu
    case "$level" in
        critical)
            top_cpu=$(ps aux --sort=-%cpu 2>/dev/null \
                | awk 'NR >= 2 && NR <= 6 {printf "  • %-15s %6s %s\n", $1, $3"%", $11}')
            add_issue "CRITICAL" "CPU - Extreme load" \
                "Load average: ${load_1min} / ${load_5min} / ${load_15min}
CPU cores: ${cpu_cores}
Load per core: ${load_per_core}
Trend: ${load_trend}

Healthy load for this server: < ${healthy_load}
Warning threshold: ${warning_load}
Critical threshold: ${critical_load}

Top CPU consumers:
${top_cpu}" \
                "Immediate action required:
  1. Identify runaway processes: ps aux --sort=-%cpu | head -20
  2. Kill if necessary: kill -9 [PID]
  3. Check if under attack: Main Menu → Security → Bot Analyzer" \
                92
            ;;
        warning)
            top_cpu=$(ps aux --sort=-%cpu 2>/dev/null \
                | awk 'NR >= 2 && NR <= 4 {printf "  • %-15s %6s %s\n", $1, $3"%", $11}')
            add_issue "HIGH" "CPU - High load" \
                "Load average: ${load_1min} / ${load_5min} / ${load_15min}
CPU cores: ${cpu_cores}
Load per core: ${load_per_core}
Trend: ${load_trend}

Healthy load: < ${healthy_load}
Current: ${load_1min} (above warning threshold of ${warning_load})

Top CPU consumers:
${top_cpu}" \
                "Monitor and optimize:
  • Check: ps aux --sort=-%cpu | head -20
  • Review high-CPU processes and optimize if possible" \
                76
            ;;
        elevated)
            add_issue "MEDIUM" "CPU - Elevated load" \
                "Load average: ${load_1min} / ${load_5min} / ${load_15min}
Healthy threshold: < ${healthy_load}
Trend: ${load_trend}" \
                "Monitor trends. Load is elevated but not critical yet." \
                62
            ;;
    esac
    return 0
}

################################################################################
# Phase 4: MySQL/MariaDB Health
################################################################################

analyze_mysql() {
    # Phase 4: MySQL/MariaDB availability, connection headroom and InnoDB
    # buffer pool size relative to RAM.
    #
    # Arguments: none
    # Outputs:   findings are recorded through add_issue
    print_info "Analyzing MySQL/MariaDB..."

    # Check if the client is installed at all
    if ! command -v mysql >/dev/null 2>&1; then
        add_issue "LOW" "MYSQL - Not installed" \
            "MySQL/MariaDB not found on system" \
            "No action needed if not using databases." \
            20
        return
    fi

    # Check if the server process is running
    if ! pgrep -x "mysqld|mariadbd" >/dev/null; then
        add_issue "CRITICAL" "MYSQL - Service not running" \
            "MySQL/MariaDB process not found" \
            "Start service: systemctl start mysql / mariadb" \
            95
        return
    fi

    # Get MySQL stats (if we can connect); an empty result means the client
    # could not log in, in which case the remaining checks are skipped.
    local mysql_stats
    mysql_stats=$(mysql -e "SHOW GLOBAL STATUS LIKE 'Threads_connected'; SHOW GLOBAL STATUS LIKE 'Max_used_connections'; SHOW VARIABLES LIKE 'max_connections';" 2>/dev/null)
    [ -n "$mysql_stats" ] || return 0

    # Match variable names exactly (column 1) and coerce missing values to 0.
    local threads_connected max_used max_connections
    read -r threads_connected max_used max_connections < <(
        awk '
            $1 == "Threads_connected"    { t = $2 }
            $1 == "Max_used_connections" { u = $2 }
            $1 == "max_connections"      { m = $2 }
            END { print t + 0, u + 0, m + 0 }' <<< "$mysql_stats"
    )

    # Guard the division: max_connections is 0 when it could not be read.
    local connection_percent=0
    if (( max_connections > 0 )); then
        connection_percent=$(( threads_connected * 100 / max_connections ))
    fi

    if (( connection_percent > 80 )); then
        add_issue "HIGH" "MYSQL - Connection limit approaching" \
            "Current connections: ${threads_connected} / ${max_connections} (${connection_percent}%)
Max used: ${max_used}" \
            "Investigate and fix:
  1. Find slow queries: Main Menu → Performance → MySQL Query Analyzer
  2. Increase max_connections in /etc/my.cnf if needed
  3. Check for connection leaks: show processlist;" \
            78
    fi

    # Check InnoDB buffer pool vs RAM (both truncated to whole GB)
    local innodb_buffer mem_total_mb
    innodb_buffer=$(mysql -e "SHOW VARIABLES LIKE 'innodb_buffer_pool_size';" 2>/dev/null \
        | awk '$1 == "innodb_buffer_pool_size" { print $2; exit }')
    [[ "$innodb_buffer" =~ ^[0-9]+$ ]] || innodb_buffer=0
    mem_total_mb=$(LC_ALL=C free -m 2>/dev/null | awk '/^Mem:/ { print $2; exit }')
    [[ "$mem_total_mb" =~ ^[0-9]+$ ]] || mem_total_mb=0

    local innodb_buffer_gb=$(( innodb_buffer / 1024 / 1024 / 1024 ))
    local mem_total_gb=$(( mem_total_mb / 1024 ))

    # Flag a pool that leaves less than ~2GB for the OS and other services.
    if (( innodb_buffer_gb > 0 && innodb_buffer_gb > mem_total_gb - 2 )); then
        add_issue "HIGH" "MYSQL - InnoDB buffer pool too large" \
            "innodb_buffer_pool_size: ${innodb_buffer_gb}GB
Total RAM: ${mem_total_gb}GB
System has insufficient free RAM for OS and other services" \
            "Reduce innodb_buffer_pool_size to ~${mem_total_gb}GB * 0.6 = ~$((mem_total_gb * 6 / 10))GB
Edit /etc/my.cnf and restart MySQL" \
            82
    fi
    return 0
}

################################################################################
# Phase 5: Apache Health
################################################################################

analyze_apache() {
    # Phase 5: Apache availability, error-log symptoms and process count.
    #
    # Arguments: none
    # Outputs:   findings are recorded through add_issue
    print_info "Analyzing Apache..."

    # Bail out early when no httpd/apache2 process exists
    if ! pgrep -x "httpd|apache2" >/dev/null; then
        add_issue "CRITICAL" "APACHE - Service not running" \
            "Apache process not found" \
            "Start service: systemctl start httpd / apache2" \
            95
        return
    fi

    # Pick the first error log that exists (RHEL-style path first, then
    # Debian-style)
    local apache_error_log="" candidate
    for candidate in "/var/log/httpd/error_log" "/var/log/apache2/error.log"; do
        if [ -f "$candidate" ]; then
            apache_error_log=$candidate
            break
        fi
    done

    if [ -n "$apache_error_log" ]; then
        # How often the worker limit was reported; anything that is not a
        # plain number (e.g. unreadable file) counts as zero
        local max_workers_hits
        max_workers_hits=$(grep -c "server reached MaxRequestWorkers" "$apache_error_log" 2>/dev/null)
        [[ "$max_workers_hits" =~ ^[0-9]+$ ]] || max_workers_hits=0

        if (( max_workers_hits > 20 )); then
            add_issue "CRITICAL" "APACHE - MaxRequestWorkers limit hit frequently" \
                "Server reached MaxRequestWorkers limit ${max_workers_hits} times
This causes connection refusal and 'server busy' errors" \
                "Increase MaxRequestWorkers in Apache config (if RAM allows)
OR investigate slow PHP scripts / database queries causing workers to hang
Check: apachectl -M | grep mpm" \
                88
        elif (( max_workers_hits > 5 )); then
            add_issue "HIGH" "APACHE - MaxRequestWorkers limit reached" \
                "Limit hit ${max_workers_hits} times" \
                "Monitor and consider increasing MaxRequestWorkers." \
                72
        fi

        # Segfault lines in the same log
        local segfaults
        segfaults=$(grep -c "segfault" "$apache_error_log" 2>/dev/null)
        [[ "$segfaults" =~ ^[0-9]+$ ]] || segfaults=0

        if (( segfaults > 0 )); then
            add_issue "HIGH" "APACHE - Segmentation faults detected" \
                "Found ${segfaults} segfault events
May indicate corrupted modules or memory issues" \
                "Check error log: tail -100 ${apache_error_log} | grep segfault
Update/reinstall problematic Apache modules" \
                78
        fi
    fi

    # Number of Apache processes currently alive
    local apache_procs
    apache_procs=$(pgrep -c "httpd|apache2" || echo "0")
    if [ "$apache_procs" -gt 200 ]; then
        add_issue "MEDIUM" "APACHE - High process count" \
            "Apache processes: ${apache_procs}
May indicate connection buildup or slow backend" \
            "Check Apache status: apachectl fullstatus
Review MaxRequestWorkers setting" \
            65
    fi
}

################################################################################
# Phase 6: PHP-FPM Health (cPanel/Plesk)
################################################################################

analyze_php_fpm() {
    # Phase 6: flag users that own an unusually large number of PHP-FPM
    # pool workers.
    #
    # Globals:   TRUEUSERDOMAINS_FILE (read, optional) - user/domain map to
    #            consult; defaults to /etc/trueuserdomains
    # Arguments: none
    # Outputs:   findings are recorded through add_issue
    print_info "Analyzing PHP-FPM..."

    # Check if PHP-FPM is running
    if ! pgrep -f "php-fpm" >/dev/null; then
        add_issue "LOW" "PHP-FPM - Not running or not installed" \
            "PHP-FPM processes not found
May be using mod_php instead" \
            "No action needed if using Apache mod_php." \
            30
        return
    fi

    local domains_file=${TRUEUSERDOMAINS_FILE:-/etc/trueuserdomains}
    local count user user_domain

    # Count pool workers per owning user, ten busiest users first.
    # The [p] in the pattern keeps this awk process from matching itself
    # in the ps listing.
    while read -r count user; do
        [[ "$count" =~ ^[0-9]+$ ]] || continue
        (( count > 10 )) || continue

        # The map file holds "domain: user" lines, so look the user up in
        # the SECOND column and report the first column.
        user_domain=""
        if [ -r "$domains_file" ]; then
            user_domain=$(awk -F': *' -v u="$user" '$2 == u { print $1; exit }' "$domains_file")
        fi
        user_domain=${user_domain:-unknown}

        add_issue "HIGH" "PHP-FPM - User '${user}' has many processes" \
            "PHP-FPM processes: ${count}
User: ${user}
Domain: ${user_domain}
Possible causes: Stuck processes, heavy traffic, slow scripts, bot attacks, database issues" \
            "Investigate with specialized tools:
  1. Check for bot attacks: Main Menu → Security → Bot Analyzer (analyze ${user_domain})
  2. Check for slow MySQL queries: Main Menu → Performance → MySQL Query Analyzer
  3. Manual check: ps aux | grep ${user} | grep php-fpm
  4. Kill if stuck: pkill -9 -u ${user} php-fpm" \
            76
    done < <(
        ps aux 2>/dev/null \
            | awk '/[p]hp-fpm: pool/ { print $1 }' \
            | sort | uniq -c | sort -rn | head -10
    )
    return 0
}

################################################################################
# Phase 7: Log Analysis - Security
################################################################################

analyze_security_logs() {
    # Phase 7: SSH brute force, root logins, sudo volume and suspicious
    # process paths.
    #
    # Globals:   SYS_CONTROL_PANEL (read) - selects cPHulk vs fail2ban advice
    # Arguments: $1 - (optional) auth log to analyse; when omitted,
    #                 /var/log/secure or /var/log/auth.log is auto-detected
    # Outputs:   findings are recorded through add_issue
    print_info "Analyzing security logs..."

    local secure_log=${1:-}
    if [ -z "$secure_log" ]; then
        if [ -f "/var/log/secure" ]; then
            secure_log="/var/log/secure"
        elif [ -f "/var/log/auth.log" ]; then
            secure_log="/var/log/auth.log"
        fi
    elif [ ! -f "$secure_log" ]; then
        secure_log=""
    fi

    # awk program: print the token after the LAST "from" on the line. A fixed
    # offset from the end of the line breaks on publickey logins, which
    # append "ssh2: RSA SHA256:..." after the port.
    local src_ip_awk='{ for (i = NF - 1; i >= 1; i--) if ($i == "from") { print $(i + 1); next } }'

    if [ -n "$secure_log" ]; then
        # Check for failed SSH attempts
        local failed_ssh
        failed_ssh=$(grep -c "Failed password" "$secure_log" 2>/dev/null)
        [[ "$failed_ssh" =~ ^[0-9]+$ ]] || failed_ssh=0

        if (( failed_ssh > 100 )); then
            local top_ips
            top_ips=$(grep "Failed password" "$secure_log" 2>/dev/null \
                | awk "$src_ip_awk" | sort | uniq -c | sort -rn | head -5 | sed 's/^/  • /')

            # Check if cPHulkd is available (cPanel)
            local protection_cmd=""
            if [ "${SYS_CONTROL_PANEL:-}" = "cpanel" ] && [ -x "/usr/local/cpanel/bin/cphulk_pam_ctl" ]; then
                # Check if cPHulkd is enabled; no "enabled" line in the
                # status output is treated as disabled
                local cphulk_enabled
                cphulk_enabled=$(/usr/local/cpanel/bin/cphulk_pam_ctl --status 2>/dev/null | grep -i "enabled" || echo "disabled")
                if echo "$cphulk_enabled" | grep -qi "disabled"; then
                    protection_cmd="Enable cPHulk (cPanel's brute force protection):
  • Via menu: Main Menu → Security → Authentication Security → Enable cPHulk Protection
  • Via command: bash modules/security/enable-cphulk.sh
  • Manual enable: /usr/local/cpanel/bin/cphulk_pam_ctl --enable
  • The setup wizard will automatically import your CSF whitelist to cPHulk"
                else
                    protection_cmd="cPHulk is enabled. Ensure trusted IPs are whitelisted:
  • Via menu: Main Menu → Security → Authentication Security → Enable cPHulk Protection
  • Via command: bash modules/security/enable-cphulk.sh
  • Manual whitelist: whmapi1 cphulkd_add_whitelist ip=YOUR_IP
  • View blocked IPs: whmapi1 cphulkd_list_blocks"
                fi
            else
                protection_cmd="Install automatic blocking: yum install fail2ban"
            fi

            add_issue "HIGH" "SECURITY - High SSH brute force attempts" \
                "Failed SSH login attempts: ${failed_ssh}
Top attacking IPs:
${top_ips}" \
                "Investigate and block:
  1. Analyze attack patterns: Main Menu → Security → Bot Analyzer
  2. Block IPs manually: csf -d [IP]
  3. ${protection_cmd}" \
                80
        elif (( failed_ssh > 50 )); then
            add_issue "MEDIUM" "SECURITY - Moderate SSH brute force" \
                "Failed attempts: ${failed_ssh}" \
                "Monitor and consider IP blocking if it increases." \
                60
        fi

        # Check for successful root logins. The pattern pins the user name
        # so accounts that merely contain "root" (e.g. "notroot") are not
        # counted.
        local root_login_re='Accepted [^ ]+ for root from '
        local root_logins
        root_logins=$(grep -cE "$root_login_re" "$secure_log" 2>/dev/null)
        [[ "$root_logins" =~ ^[0-9]+$ ]] || root_logins=0

        if (( root_logins > 0 )); then
            local root_login_ips root_login_times
            root_login_ips=$(grep -E "$root_login_re" "$secure_log" 2>/dev/null \
                | awk "$src_ip_awk" | sort -u | sed 's/^/  • /')
            root_login_times=$(grep -E "$root_login_re" "$secure_log" 2>/dev/null \
                | awk '{print $1, $2, $3}' | tail -5 | sed 's/^/  • /')

            add_issue "MEDIUM" "SECURITY - Root SSH logins detected" \
                "Successful root logins: ${root_logins}
Source IPs:
${root_login_ips}

Recent logins:
${root_login_times}" \
                "Review if these IPs are authorized:
  • Disable root SSH: Set 'PermitRootLogin no' in /etc/ssh/sshd_config
  • Use SSH keys instead of passwords
  • Check: last | grep root" \
                68
        fi

        # Check for suspicious sudo usage
        local sudo_attempts
        sudo_attempts=$(grep -c "sudo.*COMMAND" "$secure_log" 2>/dev/null)
        [[ "$sudo_attempts" =~ ^[0-9]+$ ]] || sudo_attempts=0

        if (( sudo_attempts > 100 )); then
            # The invoking user is the token right after the "sudo:" tag
            # (the tag itself is the process name, not a user).
            local top_sudo_users
            top_sudo_users=$(grep "sudo.*COMMAND" "$secure_log" 2>/dev/null \
                | awk '{ for (i = 1; i < NF; i++) if ($i ~ /^sudo(\[[0-9]+\])?:$/) { print $(i + 1); next } }' \
                | sort | uniq -c | sort -rn | head -5 | sed 's/^/  • /')
            add_issue "MEDIUM" "SECURITY - High sudo activity" \
                "Sudo command executions: ${sudo_attempts}
Top users:
${top_sudo_users}" \
                "Review sudo logs for unusual activity:
  • grep sudo /var/log/secure | tail -50" \
                65
        fi
    fi

    # Check for potential rootkit indicators: process lines containing
    # "../". One ps snapshot feeds both the count and the sample list.
    # NOTE(review): this also matches harmless commands whose arguments
    # contain "../" - consider narrowing to the executable path.
    local suspicious_lines suspicious_processes
    suspicious_lines=$(ps aux 2>/dev/null | grep -E "\.\.\/\.\.|^\.|\.\.\/" | grep -v grep)
    suspicious_processes=$(grep -c . <<< "$suspicious_lines")
    suspicious_processes=${suspicious_processes:-0}

    if (( suspicious_processes > 0 )); then
        local sus_proc_list
        sus_proc_list=$(head -5 <<< "$suspicious_lines" | awk '{print $11}' | sed 's/^/  • /')
        add_issue "CRITICAL" "SECURITY - Suspicious processes detected" \
            "Found ${suspicious_processes} processes with suspicious paths:
${sus_proc_list}

May indicate rootkit or malware" \
            "Run rootkit scanner immediately:
  • Install chkrootkit: yum install chkrootkit
  • Or rkhunter: yum install rkhunter
  • Check processes: ps aux | grep -E '\.\./\.\.'" \
            95
    fi
    return 0
}

################################################################################
# Phase 8: System Messages Log
################################################################################

analyze_messages_log() {
    # Phase 8: scan the system log for kernel panics, hardware errors and
    # network link problems.
    #
    # Arguments: $1 - (optional) log file to scan; defaults to /var/log/messages
    # Outputs:   findings are recorded through add_issue; does nothing when
    #            the log file does not exist
    print_info "Analyzing system messages..."

    local messages_log=${1:-/var/log/messages}
    [ -f "$messages_log" ] || return 0

    # Check for kernel panics. Matching is case-insensitive because the
    # kernel writes "Kernel panic - not syncing" with a capital K.
    local kernel_panics
    kernel_panics=$(grep -ciE 'kernel panic|Oops:' "$messages_log" 2>/dev/null)
    [[ "$kernel_panics" =~ ^[0-9]+$ ]] || kernel_panics=0
    if (( kernel_panics > 0 )); then
        add_issue "CRITICAL" "SYSTEM - Kernel panics detected" \
            "Found ${kernel_panics} kernel panic events
System stability compromised" \
            "Review: grep -i 'kernel panic' ${messages_log}
Update kernel or investigate hardware issues" \
            98
    fi

    # Check for hardware errors
    # NOTE(review): "MCE" and "ECC" are short tokens that can also appear in
    # informational boot messages - confirm against real logs.
    local hw_errors
    hw_errors=$(grep -cE 'Hardware Error|MCE|ECC' "$messages_log" 2>/dev/null)
    [[ "$hw_errors" =~ ^[0-9]+$ ]] || hw_errors=0
    if (( hw_errors > 0 )); then
        # Get actual error samples
        local error_samples
        error_samples=$(grep -E 'Hardware Error|MCE|ECC' "$messages_log" 2>/dev/null | tail -5 | sed 's/^/  /')

        add_issue "HIGH" "SYSTEM - Hardware errors detected" \
            "Found ${hw_errors} hardware error messages
May indicate failing hardware (RAM, CPU, disk)

Recent errors:
${error_samples}" \
            "Investigate hardware health:
  • Check RAM: dmidecode -t memory | grep -E 'Size|Type|Speed|Error'
  • Check disk SMART: smartctl -a /dev/sda | grep -E 'Error|Failed|Health'
  • Check dmesg: dmesg | grep -i 'error\|fail' | tail -20
  • Run memtest86+ if RAM errors detected
  • Contact hosting provider if persistent" \
            85
    fi

    # Check for network issues
    local net_errors
    net_errors=$(grep -cE 'link is down|no carrier' "$messages_log" 2>/dev/null)
    [[ "$net_errors" =~ ^[0-9]+$ ]] || net_errors=0
    if (( net_errors > 5 )); then
        add_issue "MEDIUM" "NETWORK - Connection issues" \
            "Found ${net_errors} network link issues" \
            "Check network cable/switch, review: ip link show" \
            62
    fi
    return 0
}

################################################################################
# Phase 9: Cron Job Health
################################################################################

analyze_cron() {
    # Phase 9: count failing cron jobs logged on the calendar day that lies
    # 24 hours in the past.
    #
    # Arguments: $1 - (optional) cron log to scan; defaults to /var/log/cron
    # Outputs:   findings are recorded through add_issue; does nothing when
    #            the log file does not exist
    print_info "Analyzing cron jobs..."

    local cron_log=${1:-/var/log/cron}
    [ -f "$cron_log" ] || return 0

    # Traditional syslog stamps look like "Jan  5": English month name and a
    # SPACE-padded day. %e (not %d, which zero-pads) and LC_ALL=C reproduce
    # exactly that, so single-digit days and non-English locales match too.
    local day_stamp
    day_stamp=$(LC_ALL=C date -d '24 hours ago' '+%b %e' 2>/dev/null)
    [ -n "$day_stamp" ] || return 0

    # Collect the failing lines once; the count and the sample below use the
    # same pattern and the same data.
    local failed_lines cron_errors
    failed_lines=$(grep "^${day_stamp} " "$cron_log" 2>/dev/null \
        | grep -E 'error|failed|No such file')
    cron_errors=$(grep -c . <<< "$failed_lines")
    cron_errors=${cron_errors:-0}

    if (( cron_errors > 20 )); then
        local error_sample
        error_sample=$(tail -5 <<< "$failed_lines" | sed 's/^/  • /')
        add_issue "HIGH" "CRON - Many failed jobs" \
            "Found ${cron_errors} cron errors in last 24h
Sample errors:
${error_sample}" \
            "Review cron logs: tail -100 /var/log/cron
Fix failing scripts or disable problematic jobs" \
            75
    elif (( cron_errors > 5 )); then
        add_issue "MEDIUM" "CRON - Some failed jobs" \
            "Found ${cron_errors} cron errors in last 24h" \
            "Review: grep error /var/log/cron | tail -20" \
            60
    fi
    return 0
}

################################################################################
# Phase 10: Network Analysis
################################################################################

# Inspect kernel network counters and report pressure points via add_issue:
#   * nf_conntrack table usage (CRITICAL above 90%, HIGH above 75%)
#   * TCP retransmission rate (HIGH above 5% once more than 1,000,000
#     segments have been sent)
#   * listen queue overflows (MEDIUM above 100)
#
# Globals:   none
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0
analyze_network() {
    print_info "Analyzing network..."

    # --- Connection tracking table -------------------------------------------
    if [ -f "/proc/sys/net/netfilter/nf_conntrack_count" ]; then
        local conntrack_count conntrack_max
        conntrack_count=$(cat /proc/sys/net/netfilter/nf_conntrack_count 2>/dev/null || echo "0")
        conntrack_max=$(cat /proc/sys/net/netfilter/nf_conntrack_max 2>/dev/null || echo "100000")
        [[ "$conntrack_count" =~ ^[0-9]+$ ]] || conntrack_count=0
        [[ "$conntrack_max" =~ ^[0-9]+$ ]] || conntrack_max=0

        # A zero or unreadable maximum would make the percentage a division
        # by zero, so the check is skipped in that case.
        if [ "$conntrack_max" -gt 0 ]; then
            local conntrack_percent=$((conntrack_count * 100 / conntrack_max))

            if [ "$conntrack_percent" -gt 90 ]; then
                add_issue "CRITICAL" "NETWORK - Connection tracking table near limit" \
                    "nf_conntrack usage: ${conntrack_count} / ${conntrack_max} (${conntrack_percent}%)
New connections may be dropped" \
                    "Increase limit:
  • sysctl -w net.netfilter.nf_conntrack_max=$((conntrack_max * 2))
  • Add to /etc/sysctl.conf: net.netfilter.nf_conntrack_max=$((conntrack_max * 2))
  • Or investigate connection leaks" \
                    88
            elif [ "$conntrack_percent" -gt 75 ]; then
                add_issue "HIGH" "NETWORK - Connection tracking table high" \
                    "nf_conntrack usage: ${conntrack_count} / ${conntrack_max} (${conntrack_percent}%)" \
                    "Monitor and consider increasing limit if it continues to grow." \
                    72
            fi
        fi
    fi

    # --- TCP counters ---------------------------------------------------------
    # netstat -s is collected once and reused for every counter below.
    local netstat_stats
    netstat_stats=$(netstat -s 2>/dev/null)

    # NOTE(review): the patterns are deliberately loose because older
    # net-tools releases are believed to print "segments retransmited" and
    # "segments send out" - confirm against the netstat builds in use.
    local tcp_retrans tcp_out
    tcp_retrans=$(awk '/segments retransmit/ {print $1; exit}' <<< "$netstat_stats")
    tcp_out=$(awk '/segments sen[dt] out/ {print $1; exit}' <<< "$netstat_stats")
    [[ "$tcp_retrans" =~ ^[0-9]+$ ]] || tcp_retrans=0
    [[ "$tcp_out" =~ ^[0-9]+$ ]] || tcp_out=1

    if [ "$tcp_out" -gt 1000000 ]; then
        # Integer arithmetic in hundredths of a percent: no dependency on bc,
        # and the value is truncated to two decimals.
        local retrans_x100=$((tcp_retrans * 10000 / tcp_out))
        if [ "$retrans_x100" -gt 500 ]; then
            local retrans_percent
            printf -v retrans_percent '%d.%02d' $((retrans_x100 / 100)) $((retrans_x100 % 100))

            # Interface of the default route: the word following "dev".
            local default_iface current_mtu=""
            default_iface=$(ip route 2>/dev/null | awk '
                /^default/ { for (i = 1; i < NF; i++) if ($i == "dev") { print $(i + 1); exit } }')
            if [ -n "$default_iface" ]; then
                current_mtu=$(ip link show "$default_iface" 2>/dev/null | awk '
                    { for (i = 1; i < NF; i++) if ($i == "mtu") { print $(i + 1); exit } }')
            fi
            current_mtu=${current_mtu:-unknown}

            add_issue "HIGH" "NETWORK - High TCP retransmission rate" \
                "Retransmissions: ${retrans_percent}% of total segments (${tcp_retrans} of ${tcp_out})
This indicates network issues, congestion, or bandwidth problems

Common causes:
  • Network congestion or packet loss
  • MTU mismatch (current: ${current_mtu})
  • Bandwidth saturation
  • Faulty network hardware
  • ISP/hosting provider issues" \
                "Diagnose and resolve network issues:
  1. Test packet loss: ping -c 100 8.8.8.8 | grep loss
  2. Check MTU: ip link show | grep mtu (should be 1500 for most networks)
  3. Monitor bandwidth: install vnstat for tracking (yum install vnstat)
  4. Real-time monitor: iftop -i ${default_iface:-eth0} (install: yum install iftop)
  5. Check provider status: Contact hosting provider if persistent
  6. Analyze traffic: Main Menu → Security → Bot Analyzer (check for attacks)
  NOTE: For detailed bandwidth analysis, use: Main Menu → Network Diagnostics (when available)" \
                74
        fi
    fi

    # --- Listen queue overflows ----------------------------------------------
    local listen_overflows
    listen_overflows=$(awk '/times the listen queue of a socket overflowed/ {print $1; exit}' \
        <<< "$netstat_stats")
    [[ "$listen_overflows" =~ ^[0-9]+$ ]] || listen_overflows=0
    if [ "$listen_overflows" -gt 100 ]; then
        add_issue "MEDIUM" "NETWORK - Listen queue overflows detected" \
            "Listen queue overflows: ${listen_overflows}
Applications may be dropping connections" \
            "Increase net.core.somaxconn:
  • sysctl -w net.core.somaxconn=4096
  • Add to /etc/sysctl.conf" \
            68
    fi
}

################################################################################
# Phase 11: Time/NTP Analysis
################################################################################

# Check that a time synchronisation daemon is running and, for chronyd, that
# the system clock offset reported by "chronyc tracking" is not above one
# second.
#
# Globals:   none
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0
analyze_time() {
    print_info "Analyzing time synchronization..."

    # Check if chronyd or ntpd is running
    local time_service=""
    if pgrep -x chronyd >/dev/null; then
        time_service="chronyd"
    elif pgrep -x ntpd >/dev/null; then
        time_service="ntpd"
    fi

    if [ -z "$time_service" ]; then
        add_issue "MEDIUM" "TIME - No NTP service running" \
            "Neither chronyd nor ntpd is running
Time drift can cause SSL certificate errors and authentication issues" \
            "Install and start NTP service:
  • AlmaLinux/RHEL: yum install chrony && systemctl enable --now chronyd
  • Check sync: chronyc tracking" \
            66
        return
    fi

    # Only chronyd's offset is inspected; a running ntpd is accepted as-is.
    if [ "$time_service" != "chronyd" ]; then
        return
    fi

    # One chronyc call feeds both the displayed status and the comparison.
    local tracking_line
    tracking_line=$(chronyc tracking 2>/dev/null | grep -m1 "^System time")
    if [ -z "$tracking_line" ]; then
        return
    fi

    # Fields 4-6 of the "System time" line: value, unit and direction word.
    local sync_status offset_seconds
    sync_status=$(awk '{print $4, $5, $6}' <<< "$tracking_line")
    offset_seconds=$(awk '{print $4}' <<< "$tracking_line")

    # Compare the magnitude only
    offset_seconds=${offset_seconds#-}

    # awk does the floating-point comparison so the check does not silently
    # turn into a no-op on hosts without bc.
    if awk -v off="$offset_seconds" 'BEGIN { exit !(off + 0 > 1) }'; then
        add_issue "HIGH" "TIME - Clock offset detected" \
            "Time offset: ${sync_status}
Significant time drift detected" \
            "Force time sync:
  • chronyc -a makestep
  • Check sources: chronyc sources" \
            75
    fi
}

################################################################################
# Phase 12: System Updates & Kernel
################################################################################

# Report a pending reboot (running kernel differs from the most recently
# installed kernel package) and outstanding security advisories, and record
# the control panel version in $TEMP_DIR/system_info.txt when it is known.
#
# Globals:   SYS_CONTROL_PANEL, SYS_CONTROL_PANEL_VERSION, TEMP_DIR (read)
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue; may append
#            one line to $TEMP_DIR/system_info.txt
# Returns:   0
analyze_updates() {
    print_info "Analyzing system updates..."

    # Check if running kernel matches installed kernel
    local running_kernel installed_kernel kernel_pkgs newest_pkg
    running_kernel=$(uname -r)
    installed_kernel=$running_kernel

    # rpm's exit status is checked directly: on failure it prints text such
    # as "package kernel is not installed" to stdout, which must not be
    # mistaken for a kernel version. When the query fails, the running kernel
    # is assumed to be current.
    if kernel_pkgs=$(rpm -q kernel --last 2>/dev/null) && [ -n "$kernel_pkgs" ]; then
        newest_pkg=${kernel_pkgs%%$'\n'*}        # first line = newest install
        newest_pkg=${newest_pkg%%[[:space:]]*}   # first column = package NVRA
        installed_kernel=${newest_pkg#kernel-}
    fi

    if [ "$running_kernel" != "$installed_kernel" ]; then
        add_issue "MEDIUM" "SYSTEM - Reboot required" \
            "Running kernel: ${running_kernel}
Installed kernel: ${installed_kernel}
System needs reboot to use new kernel" \
            "Schedule maintenance window and reboot:
  • Check uptime: uptime
  • Reboot when ready: reboot" \
            64
    fi

    # Check for available security updates (if yum/dnf)
    if command -v yum >/dev/null 2>&1; then
        # Advisory IDs are prefixed per vendor: RHSA (Red Hat), ALSA
        # (AlmaLinux), CLSA (CloudLinux), RLSA (Rocky), ELSA (Oracle), FEDORA.
        local security_updates
        security_updates=$(yum updateinfo list security 2>/dev/null \
            | grep -cE '^(FEDORA|RHSA|ALSA|CLSA|RLSA|ELSA)-')
        [[ "$security_updates" =~ ^[0-9]+$ ]] || security_updates=0

        if [ "$security_updates" -gt 10 ]; then
            add_issue "HIGH" "SYSTEM - Many security updates available" \
                "Security updates available: ${security_updates}
System may be vulnerable" \
                "Apply security updates:
  • yum update --security
  • Or schedule full update: yum update" \
                76
        elif [ "$security_updates" -gt 0 ]; then
            add_issue "MEDIUM" "SYSTEM - Security updates available" \
                "Security updates: ${security_updates}" \
                "Review and apply: yum updateinfo list security" \
                58
        fi
    fi

    # Record the control panel version for the known panels
    if [ -n "$SYS_CONTROL_PANEL_VERSION" ]; then
        local panel_label=""
        case "$SYS_CONTROL_PANEL" in
            cpanel) panel_label="cPanel" ;;
            plesk) panel_label="Plesk" ;;
            interworx) panel_label="InterWorx" ;;
        esac
        if [ -n "$panel_label" ]; then
            echo "${panel_label} version: $SYS_CONTROL_PANEL_VERSION" >> "$TEMP_DIR/system_info.txt"
        fi
    fi
}

################################################################################
# Phase 13: File Limits & Descriptors
################################################################################

# Report file descriptor pressure: system-wide usage above 80% of the limit
# (HIGH) and, for httpd / mysqld / php-fpm, more than 10000 lines of lsof
# output for the first matching PID (MEDIUM).
#
# Globals:   none read; the loop variable "service" is not local
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
analyze_file_limits() {
    print_info "Analyzing file descriptor limits..."

    # The first and third whitespace-separated fields of file-nr are used as
    # the in-use and maximum counts; a fixed fallback applies if the read fails.
    local file_nr used_fds skipped_field max_fds trailing
    file_nr=$(cat /proc/sys/fs/file-nr 2>/dev/null || echo "0 0 100000")
    read -r used_fds skipped_field max_fds trailing <<< "$file_nr"
    local fd_percent=$(( used_fds * 100 / max_fds ))

    if [ "$fd_percent" -gt 80 ]; then
        add_issue "HIGH" "FILE DESCRIPTORS - System limit approaching" \
            "Open file descriptors: ${used_fds} / ${max_fds} (${fd_percent}%)
Applications may fail to open files/sockets" \
            "Increase limit:
  • sysctl -w fs.file-max=$((max_fds * 2))
  • Add to /etc/sysctl.conf: fs.file-max=$((max_fds * 2))
  • Check per-process limits: ulimit -n" \
            78
    fi

    # Per-service check: skip services that are not running or are under
    # the threshold.
    for service in httpd mysqld php-fpm; do
        pgrep -x "$service" >/dev/null || continue

        local service_fds
        service_fds=$(lsof -p $(pgrep -x "$service" | head -1) 2>/dev/null | wc -l || echo "0")
        [ "$service_fds" -gt 10000 ] || continue

        add_issue "MEDIUM" "$service - High file descriptor usage" \
            "Service: $service
Open file descriptors: ${service_fds}
May indicate connection/file leaks" \
            "Investigate $service:
  • lsof -p \$(pgrep -x $service | head -1) | head -100
  • Check for file/connection leaks" \
            65
    done
}

################################################################################
# Phase 14: Email Queue Analysis
################################################################################

# Measure the mail queue of the first MTA found (Exim, then Postfix, then
# Sendmail) and report it via add_issue: CRITICAL above 5000 messages, HIGH
# above 1000, MEDIUM above 100.
#
# Globals:   none
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0
analyze_email_queue() {
    print_info "Analyzing email queue..."

    local queue_count=0
    local mail_system=""

    # Check for Exim (most common on cPanel)
    if command -v exim >/dev/null 2>&1; then
        mail_system="Exim"
        queue_count=$(exim -bpc 2>/dev/null)
    # Check for Postfix
    elif command -v postqueue >/dev/null 2>&1; then
        mail_system="Postfix"
        # Fifth word of the last line of postqueue -p output
        queue_count=$(postqueue -p 2>/dev/null | tail -1 | awk '{print $5}')
    # Check for Sendmail
    elif command -v mailq >/dev/null 2>&1; then
        mail_system="Sendmail"
        # grep -c already prints 0 when nothing matches (while exiting 1), so
        # no "|| echo 0" here: that would produce a two-line "0\n0" value.
        queue_count=$(mailq 2>/dev/null | grep -c "^[A-Z]")
    fi

    # Anything that is not a plain non-negative integer (empty output, error
    # text, multiple lines) counts as an empty queue.
    [[ "$queue_count" =~ ^[0-9]+$ ]] || queue_count=0

    if [ "$queue_count" -gt 5000 ]; then
        # Get sample of queued messages
        local queue_sample=""
        if [ "$mail_system" = "Exim" ]; then
            queue_sample=$(exim -bp 2>/dev/null | head -20 | sed 's/^/  /')
        fi

        add_issue "CRITICAL" "EMAIL - Massive mail queue" \
            "Mail system: ${mail_system}
Queue size: ${queue_count} messages
This can consume disk space and cause slow mail delivery

Sample:
${queue_sample}" \
            "Investigate and clear queue:
  • Check for spam/compromised accounts
  • Review: exim -bp | head -50 (for Exim)
  • Clear specific messages: exim -Mrm [message-id]
  • Force delivery attempts: exim -qff
  • Check for frozen messages: exim -bp | grep frozen" \
            92
    elif [ "$queue_count" -gt 1000 ]; then
        add_issue "HIGH" "EMAIL - Large mail queue" \
            "Mail system: ${mail_system}
Queue size: ${queue_count} messages" \
            "Review mail queue:
  • exim -bp | less (for Exim)
  • mailq | less (for Postfix/Sendmail)
  • Check for spam/compromised accounts
  • Review /var/log/exim_mainlog for errors" \
            78
    elif [ "$queue_count" -gt 100 ]; then
        add_issue "MEDIUM" "EMAIL - Growing mail queue" \
            "Queue size: ${queue_count} messages (${mail_system})" \
            "Monitor queue. May indicate delivery issues or spam." \
            58
    fi
}

################################################################################
# Phase 15: I/O Wait Analysis
################################################################################

# Read the current CPU I/O-wait percentage and report it via add_issue:
# CRITICAL above 30%, HIGH above 15%, MEDIUM above 5% (integer part compared).
#
# Globals:   none
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0; returns early when no numeric I/O-wait value can be obtained
analyze_iowait() {
    print_info "Analyzing disk I/O performance..."

    # Take the number directly in front of "wa" on top's Cpu(s) line rather
    # than a fixed column: top glues a 100.0 value onto the preceding label
    # ("id,100.0 wa"), which shifts every later column. The pattern also
    # covers the older "0.1%wa" layout.
    local iowait
    iowait=$(LC_ALL=C top -bn1 2>/dev/null | awk '
        /Cpu\(s\)/ {
            if (match($0, /[0-9.]+[ %]*wa/)) {
                value = substr($0, RSTART, RLENGTH)
                sub(/[ %]*wa$/, "", value)
                print value
            }
            exit
        }')

    # Fall back to iostat whenever top did not yield a usable number.
    if ! [[ "$iowait" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
        if command -v iostat >/dev/null 2>&1; then
            # Fourth column of the last non-empty line (the second report);
            # the output may end with a blank line, so "tail -1" is not used.
            iowait=$(iostat -c 1 2 2>/dev/null | awk 'NF { last = $4 } END { print last }')
        fi
    fi

    if ! [[ "$iowait" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
        return
    fi

    # Integer part only, for the shell's integer comparisons
    local iowait_int=${iowait%%.*}

    if [ "$iowait_int" -gt 30 ]; then
        add_issue "CRITICAL" "DISK I/O - Extremely high I/O wait" \
            "Current I/O wait: ${iowait}%
System is waiting on disk operations - extreme performance impact" \
            "Identify I/O-heavy processes:
  • iotop -o (show only active I/O)
  • iostat -x 1 (detailed disk stats)
  • Check for failing drives: dmesg | grep -i error
  • Check disk health: smartctl -a /dev/sda" \
            95
    elif [ "$iowait_int" -gt 15 ]; then
        add_issue "HIGH" "DISK I/O - High I/O wait" \
            "I/O wait: ${iowait}%
System performance degraded by disk operations" \
            "Monitor disk I/O:
  • iostat -x 1 5 (watch for 5 seconds)
  • iotop (if installed: yum install iotop)
  • Check for large file operations
  • Review disk usage and fragmentation" \
            76
    elif [ "$iowait_int" -gt 5 ]; then
        add_issue "MEDIUM" "DISK I/O - Elevated I/O wait" \
            "I/O wait: ${iowait}%" \
            "Monitor disk performance. May indicate heavy disk activity." \
            55
    fi
}

################################################################################
# Phase 16: SELinux Analysis
################################################################################

# When SELinux is enforcing, count today's "denied" records in the audit log
# and report them via add_issue (HIGH above 50, MEDIUM above 10).
#
# Globals:   SELINUX_AUDIT_LOG (read, optional) - audit log to scan; defaults
#            to /var/log/audit/audit.log
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0; returns early when getenforce is unavailable or the mode is
#            not Enforcing
analyze_selinux() {
    print_info "Analyzing SELinux status..."

    # Check if SELinux tooling is present
    if ! command -v getenforce >/dev/null 2>&1; then
        return
    fi

    local selinux_status
    selinux_status=$(getenforce 2>/dev/null || echo "Disabled")
    if [ "$selinux_status" != "Enforcing" ]; then
        return
    fi

    local audit_log="${SELINUX_AUDIT_LOG:-/var/log/audit/audit.log}"
    local today_denials=""

    if [ -f "$audit_log" ]; then
        # Audit records are stamped "msg=audit(<epoch>.<ms>:<serial>)", not
        # with a "Mon DD" date, so "today" is selected by comparing the epoch
        # against local midnight.
        local midnight
        midnight=$(date -d 'today 00:00' '+%s' 2>/dev/null)
        if [[ "$midnight" =~ ^[0-9]+$ ]]; then
            today_denials=$(awk -v since="$midnight" '
                /denied/ && match($0, /audit\([0-9]+/) {
                    # skip the 6 characters of "audit(" to reach the epoch
                    if (substr($0, RSTART + 6, RLENGTH - 6) + 0 >= since + 0) print
                }' "$audit_log" 2>/dev/null)
        fi
    fi

    local denials_count=0
    if [ -n "$today_denials" ]; then
        denials_count=$(printf '%s\n' "$today_denials" | grep -c '')
    fi

    if [ "$denials_count" -gt 50 ]; then
        local denial_sample
        denial_sample=$(printf '%s\n' "$today_denials" | tail -5 | sed 's/^/  /')

        add_issue "HIGH" "SELINUX - Many denials detected" \
            "SELinux denials today: ${denials_count}
SELinux is blocking operations - may cause application failures

Recent denials:
${denial_sample}" \
            "Review and fix SELinux policies:
  • ausearch -m avc -ts today
  • audit2allow -a (generate policy)
  • audit2why -a (explain denials)
  • Temporarily: setenforce 0 (NOT recommended for production)" \
            82
    elif [ "$denials_count" -gt 10 ]; then
        add_issue "MEDIUM" "SELINUX - Some denials detected" \
            "SELinux denials today: ${denials_count}" \
            "Review: ausearch -m avc -ts today" \
            62
    fi
}

################################################################################
# Phase 17: Control Panel Services
################################################################################

# Verify that the control panel's own services, the web server and the
# database server are running, reporting anything missing via add_issue.
#
# Globals:   SYS_CONTROL_PANEL (read); SYS_PANEL (read as a fallback)
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0
analyze_control_panel_services() {
    print_info "Analyzing control panel services..."

    # Other parts of this file read the detected panel from SYS_CONTROL_PANEL;
    # SYS_PANEL is still honoured in case a caller sets it.
    local panel="${SYS_CONTROL_PANEL:-${SYS_PANEL:-}}"
    local panel_issues=""
    local service

    case "$panel" in
        cpanel)
            # Key cPanel services to check
            local cpanel_services=("cpanel" "whostmgrd" "cpsrvd" "tailwatchd" "dnsadmin")

            for service in "${cpanel_services[@]}"; do
                if ! systemctl is-active --quiet "$service" 2>/dev/null; then
                    # Join entries with a real newline: a "\n" inside double
                    # quotes would be stored as two literal characters.
                    panel_issues+="${panel_issues:+$'\n'}  • ${service} is not running"
                fi
            done

            if [ -n "$panel_issues" ]; then
                add_issue "HIGH" "CPANEL - Critical services down" \
                    "The following cPanel services are not running:
${panel_issues}
This affects control panel functionality" \
                    "Restart services:
  • systemctl restart cpanel
  • /scripts/restartsrv_cpanel
  • Check logs: tail -50 /usr/local/cpanel/logs/error_log" \
                    85
            fi
            ;;
        plesk)
            if ! systemctl is-active --quiet psa 2>/dev/null; then
                add_issue "HIGH" "PLESK - Panel service down" \
                    "Plesk service is not running" \
                    "Restart Plesk: systemctl restart psa" \
                    85
            fi
            ;;
    esac

    # Check web server
    if ! pgrep -x httpd >/dev/null && ! pgrep -x apache2 >/dev/null; then
        add_issue "CRITICAL" "WEB SERVER - Apache/httpd not running" \
            "Web server process not found
All websites are down" \
            "Start web server:
  • systemctl restart httpd (CentOS/AlmaLinux)
  • systemctl restart apache2 (Debian/Ubuntu)
  • Check logs: tail -50 /var/log/httpd/error_log" \
            98
    fi

    # Check database server
    if ! pgrep -x mysqld >/dev/null && ! pgrep -x mariadbd >/dev/null; then
        add_issue "CRITICAL" "DATABASE - MySQL/MariaDB not running" \
            "Database server process not found
Database-driven sites are down" \
            "Start database:
  • systemctl restart mariadb
  • systemctl restart mysql
  • Check logs: tail -50 /var/log/mariadb/mariadb.log
  • Check disk space in /var/lib/mysql" \
            98
    fi
}

################################################################################
# Phase 18: DNS Resolution Check
################################################################################

# Resolve a fixed set of test names and report failures (CRITICAL) or, when
# more than one lookup takes longer than 2 seconds, slow resolution (HIGH).
#
# Globals:   none
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0; returns early when neither "host" nor "getent" is available
analyze_dns() {
    print_info "Analyzing DNS resolution..."

    # Test resolving a few critical domains
    local test_domains=("google.com" "cloudflare.com" "8.8.8.8")
    local failed_count=0
    local slow_count=0
    local failed_domains=""
    local domain start_time end_time duration_ms

    # Pick a lookup tool that exists. Without this, a host that merely lacks
    # the "host" binary would have every lookup counted as a DNS failure.
    local -a resolve_cmd
    if command -v host >/dev/null 2>&1; then
        resolve_cmd=(host)
    elif command -v getent >/dev/null 2>&1; then
        resolve_cmd=(getent hosts)
    else
        return
    fi

    for domain in "${test_domains[@]}"; do
        start_time=$(date +%s%N)
        if ! "${resolve_cmd[@]}" "$domain" >/dev/null 2>&1; then
            failed_count=$((failed_count + 1))
            # Join entries with a real newline: a "\n" inside double quotes
            # would be stored as two literal characters.
            failed_domains+="${failed_domains:+$'\n'}  • ${domain}"
        else
            end_time=$(date +%s%N)
            # nanoseconds -> milliseconds
            duration_ms=$(( (end_time - start_time) / 1000000 ))

            if [ "$duration_ms" -gt 2000 ]; then
                slow_count=$((slow_count + 1))
            fi
        fi
    done

    if [ "$failed_count" -gt 0 ]; then
        add_issue "CRITICAL" "DNS - Resolution failures detected" \
            "Failed to resolve ${failed_count} test domains:
${failed_domains}
DNS issues cause slow loading and failures across all services" \
            "Check DNS configuration:
  • cat /etc/resolv.conf
  • Test: dig google.com
  • Try alternate DNS: echo 'nameserver 8.8.8.8' >> /etc/resolv.conf
  • Restart networking: systemctl restart network" \
            94
    elif [ "$slow_count" -gt 1 ]; then
        add_issue "HIGH" "DNS - Slow resolution detected" \
            "DNS queries taking >2 seconds
This slows down all network operations" \
            "Check DNS servers:
  • cat /etc/resolv.conf
  • Consider faster DNS: 8.8.8.8, 1.1.1.1
  • Test: dig @8.8.8.8 google.com" \
            72
    fi
}

################################################################################
# Phase 19: Zombie Process Check
################################################################################

# Count processes whose eighth "ps aux" column contains Z and report them via
# add_issue: HIGH above 50 (with a sample of up to five), MEDIUM above 10.
#
# Globals:   none
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
analyze_zombie_processes() {
    print_info "Analyzing zombie processes..."

    # Tally rows whose eighth column contains "Z"
    local zombie_count
    zombie_count=$(ps aux | awk '$8 ~ /Z/ { n++ } END { print n + 0 }')

    if [ "$zombie_count" -le 10 ]; then
        return
    fi

    if [ "$zombie_count" -le 50 ]; then
        add_issue "MEDIUM" "PROCESSES - Zombie processes detected" \
            "Zombie processes: ${zombie_count}
May indicate stuck parent processes" \
            "Review: ps aux | awk '\$8 ~ /Z/'" \
            58
        return
    fi

    # More than 50: include the first five matches (column 11 and the PID)
    local zombie_sample
    zombie_sample=$(ps aux | awk '
        $8 ~ /Z/ && shown++ < 5 {
            print "  • " $11 " (PID " $2 ", PPID via pstree)"
        }')

    add_issue "HIGH" "PROCESSES - Many zombie processes detected" \
        "Zombie (defunct) processes: ${zombie_count}
Indicates parent processes not properly cleaning up children

Sample zombies:
${zombie_sample}" \
        "Investigate parent processes:
  • ps aux | awk '\$8 ~ /Z/'
  • pstree -p | grep defunct
  • Kill parent process or reboot if persistent
  • Common causes: Apache, PHP-FPM, custom scripts" \
        78
}

################################################################################
# Phase 20: Firewall Status Check
################################################################################

# Detect an active firewall (CSF, firewalld, nftables, then plain iptables)
# and report configuration problems or the absence of any firewall.
#
# Globals:   none
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
# Returns:   0
analyze_firewall() {
    print_info "Analyzing firewall status..."

    local firewall_active=0

    # --- CSF (ConfigServer Security & Firewall) --------------------------------
    if [ -x "/usr/sbin/csf" ]; then
        if csf -l >/dev/null 2>&1; then
            firewall_active=1

            # Anchored so that a commented-out TESTING line is not matched.
            if grep -Eq '^[[:space:]]*TESTING[[:space:]]*=[[:space:]]*"1"' /etc/csf/csf.conf 2>/dev/null; then
                add_issue "MEDIUM" "FIREWALL - CSF in testing mode" \
                    "CSF firewall is in TESTING mode
Blocks will auto-expire - not suitable for production" \
                    "Disable testing mode:
  • Edit /etc/csf/csf.conf
  • Set TESTING = \"0\"
  • Restart: csf -r" \
                    62
            fi

            # "csf -d" is the deny-an-IP command, not a listing, so permanent
            # denies are counted from csf.deny (non-blank, non-comment lines).
            local deny_count
            deny_count=$(grep -cEv '^[[:space:]]*(#|$)' /etc/csf/csf.deny 2>/dev/null)
            [[ "$deny_count" =~ ^[0-9]+$ ]] || deny_count=0
            if [ "$deny_count" -gt 1000 ]; then
                add_issue "MEDIUM" "FIREWALL - Many blocked IPs" \
                    "CSF has ${deny_count} denied IPs
Server may be under attack or CSF may need tuning" \
                    "Review blocked IPs:
  • less /etc/csf/csf.deny
  • Check for false positives
  • Consider: Main Menu → Security → Bot Analyzer" \
                    65
            fi
        fi
    fi

    # --- firewalld -----------------------------------------------------------
    # Checked before raw iptables: firewalld may keep its rules where
    # "iptables -S" does not show them.
    if [ "$firewall_active" -eq 0 ] && systemctl is-active --quiet firewalld 2>/dev/null; then
        firewall_active=1
    fi

    # --- nftables ------------------------------------------------------------
    if [ "$firewall_active" -eq 0 ]; then
        if systemctl is-active --quiet nftables 2>/dev/null \
            || [ -n "$(nft list ruleset 2>/dev/null)" ]; then
            firewall_active=1
        fi
    fi

    # --- iptables ------------------------------------------------------------
    if [ "$firewall_active" -eq 0 ]; then
        # Count real rules ("-A ..." lines). Counting "Chain"/"target" header
        # lines of "iptables -L" yields 6 even for a completely empty ruleset,
        # and "iptables -L" succeeding only proves the binary works.
        local rule_count
        rule_count=$(iptables -S 2>/dev/null | grep -c '^-A')
        [[ "$rule_count" =~ ^[0-9]+$ ]] || rule_count=0

        if systemctl is-active --quiet iptables 2>/dev/null || [ "$rule_count" -gt 0 ]; then
            firewall_active=1

            if [ "$rule_count" -lt 5 ]; then
                add_issue "MEDIUM" "FIREWALL - iptables active but minimal rules" \
                    "iptables is running but has very few rules
Server may not be properly protected" \
                    "Review firewall rules: iptables -L -n -v" \
                    55
            fi
        fi
    fi

    # Warn if no firewall detected
    if [ "$firewall_active" -eq 0 ]; then
        add_issue "HIGH" "FIREWALL - No active firewall detected" \
            "No firewall found (CSF, firewalld, nftables, iptables)
Server is exposed to attacks" \
            "Install and configure a firewall:
  • CSF (recommended for cPanel):
    cd /usr/src && wget https://download.configserver.com/csf.tgz
    tar -xzf csf.tgz && cd csf && sh install.sh
  • Or enable firewalld: systemctl enable --now firewalld" \
            82
    fi
}

################################################################################
# Phase 21: Network Connectivity Check
################################################################################

# Probe outbound reachability: one ping to each of two public IPs, then one
# HTTPS request, reporting failures via add_issue.
#
# Globals:   none read; the loop variable "ip" is not local
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
analyze_network_connectivity() {
    print_info "Analyzing network connectivity..."

    # ICMP probes: a single echo request per target, 2 second wait
    local test_ips=("8.8.8.8" "1.1.1.1")
    local connectivity_failed=0

    for ip in "${test_ips[@]}"; do
        ping -c 1 -W 2 "$ip" >/dev/null 2>&1 && continue
        connectivity_failed=$((connectivity_failed + 1))
    done

    if (( connectivity_failed == ${#test_ips[@]} )); then
        # Every target was unreachable
        add_issue "CRITICAL" "NETWORK - No internet connectivity" \
            "Cannot reach external IPs (${test_ips[*]})
Server has no internet access - critical services affected" \
            "Check network:
  • ip link show (check interfaces)
  • ip route show (check routing)
  • systemctl status network
  • Check gateway: ping gateway IP
  • Contact hosting provider if persistent" \
            96
    elif (( connectivity_failed > 0 )); then
        # Only some of the targets were unreachable
        add_issue "HIGH" "NETWORK - Intermittent connectivity issues" \
            "Some external IPs unreachable
Network may be unstable" \
            "Check network stability:
  • ping -c 10 8.8.8.8
  • mtr 8.8.8.8 (install: yum install mtr)
  • Check: ip route show" \
            74
    fi

    # HTTPS probe, capped at 5 seconds; nothing to report when it succeeds
    if timeout 5 curl -s -o /dev/null https://google.com 2>/dev/null; then
        return
    fi

    add_issue "MEDIUM" "NETWORK - HTTP/HTTPS connectivity issues" \
        "Cannot establish HTTPS connections
May affect updates, let's encrypt, external API calls" \
        "Test connectivity:
  • curl -v https://google.com
  • Check firewall rules
  • Check proxy settings: echo \$http_proxy" \
        68
}

################################################################################
# Phase 22: CloudLinux Specific Checks
################################################################################

# On CloudLinux hosts (SYS_CLOUDLINUX=yes) with lvectl available, report any
# output of "lvectl list --by-fault" as a HIGH issue, quoting its first five
# lines.
#
# Globals:   SYS_CLOUDLINUX (read)
# Arguments: none
# Outputs:   progress line via print_info; findings via add_issue
analyze_cloudlinux() {
    # Nothing to do on hosts not flagged as CloudLinux
    [ "$SYS_CLOUDLINUX" = "yes" ] || return

    print_info "Analyzing CloudLinux LVE limits..."

    # lvectl is required for the check
    command -v lvectl >/dev/null 2>&1 || return

    # At most ten lines of lvectl output are kept
    local lve_faults
    lve_faults=$(lvectl list --by-fault 2>/dev/null | head -10)
    [ -n "$lve_faults" ] || return

    # Bullet-prefix the first five lines for the report
    local top_faults
    top_faults=$(echo "$lve_faults" | sed -e 's/^/  • /' -e '5q')

    add_issue "HIGH" "CLOUDLINUX - Users hitting LVE limits" \
        "Top users hitting resource limits:
${top_faults}

This causes 503 errors and slow websites" \
        "Review limits: lvectl list
Increase limits for affected users or optimize their sites:
  • lvectl set [USER] --cpu=200 --pmem=2G" \
        78
}

################################################################################
# Main Analysis Function
################################################################################

# Clear the screen, print the banner, then run every analysis phase in a
# fixed order and announce completion.
#
# Globals:   none
# Arguments: none
# Outputs:   banner and progress lines; each phase produces its own output
run_analysis() {
    clear
    print_banner "System Health Check"
    echo ""
    print_info "Starting comprehensive system analysis..."
    echo ""

    # Phases are executed top to bottom in the order listed here.
    local -a _hc_phases=(
        analyze_memory
        analyze_memory_config
        analyze_disk
        analyze_cpu
        analyze_mysql
        analyze_apache
        analyze_php_fpm
        analyze_security_logs
        analyze_messages_log
        analyze_cron
        analyze_network
        analyze_time
        analyze_updates
        analyze_file_limits
        analyze_email_queue
        analyze_iowait
        analyze_selinux
        analyze_control_panel_services
        analyze_dns
        analyze_zombie_processes
        analyze_firewall
        analyze_network_connectivity
        analyze_cloudlinux
    )

    local _hc_phase
    for _hc_phase in "${_hc_phases[@]}"; do
        "$_hc_phase"
    done

    print_success "Analysis complete!"
    echo ""
}

################################################################################
# Report Generation
################################################################################

# Print one severity section of the report: a boxed heading followed by each
# issue and a separator line. Prints nothing when no issues are passed.
#
# Arguments: $1 - section heading; $2.. - issue texts
_report_issue_section() {
    local heading=$1
    shift
    [ "$#" -gt 0 ] || return 0

    echo "=============================================================================="
    echo "$heading"
    echo "=============================================================================="
    echo ""

    local entry
    for entry in "$@"; do
        echo "$entry"
        echo "------------------------------------------------------------------------------"
        echo ""
    done
}

# Write the full health report to stdout and, through tee, to $REPORT_FILE.
#
# Globals:   CRITICAL_ISSUES, HIGH_ISSUES, MEDIUM_ISSUES, LOW_ISSUES,
#            SYS_CONTROL_PANEL, SYS_CONTROL_PANEL_VERSION, SYS_OS_TYPE,
#            SYS_OS_VERSION, REPORT_FILE (all read)
# Arguments: none
# Outputs:   the report text on stdout and in $REPORT_FILE
generate_report() {
    {
        # Header and host identification
        echo "=============================================================================="
        echo "SERVER HEALTH CHECK - $(date '+%Y-%m-%d %H:%M:%S')"
        echo "=============================================================================="
        echo ""
        echo "System: $(hostname)"
        echo "Control Panel: ${SYS_CONTROL_PANEL:-none} ${SYS_CONTROL_PANEL_VERSION:-}"
        echo "OS: ${SYS_OS_TYPE:-unknown} ${SYS_OS_VERSION:-}"
        echo "Kernel: $(uname -r)"
        echo ""

        # Per-severity counts
        printf '%s\n' \
            "SEVERITY SUMMARY:" \
            "  CRITICAL: ${#CRITICAL_ISSUES[@]} issues" \
            "  HIGH:     ${#HIGH_ISSUES[@]} issues" \
            "  MEDIUM:   ${#MEDIUM_ISSUES[@]} issues" \
            "  LOW:      ${#LOW_ISSUES[@]} issues" \
            ""

        # One section per severity, most severe first; empty ones are omitted
        _report_issue_section "CRITICAL ISSUES (Immediate Action Required)" "${CRITICAL_ISSUES[@]}"
        _report_issue_section "HIGH ISSUES (Action Recommended)" "${HIGH_ISSUES[@]}"
        _report_issue_section "MEDIUM ISSUES (Monitor Closely)" "${MEDIUM_ISSUES[@]}"
        _report_issue_section "LOW ISSUES (Informational)" "${LOW_ISSUES[@]}"

        # Closing guidance
        printf '%s\n' \
            "==============================================================================" \
            "NEXT STEPS" \
            "==============================================================================" \
            "" \
            "Priority Actions:" \
            "  1. Address all CRITICAL issues immediately" \
            "  2. Plan fixes for HIGH issues" \
            "  3. Monitor MEDIUM issues for trends" \
            "" \
            "Detailed Analysis Available:" \
            "  • Bot Analyzer (Menu → Security) for traffic/attack analysis" \
            "  • MySQL Query Analyzer (Menu → Performance) for database optimization" \
            "" \
            "Report saved to: $REPORT_FILE" \
            ""

    } | tee "$REPORT_FILE"
}

################################################################################
# Save Health Baseline to Reference Database
################################################################################

save_health_baseline() {
    # Purpose: Record a snapshot of current system metrics as the
    #          [HEALTH_BASELINE] section of the reference database, replacing
    #          the section written by a previous run.
    # Globals: SYSREF_DB (read, edited in place, appended to)
    #          CPU_CORES (read)
    #          CRITICAL_ISSUES, HIGH_ISSUES, MEDIUM_ISSUES, LOW_ISSUES (counted)
    # Outputs: Progress message via print_info; HEALTH|KEY|VALUE|DATE records
    #          appended to $SYSREF_DB.
    # Returns: Early, without writing anything, when $SYSREF_DB is not a
    #          regular file.

    # Only save if reference database exists
    if [ ! -f "$SYSREF_DB" ]; then
        return
    fi

    print_info "Saving health baseline to reference database..."

    # Remove old health baseline section (header line through the blank line
    # that terminates it)
    sed -i '/^\[HEALTH_BASELINE\]/,/^$/d' "$SYSREF_DB" 2>/dev/null

    # ---- System resources ---------------------------------------------------
    local mem_total="" mem_used="" mem_percent=0
    # Single free(1) call so total and used come from the same sample
    read -r mem_total mem_used < <(free -m | awk '/^Mem:/ {print $2, $3}')
    # Guard the division: a missing/odd free output or a zero total must not
    # abort the function with an arithmetic error
    if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_used" =~ ^[0-9]+$ ]] && [ "$mem_total" -gt 0 ]; then
        mem_percent=$((mem_used * 100 / mem_total))
    fi

    local cpu_load_1min
    cpu_load_1min=$(uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | xargs)

    local disk_percent
    disk_percent=$(df -h / | tail -1 | awk '{print $5}' | tr -d '%')

    local iowait
    iowait=$(top -bn1 | grep "Cpu(s)" | awk '{print $10}' | sed 's/%wa,//' | cut -d'%' -f1 | cut -d'.' -f1)
    [ -z "$iowait" ] && iowait=0

    # ---- Services -----------------------------------------------------------
    local email_queue=0
    if command -v exim >/dev/null 2>&1; then
        email_queue=$(exim -bpc 2>/dev/null || echo "0")
    fi

    local httpd_status="stopped"
    if pgrep -x httpd >/dev/null || pgrep -x apache2 >/dev/null; then
        httpd_status="running"
    fi

    local mysql_status="stopped"
    if pgrep -x mysqld >/dev/null || pgrep -x mariadbd >/dev/null; then
        mysql_status="running"
    fi

    local zombie_count
    zombie_count=$(ps aux | awk '$8 ~ /Z/' | wc -l)

    # firewalld is tested before the generic iptables probe: "iptables -L"
    # succeeds on firewalld hosts too, which would otherwise hide firewalld.
    # -n skips reverse DNS lookups; only the exit status is used.
    local firewall_active="none"
    if [ -x "/usr/sbin/csf" ] && csf -l >/dev/null 2>&1; then
        firewall_active="csf"
    elif systemctl is-active --quiet firewalld 2>/dev/null; then
        firewall_active="firewalld"
    elif systemctl is-active --quiet iptables 2>/dev/null || iptables -L -n >/dev/null 2>&1; then
        firewall_active="iptables"
    fi

    local current_date current_datetime
    current_date=$(date '+%Y-%m-%d')
    current_datetime=$(date '+%Y-%m-%d %H:%M:%S')

    # ---- Network metrics ----------------------------------------------------
    # Interface carrying the default route (first one if several)
    local network_interface
    network_interface=$(ip route | grep default | awk '{print $5}' | head -1)

    local network_mtu=""
    if [ -n "$network_interface" ]; then
        network_mtu=$(ip link show "$network_interface" 2>/dev/null | awk '/mtu/ {print $5}')
    fi
    # A "|| echo" after the pipeline would never fire (awk exits 0), so the
    # fallback is applied to the captured value instead
    network_mtu=${network_mtu:-unknown}

    local tcp_retrans tcp_out tcp_retrans_percent="0"
    tcp_retrans=$(netstat -s 2>/dev/null | awk '/segments retransmitted/ {print $1; exit}')
    [[ "$tcp_retrans" =~ ^[0-9]+$ ]] || tcp_retrans=0
    tcp_out=$(netstat -s 2>/dev/null | awk '/segments sent out/ {print $1; exit}')
    [[ "$tcp_out" =~ ^[0-9]+$ ]] || tcp_out=1
    # Only compute a ratio once more than 1,000,000 segments have been sent
    if [ "$tcp_out" -gt 1000000 ] 2>/dev/null; then
        tcp_retrans_percent=$(echo "scale=2; $tcp_retrans * 100 / $tcp_out" | bc 2>/dev/null || echo "0")
    fi

    local rx_errors=0 tx_errors=0 rx_dropped=0 tx_dropped=0
    local stats_dir="/sys/class/net/$network_interface/statistics"
    if [ -n "$network_interface" ] && [ -d "$stats_dir" ]; then
        rx_errors=$(cat "$stats_dir/rx_errors" 2>/dev/null || echo "0")
        tx_errors=$(cat "$stats_dir/tx_errors" 2>/dev/null || echo "0")
        rx_dropped=$(cat "$stats_dir/rx_dropped" 2>/dev/null || echo "0")
        tx_dropped=$(cat "$stats_dir/tx_dropped" 2>/dev/null || echo "0")
    fi

    # ---- Hardware status ----------------------------------------------------
    local disk_smart_status="unknown" primary_disk=""
    if command -v smartctl >/dev/null 2>&1; then
        # First device of TYPE "disk" listed by lsblk
        primary_disk=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1; exit}')
        if [ -n "$primary_disk" ]; then
            disk_smart_status=$(smartctl -H "$primary_disk" 2>/dev/null | grep "SMART overall-health" | awk '{print $NF}')
            # No matching line in smartctl output: keep the documented default
            disk_smart_status=${disk_smart_status:-unknown}
        fi
    fi

    local disk_errors_count
    disk_errors_count=$(dmesg | grep -i "I/O error\|sector\|SMART\|Hardware Error" | wc -l)
    disk_errors_count=${disk_errors_count:-0}

    # ---- Security metrics ---------------------------------------------------
    local ssh_failed_attempts=0 ssh_attacks_today=0
    if [ -f "/var/log/secure" ]; then
        ssh_failed_attempts=$(grep "Failed password" /var/log/secure 2>/dev/null | wc -l)
        # %e is the space-padded day of month, as used in syslog timestamps
        ssh_attacks_today=$(grep "Failed password" /var/log/secure 2>/dev/null | grep "$(date '+%b %e')" | wc -l)
    fi

    local cphulk_status="not_installed"
    if [ -x "/usr/local/cpanel/bin/cphulk_pam_ctl" ]; then
        if /usr/local/cpanel/bin/cphulk_pam_ctl --status 2>/dev/null | grep -qi "enabled"; then
            cphulk_status="enabled"
        else
            cphulk_status="disabled"
        fi
    fi

    # The sed above removes the old section up to its trailing blank line but
    # leaves the separator that preceded it. Emit a new separator only when the
    # file does not already end with an empty line, so repeated runs do not
    # accumulate blank lines.
    local need_separator=0
    if [ -n "$(tail -n 1 "$SYSREF_DB" 2>/dev/null)" ]; then
        need_separator=1
    fi

    # Append new health baseline section
    {
        if [ "$need_separator" -eq 1 ]; then
            echo ""
        fi
        echo "[HEALTH_BASELINE]"
        echo "HEALTH|TIMESTAMP|$current_datetime|"

        # System resources
        echo "HEALTH|MEMORY_TOTAL_MB|$mem_total|$current_date"
        echo "HEALTH|MEMORY_USED_PERCENT|$mem_percent|$current_date"
        echo "HEALTH|CPU_LOAD_1MIN|$cpu_load_1min|$current_date"
        echo "HEALTH|CPU_CORES|$CPU_CORES|$current_date"
        echo "HEALTH|DISK_USED_PERCENT|$disk_percent|$current_date"
        echo "HEALTH|IOWAIT_PERCENT|$iowait|$current_date"

        # Services
        echo "HEALTH|EMAIL_QUEUE_SIZE|$email_queue|$current_date"
        echo "HEALTH|ZOMBIE_PROCESSES|$zombie_count|$current_date"
        echo "HEALTH|HTTPD_STATUS|$httpd_status|$current_date"
        echo "HEALTH|MYSQL_STATUS|$mysql_status|$current_date"
        echo "HEALTH|FIREWALL_STATUS|$firewall_active|$current_date"

        # Network status
        echo "HEALTH|NETWORK_INTERFACE|$network_interface|$current_date"
        echo "HEALTH|NETWORK_MTU|$network_mtu|$current_date"
        echo "HEALTH|NETWORK_RX_ERRORS|$rx_errors|$current_date"
        echo "HEALTH|NETWORK_TX_ERRORS|$tx_errors|$current_date"
        echo "HEALTH|NETWORK_RX_DROPPED|$rx_dropped|$current_date"
        echo "HEALTH|NETWORK_TX_DROPPED|$tx_dropped|$current_date"
        echo "HEALTH|TCP_RETRANS_PERCENT|$tcp_retrans_percent|$current_date"

        # Hardware status
        echo "HEALTH|DISK_SMART_STATUS|$disk_smart_status|$current_date"
        echo "HEALTH|HARDWARE_ERRORS|$disk_errors_count|$current_date"

        # Security status
        echo "HEALTH|SSH_FAILED_ATTEMPTS_TOTAL|$ssh_failed_attempts|$current_date"
        echo "HEALTH|SSH_ATTACKS_TODAY|$ssh_attacks_today|$current_date"
        echo "HEALTH|CPHULK_STATUS|$cphulk_status|$current_date"

        # Issue counts
        echo "HEALTH|CRITICAL_ISSUES|${#CRITICAL_ISSUES[@]}|$current_date"
        echo "HEALTH|HIGH_ISSUES|${#HIGH_ISSUES[@]}|$current_date"
        echo "HEALTH|MEDIUM_ISSUES|${#MEDIUM_ISSUES[@]}|$current_date"
        echo "HEALTH|LOW_ISSUES|${#LOW_ISSUES[@]}|$current_date"
        echo ""
    } >> "$SYSREF_DB"
}

################################################################################
# Display Report
################################################################################

display_report() {
    # Purpose: Present the outcome of the health check, then wait for Enter.
    #   - No issues recorded in any severity array: print a success message.
    #   - Otherwise: show the output of generate_report through a pager.
    # Globals: CRITICAL_ISSUES, HIGH_ISSUES, MEDIUM_ISSUES, LOW_ISSUES (read)
    #          REPORT_FILE (read)
    local total_issues=$(( ${#CRITICAL_ISSUES[@]} + ${#HIGH_ISSUES[@]} + ${#MEDIUM_ISSUES[@]} + ${#LOW_ISSUES[@]} ))

    if [ "$total_issues" -eq 0 ]; then
        echo ""
        print_success "No issues detected! System is healthy."
        echo ""
    elif command -v less >/dev/null 2>&1; then
        generate_report | less -R
    else
        # No pager installed: print the report directly instead of failing
        generate_report
    fi

    echo ""
    # Only claim the report was saved when the file is really there; on the
    # no-issues path generate_report is not run by this function.
    if [ -f "$REPORT_FILE" ]; then
        print_info "Full report saved to: $REPORT_FILE"
        echo ""
    fi
    # -r: do not treat backslashes in the typed line as escapes
    read -r -p "Press Enter to continue..."
}

################################################################################
# Main
################################################################################

# Entry point: run the analysis, persist the baseline, show the report, and
# finally discard the scratch directory.
system_health_check_main() {
    run_analysis
    save_health_baseline
    display_report

    # Cleanup
    rm -rf "$TEMP_DIR"
}

system_health_check_main