#!/bin/bash ################################################################################ # System Health Check - Universal Diagnostics ################################################################################ # Purpose: Comprehensive server health analysis with severity-based reporting # Supports: cPanel, Plesk, InterWorx, CloudLinux, AlmaLinux # Author: Server Toolkit # Version: 1.0.0 ################################################################################ # Load common functions SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" source "$SCRIPT_DIR/lib/common-functions.sh" source "$SCRIPT_DIR/lib/system-detect.sh" # Output files REPORT_FILE="/tmp/system_health_report_$(date +%Y%m%d_%H%M%S).txt" TEMP_DIR=$(mktemp -d /tmp/health-check-XXXXXX) # Issue tracking declare -a CRITICAL_ISSUES=() declare -a HIGH_ISSUES=() declare -a MEDIUM_ISSUES=() declare -a LOW_ISSUES=() ################################################################################ # Helper Functions ################################################################################ add_issue() { local severity=$1 local title=$2 local details=$3 local recommendation=$4 local score=$5 local issue="[SEVERITY: ${score}%] ${title} ${details} Recommendation: ${recommendation} " case $severity in CRITICAL) CRITICAL_ISSUES+=("$issue") ;; HIGH) HIGH_ISSUES+=("$issue") ;; MEDIUM) MEDIUM_ISSUES+=("$issue") ;; LOW) LOW_ISSUES+=("$issue") ;; esac } ################################################################################ # Phase 1: Memory Analysis ################################################################################ analyze_memory() { print_info "Analyzing memory usage..." 
# Get memory stats local mem_total=$(free -m | awk '/^Mem:/ {print $2}') local mem_used=$(free -m | awk '/^Mem:/ {print $3}') local mem_free=$(free -m | awk '/^Mem:/ {print $4}') local mem_available=$(free -m | awk '/^Mem:/ {print $7}') local swap_total=$(free -m | awk '/^Swap:/ {print $2}') local swap_used=$(free -m | awk '/^Swap:/ {print $3}') local mem_percent=$((mem_used * 100 / mem_total)) local swap_percent=0 [ "$swap_total" -gt 0 ] && swap_percent=$((swap_used * 100 / swap_total)) # Check for critical memory issues if [ "$mem_percent" -gt 90 ]; then add_issue "CRITICAL" "MEMORY - Critical memory usage" \ "Memory usage: ${mem_used}MB / ${mem_total}MB (${mem_percent}%) Available: ${mem_available}MB" \ "Free up memory immediately. Check top memory consumers." \ 95 elif [ "$mem_percent" -gt 80 ]; then add_issue "HIGH" "MEMORY - High memory usage" \ "Memory usage: ${mem_used}MB / ${mem_total}MB (${mem_percent}%) Available: ${mem_available}MB" \ "Monitor memory usage. Consider adding RAM or optimizing services." \ 75 fi # Check swap usage if [ "$swap_used" -gt 0 ] && [ "$swap_percent" -gt 50 ]; then add_issue "HIGH" "MEMORY - High swap usage" \ "Swap usage: ${swap_used}MB / ${swap_total}MB (${swap_percent}%) System is swapping - performance degradation likely" \ "Identify memory-heavy processes. Reduce memory usage or add RAM." \ 80 fi # Check for OOM events local oom_count=$(dmesg | grep -i "killed process\|out of memory" | wc -l || echo "0") if [ "$oom_count" -gt 0 ]; then local recent_oom=$(dmesg | grep -i "killed process" | tail -3 | sed 's/^/ • /' || echo " • Details not available") add_issue "CRITICAL" "MEMORY - OOM Killer Active" \ "OOM killer invoked ${oom_count} times since boot Recent events: ${recent_oom}" \ "Critical memory pressure. Reduce MySQL/Apache memory limits or add RAM. 
Check: dmesg | grep -i 'killed process'" \ 95 fi # Get top memory consumers local top_mem=$(ps aux --sort=-%mem | head -6 | tail -5 | awk '{printf " • %-15s %6s %s\n", $1, $4"%", $11}') echo "$top_mem" > "$TEMP_DIR/top_memory.txt" } ################################################################################ # Phase 1.5: Memory Configuration Analysis ################################################################################ analyze_memory_config() { print_info "Analyzing memory configuration..." # Check if swap exists local swap_total=$(free -m | awk '/^Swap:/ {print $2}') if [ "$swap_total" -eq 0 ]; then local mem_total_gb=$(($(free -m | awk '/^Mem:/ {print $2}') / 1024)) if [ "$mem_total_gb" -lt 4 ]; then add_issue "HIGH" "MEMORY CONFIG - No swap configured" \ "System has no swap space Total RAM: ${mem_total_gb}GB For servers with <4GB RAM, swap is recommended" \ "Create swap file: • dd if=/dev/zero of=/swapfile bs=1G count=2 • chmod 600 /swapfile • mkswap /swapfile • swapon /swapfile • Add to /etc/fstab: /swapfile none swap sw 0 0" \ 78 else add_issue "MEDIUM" "MEMORY CONFIG - No swap configured" \ "System has no swap space (${mem_total_gb}GB RAM) Swap provides safety net for memory pressure" \ "Consider adding swap even with sufficient RAM." 
\ 55 fi fi # Check swappiness value local swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null || echo "60") if [ "$swappiness" -gt 30 ] && [ "$swap_total" -gt 0 ]; then add_issue "MEDIUM" "MEMORY CONFIG - Swappiness too high" \ "Current swappiness: ${swappiness} Recommended for servers: 10-30 High swappiness causes aggressive swapping, degrading performance" \ "Reduce swappiness: • Temporary: sysctl vm.swappiness=10 • Permanent: echo 'vm.swappiness=10' >> /etc/sysctl.conf • Apply: sysctl -p" \ 65 fi # Check if OOM killer is enabled local oom_kill_allocating=$(cat /proc/sys/vm/oom_kill_allocating_task 2>/dev/null || echo "0") local overcommit_memory=$(cat /proc/sys/vm/overcommit_memory 2>/dev/null || echo "0") if [ "$overcommit_memory" -eq 2 ]; then add_issue "LOW" "MEMORY CONFIG - Overcommit disabled" \ "vm.overcommit_memory = 2 (disabled) OOM killer may not activate, risking system lockup" \ "For most servers, vm.overcommit_memory=0 (heuristic) is recommended. Only use strict accounting (2) if you understand the implications." \ 45 fi # Check for huge pages (can cause memory fragmentation) local transparent_hugepage=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null | grep -oP '\[\K[^\]]+' || echo "unknown") if [ "$transparent_hugepage" = "always" ]; then add_issue "LOW" "MEMORY CONFIG - Transparent Huge Pages enabled" \ "THP can cause memory fragmentation and latency spikes Recommended: madvise or never for database servers" \ "Disable THP: • echo never > /sys/kernel/mm/transparent_hugepage/enabled • Make permanent in /etc/rc.local or systemd" \ 48 fi } ################################################################################ # Phase 2: Disk Analysis ################################################################################ analyze_disk() { print_info "Analyzing disk usage..." 
# Check all mounted filesystems while IFS= read -r line; do local filesystem=$(echo "$line" | awk '{print $1}') local size=$(echo "$line" | awk '{print $2}') local used=$(echo "$line" | awk '{print $3}') local avail=$(echo "$line" | awk '{print $4}') local percent=$(echo "$line" | awk '{print $5}' | tr -d '%') local mount=$(echo "$line" | awk '{print $6}') # Skip tmpfs and other virtual filesystems [[ "$filesystem" =~ ^(tmpfs|devtmpfs|none) ]] && continue if [ "$percent" -gt 95 ]; then # Analyze what's consuming space local analysis="" # Top level directories local top_dirs=$(du -sh "$mount"/* 2>/dev/null | sort -rh | head -5 | sed 's/^/ • /') # Common problem areas - check each local problem_areas="" # Check for large log files if [ -d "$mount/var/log" ]; then local large_logs=$(find "$mount/var/log" -type f -size +100M 2>/dev/null | head -5) if [ -n "$large_logs" ]; then problem_areas="${problem_areas}Large log files found:\n$(echo "$large_logs" | sed 's/^/ • /' | head -3)\n\n" fi fi # Check for email queue (common issue) if [ -d "$mount/var/spool/exim" ]; then local queue_count=$(find "$mount/var/spool/exim/input" -type f 2>/dev/null | wc -l) if [ "$queue_count" -gt 1000 ]; then problem_areas="${problem_areas}Large email queue: ${queue_count} messages\n\n" fi fi # Check for backups in home directories if [ -d "$mount/home" ]; then local backup_size=$(find "$mount/home" -type f \( -name "*.tar.gz" -o -name "*.zip" -o -name "*.sql.gz" -o -name "backup-*" \) -size +100M 2>/dev/null | wc -l) if [ "$backup_size" -gt 0 ]; then problem_areas="${problem_areas}Found ${backup_size} large backup/archive files in /home\n\n" fi fi # Check for old backups in /backup if [ -d "$mount/backup" ]; then local old_backups=$(find "$mount/backup" -type f -mtime +30 -size +100M 2>/dev/null | wc -l) if [ "$old_backups" -gt 0 ]; then problem_areas="${problem_areas}Found ${old_backups} old backups (>30 days, >100MB) in /backup\n\n" fi fi # Check for core dumps local core_dumps=$(find 
"$mount" -maxdepth 3 -name "core.*" -o -name "core" 2>/dev/null | wc -l) if [ "$core_dumps" -gt 5 ]; then problem_areas="${problem_areas}Found ${core_dumps} core dump files\n\n" fi analysis="Top directories by size:\n${top_dirs}\n\n${problem_areas}" add_issue "CRITICAL" "DISK - ${mount} critically full" \ "Filesystem: ${filesystem} Usage: ${used} / ${size} (${percent}%) Available: ${avail} ${analysis}" \ "Free up space immediately: • Review top consumers above • Clean old logs: find /var/log -type f -mtime +30 -delete • Clean package cache: yum clean all • Clean email queue: exim -bp | wc -l (check queue) • Remove old backups: find /backup -mtime +30 -delete • Find large files: find ${mount} -type f -size +100M -exec ls -lh {} \\;" \ 98 elif [ "$percent" -gt 85 ]; then # Quick analysis for high usage local top_dirs=$(du -sh "$mount"/* 2>/dev/null | sort -rh | head -3 | sed 's/^/ • /') add_issue "HIGH" "DISK - ${mount} high usage" \ "Filesystem: ${filesystem} Usage: ${used} / ${size} (${percent}%) Available: ${avail} Top directories: ${top_dirs}" \ "Investigate disk usage: • du -sh ${mount}/* | sort -rh | head -10 • Clean up unnecessary files or expand partition • Check logs: ls -lh /var/log/*.log • Check email queue: exim -bpc (if using Exim)" \ 75 elif [ "$percent" -gt 75 ]; then add_issue "MEDIUM" "DISK - ${mount} approaching capacity" \ "Usage: ${used} / ${size} (${percent}%)" \ "Monitor disk usage. Plan for expansion." 
\ 65 fi done < <(df -h | tail -n +2) # Check inode usage while IFS= read -r line; do local filesystem=$(echo "$line" | awk '{print $1}') local percent=$(echo "$line" | awk '{print $5}' | tr -d '%') local mount=$(echo "$line" | awk '{print $6}') [[ "$filesystem" =~ ^(tmpfs|devtmpfs|none) ]] && continue if [ "$percent" -gt 90 ]; then add_issue "HIGH" "DISK - ${mount} inode exhaustion" \ "Inode usage: ${percent}% Filesystem: ${filesystem}" \ "Find directories with many small files: • find ${mount} -xdev -type d -exec sh -c 'echo \$(ls -A {} | wc -l) {}' \\; | sort -rn | head -10" \ 85 fi done < <(df -i | tail -n +2) # Check for disk errors local disk_errors=$(dmesg | grep -i "I/O error\|sector\|SMART" | wc -l || echo "0") if [ "$disk_errors" -gt 0 ]; then local error_sample=$(dmesg | grep -i "I/O error\|sector\|SMART" | tail -3 | sed 's/^/ • /' || echo " • Check dmesg") add_issue "CRITICAL" "DISK - Hardware errors detected" \ "Found ${disk_errors} disk error messages Recent errors: ${error_sample}" \ "Check SMART status: smartctl -a /dev/sda Backup data immediately if errors persist." \ 92 fi } ################################################################################ # Phase 3: CPU Analysis ################################################################################ analyze_cpu() { print_info "Analyzing CPU usage..." 
local cpu_cores=$(nproc) local load_1min=$(uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | xargs) local load_5min=$(uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $2}' | xargs) local load_15min=$(uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $3}' | xargs) # Calculate load per core local load_per_core=$(echo "$load_1min / $cpu_cores" | bc -l 2>/dev/null | awk '{printf "%.2f", $0}' || echo "0") # Calculate healthy load thresholds local healthy_load=$(echo "$cpu_cores * 0.7" | bc -l | awk '{printf "%.1f", $0}') local warning_load=$(echo "$cpu_cores * 1.0" | bc -l | awk '{printf "%.1f", $0}') local critical_load=$(echo "$cpu_cores * 2.0" | bc -l | awk '{printf "%.1f", $0}') # Detect load trend (increasing, stable, decreasing) local load_trend="stable" if (( $(echo "$load_1min > $load_5min * 1.2" | bc -l) )); then load_trend="increasing rapidly" elif (( $(echo "$load_1min > $load_5min" | bc -l) )); then load_trend="increasing" elif (( $(echo "$load_1min < $load_5min * 0.8" | bc -l) )); then load_trend="decreasing" fi # Check load average with intelligent thresholds if (( $(echo "$load_1min > $critical_load" | bc -l) )); then local top_cpu=$(ps aux --sort=-%cpu | head -6 | tail -5 | awk '{printf " • %-15s %6s %s\n", $1, $3"%", $11}') add_issue "CRITICAL" "CPU - Extreme load" \ "Load average: ${load_1min} / ${load_5min} / ${load_15min} CPU cores: ${cpu_cores} Load per core: ${load_per_core} Trend: ${load_trend} Healthy load for this server: < ${healthy_load} Warning threshold: ${warning_load} Critical threshold: ${critical_load} Top CPU consumers: ${top_cpu}" \ "Immediate action required: 1. Identify runaway processes: ps aux --sort=-%cpu | head -20 2. Kill if necessary: kill -9 [PID] 3. 
Check if under attack: Main Menu → Security → Bot Analyzer" \ 92 elif (( $(echo "$load_1min > $warning_load" | bc -l) )); then local top_cpu=$(ps aux --sort=-%cpu | head -4 | tail -3 | awk '{printf " • %-15s %6s %s\n", $1, $3"%", $11}') add_issue "HIGH" "CPU - High load" \ "Load average: ${load_1min} / ${load_5min} / ${load_15min} CPU cores: ${cpu_cores} Load per core: ${load_per_core} Trend: ${load_trend} Healthy load: < ${healthy_load} Current: ${load_1min} (above warning threshold of ${warning_load}) Top CPU consumers: ${top_cpu}" \ "Monitor and optimize: • Check: ps aux --sort=-%cpu | head -20 • Review high-CPU processes and optimize if possible" \ 76 elif (( $(echo "$load_1min > $healthy_load" | bc -l) )); then add_issue "MEDIUM" "CPU - Elevated load" \ "Load average: ${load_1min} / ${load_5min} / ${load_15min} Healthy threshold: < ${healthy_load} Trend: ${load_trend}" \ "Monitor trends. Load is elevated but not critical yet." \ 62 fi # Get top CPU consumers ps aux --sort=-%cpu | head -11 | tail -10 > "$TEMP_DIR/top_cpu.txt" } ################################################################################ # Phase 4: MySQL/MariaDB Health ################################################################################ analyze_mysql() { print_info "Analyzing MySQL/MariaDB..." # Check if MySQL is running if ! command -v mysql >/dev/null 2>&1; then add_issue "LOW" "MYSQL - Not installed" \ "MySQL/MariaDB not found on system" \ "No action needed if not using databases." \ 20 return fi if ! 
pgrep -x "mysqld|mariadbd" >/dev/null; then add_issue "CRITICAL" "MYSQL - Service not running" \ "MySQL/MariaDB process not found" \ "Start service: systemctl start mysql / mariadb" \ 95 return fi # Get MySQL stats (if we can connect) local mysql_stats=$(mysql -e "SHOW GLOBAL STATUS LIKE 'Threads_connected'; SHOW GLOBAL STATUS LIKE 'Max_used_connections'; SHOW VARIABLES LIKE 'max_connections';" 2>/dev/null) if [ -n "$mysql_stats" ]; then local threads_connected=$(echo "$mysql_stats" | grep "Threads_connected" | awk '{print $2}' || echo "0") local max_used=$(echo "$mysql_stats" | grep "Max_used_connections" | awk '{print $2}' || echo "0") local max_connections=$(echo "$mysql_stats" | grep "max_connections" | awk '{print $2}' || echo "0") local connection_percent=$((threads_connected * 100 / max_connections)) if [ "$connection_percent" -gt 80 ]; then add_issue "HIGH" "MYSQL - Connection limit approaching" \ "Current connections: ${threads_connected} / ${max_connections} (${connection_percent}%) Max used: ${max_used}" \ "Investigate and fix: 1. Find slow queries: Main Menu → Performance → MySQL Query Analyzer 2. Increase max_connections in /etc/my.cnf if needed 3. 
Check for connection leaks: show processlist;" \ 78 fi # Check InnoDB buffer pool vs RAM local innodb_buffer=$(mysql -e "SHOW VARIABLES LIKE 'innodb_buffer_pool_size';" 2>/dev/null | grep innodb | awk '{print $2}' || echo "0") local innodb_buffer_gb=$((innodb_buffer / 1024 / 1024 / 1024)) local mem_total_gb=$(($(free -m | awk '/^Mem:/ {print $2}') / 1024)) if [ "$innodb_buffer_gb" -gt 0 ] && [ "$innodb_buffer_gb" -gt $((mem_total_gb - 2)) ]; then add_issue "HIGH" "MYSQL - InnoDB buffer pool too large" \ "innodb_buffer_pool_size: ${innodb_buffer_gb}GB Total RAM: ${mem_total_gb}GB System has insufficient free RAM for OS and other services" \ "Reduce innodb_buffer_pool_size to ~${mem_total_gb}GB * 0.6 = ~$((mem_total_gb * 6 / 10))GB Edit /etc/my.cnf and restart MySQL" \ 82 fi fi } ################################################################################ # Phase 5: Apache Health ################################################################################ analyze_apache() { print_info "Analyzing Apache..." # Check if Apache is running if ! 
pgrep -x "httpd|apache2" >/dev/null; then add_issue "CRITICAL" "APACHE - Service not running" \ "Apache process not found" \ "Start service: systemctl start httpd / apache2" \ 95 return fi # Check Apache error log for recent issues local apache_error_log="" if [ -f "/var/log/httpd/error_log" ]; then apache_error_log="/var/log/httpd/error_log" elif [ -f "/var/log/apache2/error.log" ]; then apache_error_log="/var/log/apache2/error.log" fi if [ -n "$apache_error_log" ]; then # Check for MaxRequestWorkers limit hits local max_workers_hits=$(grep -c "server reached MaxRequestWorkers" "$apache_error_log" 2>/dev/null || echo "0") max_workers_hits=$(echo "$max_workers_hits" | tr -d '\n\r' | grep -o '[0-9]*' | head -1) max_workers_hits=${max_workers_hits:-0} if [ "$max_workers_hits" -gt 20 ] 2>/dev/null; then add_issue "CRITICAL" "APACHE - MaxRequestWorkers limit hit frequently" \ "Server reached MaxRequestWorkers limit ${max_workers_hits} times This causes connection refusal and 'server busy' errors" \ "Increase MaxRequestWorkers in Apache config (if RAM allows) OR investigate slow PHP scripts / database queries causing workers to hang Check: apachectl -M | grep mpm" \ 88 elif [ "$max_workers_hits" -gt 5 ] 2>/dev/null; then add_issue "HIGH" "APACHE - MaxRequestWorkers limit reached" \ "Limit hit ${max_workers_hits} times" \ "Monitor and consider increasing MaxRequestWorkers." 
\ 72 fi # Check for segfaults local segfaults=$(grep -c "segfault" "$apache_error_log" 2>/dev/null || echo "0") segfaults=$(echo "$segfaults" | tr -d '\n\r' | grep -o '[0-9]*' | head -1) segfaults=${segfaults:-0} if [ "$segfaults" -gt 0 ] 2>/dev/null; then add_issue "HIGH" "APACHE - Segmentation faults detected" \ "Found ${segfaults} segfault events May indicate corrupted modules or memory issues" \ "Check error log: tail -100 ${apache_error_log} | grep segfault Update/reinstall problematic Apache modules" \ 78 fi fi # Check Apache process count local apache_procs=$(pgrep -c "httpd|apache2" || echo "0") if [ "$apache_procs" -gt 200 ]; then add_issue "MEDIUM" "APACHE - High process count" \ "Apache processes: ${apache_procs} May indicate connection buildup or slow backend" \ "Check Apache status: apachectl fullstatus Review MaxRequestWorkers setting" \ 65 fi } ################################################################################ # Phase 6: PHP-FPM Health (cPanel/Plesk) ################################################################################ analyze_php_fpm() { print_info "Analyzing PHP-FPM..." # Check if PHP-FPM is running if ! pgrep -f "php-fpm" >/dev/null; then add_issue "LOW" "PHP-FPM - Not running or not installed" \ "PHP-FPM processes not found May be using mod_php instead" \ "No action needed if using Apache mod_php." 
\ 30 return fi # Count PHP-FPM processes per user local fpm_user_counts=$(ps aux | grep "php-fpm: pool" | grep -v grep | awk '{print $1}' | sort | uniq -c | sort -rn | head -10) while IFS= read -r line; do [ -z "$line" ] && continue local count=$(echo "$line" | awk '{print $1}') local user=$(echo "$line" | awk '{print $2}') if [ -n "$count" ] && [ "$count" -gt 10 ] 2>/dev/null; then # Try to find which domain local user_domain=$(grep "^${user}:" /etc/trueuserdomains 2>/dev/null | cut -d: -f1 || echo "unknown") add_issue "HIGH" "PHP-FPM - User '${user}' has many processes" \ "PHP-FPM processes: ${count} User: ${user} Domain: ${user_domain} Possible causes: Stuck processes, heavy traffic, slow scripts, bot attacks, database issues" \ "Investigate with specialized tools: 1. Check for bot attacks: Main Menu → Security → Bot Analyzer (analyze ${user_domain}) 2. Check for slow MySQL queries: Main Menu → Performance → MySQL Query Analyzer 3. Manual check: ps aux | grep ${user} | grep php-fpm 4. Kill if stuck: pkill -9 -u ${user} php-fpm" \ 76 fi done <<< "$fpm_user_counts" } ################################################################################ # Phase 7: Log Analysis - Security ################################################################################ analyze_security_logs() { print_info "Analyzing security logs..." 
local secure_log="" if [ -f "/var/log/secure" ]; then secure_log="/var/log/secure" elif [ -f "/var/log/auth.log" ]; then secure_log="/var/log/auth.log" fi if [ -n "$secure_log" ]; then # Check for failed SSH attempts local failed_ssh=$(grep "Failed password" "$secure_log" 2>/dev/null | wc -l || echo "0") if [ "$failed_ssh" -gt 100 ]; then local top_ips=$(grep "Failed password" "$secure_log" | awk '{print $(NF-3)}' | sort | uniq -c | sort -rn | head -5 | sed 's/^/ • /') # Check if cPHulkd is available (cPanel) local protection_cmd="" if [ "$SYS_CONTROL_PANEL" = "cpanel" ] && [ -x "/usr/local/cpanel/bin/cphulk_pam_ctl" ]; then # Check if cPHulkd is enabled local cphulk_enabled=$(/usr/local/cpanel/bin/cphulk_pam_ctl --status 2>/dev/null | grep -i "enabled" || echo "disabled") if echo "$cphulk_enabled" | grep -qi "disabled"; then protection_cmd="Enable cPHulk (cPanel's brute force protection): • Via menu: Main Menu → Security → Authentication Security → Enable cPHulk Protection • Via command: bash modules/security/enable-cphulk.sh • Manual enable: /usr/local/cpanel/bin/cphulk_pam_ctl --enable • The setup wizard will automatically import your CSF whitelist to cPHulk" else protection_cmd="cPHulk is enabled. Ensure trusted IPs are whitelisted: • Via menu: Main Menu → Security → Authentication Security → Enable cPHulk Protection • Via command: bash modules/security/enable-cphulk.sh • Manual whitelist: whmapi1 cphulkd_add_whitelist ip=YOUR_IP • View blocked IPs: whmapi1 cphulkd_list_blocks" fi else protection_cmd="Install automatic blocking: yum install fail2ban" fi add_issue "HIGH" "SECURITY - High SSH brute force attempts" \ "Failed SSH login attempts: ${failed_ssh} Top attacking IPs: ${top_ips}" \ "Investigate and block: 1. Analyze attack patterns: Main Menu → Security → Bot Analyzer 2. Block IPs manually: csf -d [IP] 3. 
${protection_cmd}" \ 80 elif [ "$failed_ssh" -gt 50 ]; then add_issue "MEDIUM" "SECURITY - Moderate SSH brute force" \ "Failed attempts: ${failed_ssh}" \ "Monitor and consider IP blocking if it increases." \ 60 fi # Check for successful root logins local root_logins=$(grep "Accepted.*root" "$secure_log" 2>/dev/null | wc -l || echo "0") if [ "$root_logins" -gt 0 ]; then local root_login_ips=$(grep "Accepted.*root" "$secure_log" | awk '{print $(NF-3)}' | sort -u | sed 's/^/ • /') local root_login_times=$(grep "Accepted.*root" "$secure_log" | awk '{print $1, $2, $3}' | tail -5 | sed 's/^/ • /') # Check for unusual IPs (not common admin IPs) add_issue "MEDIUM" "SECURITY - Root SSH logins detected" \ "Successful root logins: ${root_logins} Source IPs: ${root_login_ips} Recent logins: ${root_login_times}" \ "Review if these IPs are authorized: • Disable root SSH: Set 'PermitRootLogin no' in /etc/ssh/sshd_config • Use SSH keys instead of passwords • Check: last | grep root" \ 68 fi # Check for suspicious sudo usage local sudo_attempts=$(grep "sudo.*COMMAND" "$secure_log" 2>/dev/null | wc -l || echo "0") if [ "$sudo_attempts" -gt 100 ]; then local top_sudo_users=$(grep "sudo.*COMMAND" "$secure_log" | awk '{print $5}' | cut -d: -f1 | sort | uniq -c | sort -rn | head -5 | sed 's/^/ • /') add_issue "MEDIUM" "SECURITY - High sudo activity" \ "Sudo command executions: ${sudo_attempts} Top users: ${top_sudo_users}" \ "Review sudo logs for unusual activity: • grep sudo /var/log/secure | tail -50" \ 65 fi fi # Check for potential rootkit indicators local suspicious_processes=$(ps aux | grep -E "\.\.\/\.\.|^\.|\.\.\/" | grep -v grep | wc -l || echo "0") if [ "$suspicious_processes" -gt 0 ]; then local sus_proc_list=$(ps aux | grep -E "\.\.\/\.\.|^\.|\.\.\/" | grep -v grep | head -5 | awk '{print $11}' | sed 's/^/ • /') add_issue "CRITICAL" "SECURITY - Suspicious processes detected" \ "Found ${suspicious_processes} processes with suspicious paths: ${sus_proc_list} May indicate 
rootkit or malware" \ "Run rootkit scanner immediately: • Install chkrootkit: yum install chkrootkit • Or rkhunter: yum install rkhunter • Check processes: ps aux | grep -E '\.\./\.\.'" \ 95 fi } ################################################################################ # Phase 8: System Messages Log ################################################################################ analyze_messages_log() { print_info "Analyzing system messages..." if [ ! -f "/var/log/messages" ]; then return fi # Check for kernel panics local kernel_panics=$(grep "kernel panic\|Oops:" /var/log/messages 2>/dev/null | wc -l) if [ -n "$kernel_panics" ] && [ "$kernel_panics" -gt 0 ] 2>/dev/null; then add_issue "CRITICAL" "SYSTEM - Kernel panics detected" \ "Found ${kernel_panics} kernel panic events System stability compromised" \ "Review: grep 'kernel panic' /var/log/messages Update kernel or investigate hardware issues" \ 98 fi # Check for hardware errors local hw_errors=$(grep "Hardware Error\|MCE\|ECC" /var/log/messages 2>/dev/null | wc -l) if [ -n "$hw_errors" ] && [ "$hw_errors" -gt 0 ] 2>/dev/null; then # Get actual error samples local error_samples=$(grep -E "Hardware Error|MCE|ECC" /var/log/messages 2>/dev/null | tail -5 | sed 's/^/ /') add_issue "HIGH" "SYSTEM - Hardware errors detected" \ "Found ${hw_errors} hardware error messages May indicate failing hardware (RAM, CPU, disk) Recent errors: ${error_samples}" \ "Investigate hardware health: • Check RAM: dmidecode -t memory | grep -E 'Size|Type|Speed|Error' • Check disk SMART: smartctl -a /dev/sda | grep -E 'Error|Failed|Health' • Check dmesg: dmesg | grep -i 'error\|fail' | tail -20 • Run memtest86+ if RAM errors detected • Contact hosting provider if persistent" \ 85 fi # Check for network issues local net_errors=$(grep "link is down\|no carrier" /var/log/messages 2>/dev/null | wc -l) if [ -n "$net_errors" ] && [ "$net_errors" -gt 5 ] 2>/dev/null; then add_issue "MEDIUM" "NETWORK - Connection issues" \ "Found 
${net_errors} network link issues" \ "Check network cable/switch, review: ip link show" \ 62 fi } ################################################################################ # Phase 9: Cron Job Health ################################################################################ analyze_cron() { print_info "Analyzing cron jobs..." local cron_log="/var/log/cron" if [ ! -f "$cron_log" ]; then return fi # Check for failed cron jobs (last 24 hours) local cron_errors=$(grep "$(date -d '24 hours ago' '+%b %d')" "$cron_log" 2>/dev/null | grep "error\|failed\|No such file" | wc -l) if [ -n "$cron_errors" ] && [ "$cron_errors" -gt 20 ] 2>/dev/null; then local error_sample=$(grep "$(date -d '24 hours ago' '+%b %d')" "$cron_log" 2>/dev/null | grep "error\|failed" | tail -5 | sed 's/^/ • /') add_issue "HIGH" "CRON - Many failed jobs" \ "Found ${cron_errors} cron errors in last 24h Sample errors: ${error_sample}" \ "Review cron logs: tail -100 /var/log/cron Fix failing scripts or disable problematic jobs" \ 75 elif [ -n "$cron_errors" ] && [ "$cron_errors" -gt 5 ] 2>/dev/null; then add_issue "MEDIUM" "CRON - Some failed jobs" \ "Found ${cron_errors} cron errors in last 24h" \ "Review: grep error /var/log/cron | tail -20" \ 60 fi } ################################################################################ # Phase 10: Network Analysis ################################################################################ analyze_network() { print_info "Analyzing network..." 
# Check nf_conntrack usage if [ -f "/proc/sys/net/netfilter/nf_conntrack_count" ]; then local conntrack_count=$(cat /proc/sys/net/netfilter/nf_conntrack_count 2>/dev/null || echo "0") local conntrack_max=$(cat /proc/sys/net/netfilter/nf_conntrack_max 2>/dev/null || echo "100000") local conntrack_percent=$((conntrack_count * 100 / conntrack_max)) if [ "$conntrack_percent" -gt 90 ]; then add_issue "CRITICAL" "NETWORK - Connection tracking table near limit" \ "nf_conntrack usage: ${conntrack_count} / ${conntrack_max} (${conntrack_percent}%) New connections may be dropped" \ "Increase limit: • sysctl -w net.netfilter.nf_conntrack_max=$((conntrack_max * 2)) • Add to /etc/sysctl.conf: net.netfilter.nf_conntrack_max=$((conntrack_max * 2)) • Or investigate connection leaks" \ 88 elif [ "$conntrack_percent" -gt 75 ]; then add_issue "HIGH" "NETWORK - Connection tracking table high" \ "nf_conntrack usage: ${conntrack_count} / ${conntrack_max} (${conntrack_percent}%)" \ "Monitor and consider increasing limit if it continues to grow." 
\ 72 fi fi # Check for TCP retransmissions local tcp_retrans=$(netstat -s 2>/dev/null | grep "segments retransmitted" | awk '{print $1}' || echo "0") tcp_retrans=$(echo "$tcp_retrans" | tr -d '\n\r' | grep -o '[0-9]*' | head -1) tcp_retrans=${tcp_retrans:-0} local tcp_out=$(netstat -s 2>/dev/null | grep "segments sent out" | awk '{print $1}' || echo "1") tcp_out=$(echo "$tcp_out" | tr -d '\n\r' | grep -o '[0-9]*' | head -1) tcp_out=${tcp_out:-1} if [ "$tcp_out" -gt 1000000 ] 2>/dev/null; then local retrans_percent=$(echo "scale=2; $tcp_retrans * 100 / $tcp_out" | bc 2>/dev/null || echo "0") if (( $(echo "$retrans_percent > 5" | bc -l 2>/dev/null) )); then # Get current MTU local current_mtu=$(ip link show $(ip route | grep default | awk '{print $5}' | head -1) 2>/dev/null | grep mtu | awk '{print $5}') add_issue "HIGH" "NETWORK - High TCP retransmission rate" \ "Retransmissions: ${retrans_percent}% of total segments (${tcp_retrans} of ${tcp_out}) This indicates network issues, congestion, or bandwidth problems Common causes: • Network congestion or packet loss • MTU mismatch (current: ${current_mtu}) • Bandwidth saturation • Faulty network hardware • ISP/hosting provider issues" \ "Diagnose and resolve network issues: 1. Test packet loss: ping -c 100 8.8.8.8 | grep loss 2. Check MTU: ip link show | grep mtu (should be 1500 for most networks) 3. Monitor bandwidth: install vnstat for tracking (yum install vnstat) 4. Real-time monitor: iftop -i eth0 (install: yum install iftop) 5. Check provider status: Contact hosting provider if persistent 6. 
Analyze traffic: Main Menu → Security → Bot Analyzer (check for attacks) NOTE: For detailed bandwidth analysis, use: Main Menu → Network Diagnostics (when available)" \ 74 fi fi # Check listen queue overflows local listen_overflows=$(netstat -s 2>/dev/null | grep "times the listen queue of a socket overflowed" | awk '{print $1}' | head -1) [ -z "$listen_overflows" ] && listen_overflows=0 if [ "$listen_overflows" -gt 100 ] 2>/dev/null; then add_issue "MEDIUM" "NETWORK - Listen queue overflows detected" \ "Listen queue overflows: ${listen_overflows} Applications may be dropping connections" \ "Increase net.core.somaxconn: • sysctl -w net.core.somaxconn=4096 • Add to /etc/sysctl.conf" \ 68 fi }

################################################################################
# Phase 11: Time/NTP Analysis
################################################################################

# Check that a time-sync daemon (chronyd or ntpd) is running and, for chronyd,
# that the offset reported by `chronyc tracking` does not exceed one second.
# Findings are recorded through add_issue.
analyze_time() {
  print_info "Analyzing time synchronization..."

  # Check if chronyd or ntpd is running
  local time_service=""
  if pgrep -x chronyd >/dev/null; then
    time_service="chronyd"
  elif pgrep -x ntpd >/dev/null; then
    time_service="ntpd"
  fi

  if [ -z "$time_service" ]; then
    add_issue "MEDIUM" "TIME - No NTP service running" \
"Neither chronyd nor ntpd is running
Time drift can cause SSL certificate errors and authentication issues" \
"Install and start NTP service:
  • AlmaLinux/RHEL: yum install chrony && systemctl enable --now chronyd
  • Check sync: chronyc tracking" \
      66
    return
  fi

  # The offset check is only implemented for chronyd.
  [ "$time_service" = "chronyd" ] || return 0

  # Query chronyc once and derive both values from the same line.
  local tracking_line sync_status offset_seconds
  tracking_line=$(chronyc tracking 2>/dev/null | grep "System time")
  sync_status=$(awk '{print $4, $5}' <<<"$tracking_line")
  offset_seconds=$(awk '{print $4}' <<<"$tracking_line")

  # Convert to absolute value for comparison
  offset_seconds=${offset_seconds#-}

  # awk does the floating-point comparison so the check does not depend on bc;
  # an empty offset evaluates to 0 and is treated as "in sync".
  if awk -v offset="$offset_seconds" 'BEGIN { exit !(offset + 0 > 1) }'; then
    add_issue "HIGH" "TIME - Clock offset detected" \
"Time offset: ${sync_status}
Significant time drift detected" \
"Force time sync:
  • chronyc -a makestep
  • Check sources: chronyc sources" \
      75
  fi
}

################################################################################
# Phase 12: System Updates & Kernel
################################################################################

# Compare the running kernel with the newest installed kernel package, count
# pending security advisories via yum, and note the control panel version in
# $TEMP_DIR/system_info.txt.
analyze_updates() {
  print_info "Analyzing system updates..."

  # Check if running kernel matches installed kernel.
  # Start from the running kernel so that a missing rpm binary or an
  # unpackaged kernel does not produce a false "reboot required".
  local running_kernel installed_kernel newest_pkg
  running_kernel=$(uname -r)
  installed_kernel=$running_kernel
  if newest_pkg=$(rpm -q kernel --last 2>/dev/null) && [ -n "$newest_pkg" ]; then
    installed_kernel=$(printf '%s\n' "$newest_pkg" \
      | awk 'NR == 1 { sub(/^kernel-/, "", $1); print $1 }')
  fi

  if [ "$running_kernel" != "$installed_kernel" ]; then
    add_issue "MEDIUM" "SYSTEM - Reboot required" \
"Running kernel: ${running_kernel}
Installed kernel: ${installed_kernel}
System needs reboot to use new kernel" \
"Schedule maintenance window and reboot:
  • Check uptime: uptime
  • Reboot when ready: reboot" \
      64
  fi

  # Check for available security updates (if yum/dnf)
  if command -v yum >/dev/null 2>&1; then
    # Advisory IDs start with a vendor prefix: FEDORA-, RHSA-, and the
    # analogous "<vendor>SA-" prefixes used by RHEL rebuilds.
    local security_updates
    security_updates=$(yum updateinfo list security 2>/dev/null \
      | grep -cE '^(FEDORA|[A-Z]+SA)-')
    security_updates=${security_updates:-0}

    if [ "$security_updates" -gt 10 ] 2>/dev/null; then
      add_issue "HIGH" "SYSTEM - Many security updates available" \
"Security updates available: ${security_updates}
System may be vulnerable" \
"Apply security updates:
  • yum update --security
  • Or schedule full update: yum update" \
        76
    elif [ "$security_updates" -gt 0 ] 2>/dev/null; then
      add_issue "MEDIUM" "SYSTEM - Security updates available" \
        "Security updates: ${security_updates}" \
        "Review and apply: yum updateinfo list security" \
        58
    fi
  fi

  # Check for control panel version
  if [ -n "$SYS_CONTROL_PANEL_VERSION" ]; then
    local panel_label=""
    case "$SYS_CONTROL_PANEL" in
      cpanel) panel_label="cPanel" ;;
      plesk) panel_label="Plesk" ;;
      interworx) panel_label="InterWorx" ;;
    esac
    if [ -n "$panel_label" ]; then
      echo "$panel_label version: $SYS_CONTROL_PANEL_VERSION" >> "$TEMP_DIR/system_info.txt"
    fi
  fi
}

################################################################################
# Phase 13: File Limits & Descriptors
################################################################################

# Report when system-wide file descriptor usage exceeds 80% of the maximum
# read from /proc/sys/fs/file-nr, and when the first httpd/mysqld/php-fpm
# process has more than 10000 entries in its lsof listing.
analyze_file_limits() {
  print_info "Analyzing file descriptor limits..."

  # Check system-wide file descriptor usage
  local file_nr used_fds max_fds fd_percent=0
  file_nr=$(cat /proc/sys/fs/file-nr 2>/dev/null || echo "0 0 100000")
  read -r used_fds _ max_fds <<<"$file_nr"

  # Guard the division: an empty or zero maximum would abort the arithmetic.
  if [ "${max_fds:-0}" -gt 0 ] 2>/dev/null; then
    fd_percent=$((used_fds * 100 / max_fds))
  fi

  if [ "$fd_percent" -gt 80 ]; then
    add_issue "HIGH" "FILE DESCRIPTORS - System limit approaching" \
"Open file descriptors: ${used_fds} / ${max_fds} (${fd_percent}%)
Applications may fail to open files/sockets" \
"Increase limit:
  • sysctl -w fs.file-max=$((max_fds * 2))
  • Add to /etc/sysctl.conf: fs.file-max=$((max_fds * 2))
  • Check per-process limits: ulimit -n" \
      78
  fi

  # Check for common services with high FD usage
  local service service_pid service_fds
  for service in httpd mysqld php-fpm; do
    # Only the first matching PID is inspected.
    service_pid=$(pgrep -x "$service" | head -1)
    [ -n "$service_pid" ] || continue

    service_fds=$(lsof -p "$service_pid" 2>/dev/null | wc -l)
    if [ "${service_fds:-0}" -gt 10000 ]; then
      add_issue "MEDIUM" "$service - High file descriptor usage" \
"Service: $service
Open file descriptors: ${service_fds}
May indicate connection/file leaks" \
"Investigate $service:
  • lsof -p \$(pgrep -x $service | head -1) | head -100
  • Check for file/connection leaks" \
        65
    fi
  done
}

################################################################################
# Phase 14: Email Queue Analysis
################################################################################

# Determine the mail queue size for Exim, Postfix or Sendmail (first one
# found) and report it at 100 / 1000 / 5000 message thresholds.
analyze_email_queue() {
  print_info "Analyzing email queue..."

  local queue_count=0
  local mail_system=""

  # Check for Exim (most common on cPanel)
  if command -v exim >/dev/null 2>&1; then
    mail_system="Exim"
    queue_count=$(exim -bpc 2>/dev/null || echo "0")
  # Check for Postfix
  elif command -v postqueue >/dev/null 2>&1; then
    mail_system="Postfix"
    queue_count=$(postqueue -p 2>/dev/null | tail -1 | awk '{print $5}')
  # Check for Sendmail
  elif command -v mailq >/dev/null 2>&1; then
    mail_system="Sendmail"
    # grep -c already prints 0 when nothing matches; appending "|| echo 0"
    # would yield two lines and break the numeric tests below.
    queue_count=$(mailq 2>/dev/null | grep -c "^[A-Z]")
  fi

  # Anything that is not a plain non-negative integer counts as an empty queue.
  case "$queue_count" in
    '' | *[!0-9]*) queue_count=0 ;;
  esac

  if [ "$queue_count" -gt 5000 ]; then
    # Get sample of queued messages
    local queue_sample=""
    if [ "$mail_system" = "Exim" ]; then
      queue_sample=$(exim -bp 2>/dev/null | head -20 | sed 's/^/ /')
    fi

    add_issue "CRITICAL" "EMAIL - Massive mail queue" \
"Mail system: ${mail_system}
Queue size: ${queue_count} messages
This can consume disk space and cause slow mail delivery
Sample:
${queue_sample}" \
"Investigate and clear queue:
  • Check for spam/compromised accounts
  • Review: exim -bp | head -50 (for Exim)
  • Clear specific messages: exim -Mrm [message-id]
  • Force delivery attempts: exim -qff
  • Check for frozen messages: exim -bp | grep frozen" \
      92
  elif [ "$queue_count" -gt 1000 ]; then
    add_issue "HIGH" "EMAIL - Large mail queue" \
"Mail system: ${mail_system}
Queue size: ${queue_count} messages" \
"Review mail queue:
  • exim -bp | less (for Exim)
  • mailq | less (for Postfix/Sendmail)
  • Check for spam/compromised accounts
  • Review /var/log/exim_mainlog for errors" \
      78
  elif [ "$queue_count" -gt 100 ]; then
    add_issue "MEDIUM" "EMAIL - Growing mail queue" \
      "Queue size: ${queue_count} messages (${mail_system})" \
      "Monitor queue. May indicate delivery issues or spam." \
      58
  fi
}

################################################################################
# Phase 15: I/O Wait Analysis
################################################################################

# Print the I/O wait percentage from one batch iteration of top, or nothing
# if it cannot be parsed. The value is located by its "wa" label rather than
# by field position, so it works for both "0.3 wa" and "0.3%wa" layouts and
# is not thrown off when a neighbouring column has no separating space.
_hc_iowait_percent() {
  LC_ALL=C top -bn1 2>/dev/null \
    | grep "Cpu(s)" \
    | grep -oE '[0-9]+(\.[0-9]+)?[ %]*wa' \
    | head -1 \
    | grep -oE '^[0-9]+(\.[0-9]+)?'
}

# Report elevated CPU I/O wait at 5% / 15% / 30% thresholds. Uses top first
# and falls back to iostat; returns silently if neither yields a value.
analyze_iowait() {
  print_info "Analyzing disk I/O performance..."

  # Get current I/O wait from top
  local iowait
  iowait=$(_hc_iowait_percent)

  if [ -z "$iowait" ]; then
    # Try alternative method with iostat if available
    if command -v iostat >/dev/null 2>&1; then
      # Take the 4th column of the last numeric row (iostat ends with a
      # blank line, so the last line itself is not usable).
      iowait=$(LC_ALL=C iostat -c 1 2 2>/dev/null \
        | awk 'NF && $1 ~ /^[0-9]/ { v = $4 } END { print v }')
    else
      return
    fi
  fi

  # Drop the fractional part for integer comparison
  local iowait_int=${iowait%%.*}
  case "$iowait_int" in
    '' | *[!0-9]*) iowait_int=0 ;;
  esac

  if [ "$iowait_int" -gt 30 ]; then
    add_issue "CRITICAL" "DISK I/O - Extremely high I/O wait" \
"Current I/O wait: ${iowait}%
System is waiting on disk operations - extreme performance impact" \
"Identify I/O-heavy processes:
  • iotop -o (show only active I/O)
  • iostat -x 1 (detailed disk stats)
  • Check for failing drives: dmesg | grep -i error
  • Check disk health: smartctl -a /dev/sda" \
      95
  elif [ "$iowait_int" -gt 15 ]; then
    add_issue "HIGH" "DISK I/O - High I/O wait" \
"I/O wait: ${iowait}%
System performance degraded by disk operations" \
"Monitor disk I/O:
  • iostat -x 1 5 (watch for 5 seconds)
  • iotop (if installed: yum install iotop)
  • Check for large file operations
  • Review disk usage and fragmentation" \
      76
  elif [ "$iowait_int" -gt 5 ]; then
    add_issue "MEDIUM" "DISK I/O - Elevated I/O wait" \
      "I/O wait: ${iowait}%" \
      "Monitor disk performance. May indicate heavy disk activity." \
      55
  fi
}

################################################################################
# Phase 16: SELinux Analysis
################################################################################

# When SELinux is enforcing, count today's "denied" records in the audit log
# and report at >10 and >50 denials.
analyze_selinux() {
  print_info "Analyzing SELinux status..."

  # Check if SELinux is enabled
  if ! command -v getenforce >/dev/null 2>&1; then
    return
  fi

  local selinux_status
  selinux_status=$(getenforce 2>/dev/null || echo "Disabled")
  [ "$selinux_status" = "Enforcing" ] || return 0

  # Check for recent denials.
  # Audit records are stamped as "msg=audit(<epoch>.<ms>:<serial>)" and carry
  # no "Mon DD" text, so select today's records by comparing the epoch with
  # local midnight.
  local audit_log="/var/log/audit/audit.log"
  local denial_lines="" midnight
  if [ -f "$audit_log" ]; then
    midnight=$(date -d 'today 00:00' +%s 2>/dev/null)
    denial_lines=$(grep "denied" "$audit_log" 2>/dev/null \
      | awk -v since="${midnight:-0}" '
          match($0, /audit\([0-9]+/) {
            if (substr($0, RSTART + 6, RLENGTH - 6) + 0 >= since) print
          }')
  fi

  local denials_count
  denials_count=$(printf '%s' "$denial_lines" | grep -c '')

  if [ "$denials_count" -gt 50 ]; then
    local denial_sample
    denial_sample=$(printf '%s\n' "$denial_lines" | tail -5 | sed 's/^/ /')

    add_issue "HIGH" "SELINUX - Many denials detected" \
"SELinux denials today: ${denials_count}
SELinux is blocking operations - may cause application failures
Recent denials:
${denial_sample}" \
"Review and fix SELinux policies:
  • ausearch -m avc -ts today
  • audit2allow -a (generate policy)
  • audit2why -a (explain denials)
  • Temporarily: setenforce 0 (NOT recommended for production)" \
      82
  elif [ "$denials_count" -gt 10 ]; then
    add_issue "MEDIUM" "SELINUX - Some denials detected" \
      "SELinux denials today: ${denials_count}" \
      "Review: ausearch -m avc -ts today" \
      62
  fi
}

################################################################################
# Phase 17: Control Panel Services
################################################################################

# Check that the control panel's own services, the web server and the
# database server are running.
analyze_control_panel_services() {
  print_info "Analyzing control panel services..."

  # The rest of this script identifies the panel through SYS_CONTROL_PANEL;
  # SYS_PANEL is honoured first in case the detection library still sets it.
  local panel="${SYS_PANEL:-${SYS_CONTROL_PANEL:-}}"
  local panel_issues=""
  local service

  case "$panel" in
    cpanel)
      # Key cPanel services to check
      # NOTE(review): confirm each name is a systemd unit on current cPanel.
      local cpanel_services=("cpanel" "whostmgrd" "cpsrvd" "tailwatchd" "dnsadmin")

      for service in "${cpanel_services[@]}"; do
        if ! systemctl is-active --quiet "$service" 2>/dev/null; then
          # Real newline: a literal "\n" would be printed verbatim by echo.
          panel_issues+="  • ${service} is not running"$'\n'
        fi
      done

      if [ -n "$panel_issues" ]; then
        add_issue "HIGH" "CPANEL - Critical services down" \
"The following cPanel services are not running:
${panel_issues}This affects control panel functionality" \
"Restart services:
  • systemctl restart cpanel
  • /scripts/restartsrv_cpanel
  • Check logs: tail -50 /usr/local/cpanel/logs/error_log" \
          85
      fi
      ;;
    plesk)
      if ! systemctl is-active --quiet psa 2>/dev/null; then
        add_issue "HIGH" "PLESK - Panel service down" \
          "Plesk service is not running" \
          "Restart Plesk: systemctl restart psa" \
          85
      fi
      ;;
  esac

  # Check web server
  if ! pgrep -x httpd >/dev/null && ! pgrep -x apache2 >/dev/null; then
    add_issue "CRITICAL" "WEB SERVER - Apache/httpd not running" \
"Web server process not found
All websites are down" \
"Start web server:
  • systemctl restart httpd (CentOS/AlmaLinux)
  • systemctl restart apache2 (Debian/Ubuntu)
  • Check logs: tail -50 /var/log/httpd/error_log" \
      98
  fi

  # Check database server
  if ! pgrep -x mysqld >/dev/null && ! pgrep -x mariadbd >/dev/null; then
    add_issue "CRITICAL" "DATABASE - MySQL/MariaDB not running" \
"Database server process not found
Database-driven sites are down" \
"Start database:
  • systemctl restart mariadb
  • systemctl restart mysql
  • Check logs: tail -50 /var/log/mariadb/mariadb.log
  • Check disk space in /var/lib/mysql" \
      98
  fi
}

################################################################################
# Phase 18: DNS Resolution Check
################################################################################

# Resolve a few well-known names and report failures, or slow lookups when
# more than one takes longer than two seconds.
analyze_dns() {
  print_info "Analyzing DNS resolution..."

  # Pick a lookup tool; without one the check is skipped, because a missing
  # binary must not be reported as a DNS outage.
  local -a resolve_cmd
  if command -v host >/dev/null 2>&1; then
    resolve_cmd=(host)
  elif command -v getent >/dev/null 2>&1; then
    resolve_cmd=(getent hosts)
  else
    return
  fi

  # Test resolving a few critical domains
  local test_domains=("google.com" "cloudflare.com" "8.8.8.8")
  local failed_count=0
  local slow_count=0
  local failed_domains=""
  local domain start_time end_time duration_ms

  for domain in "${test_domains[@]}"; do
    start_time=$(date +%s%N)
    if ! "${resolve_cmd[@]}" "$domain" >/dev/null 2>&1; then
      failed_count=$((failed_count + 1))
      failed_domains+="  • ${domain}"$'\n'
    else
      end_time=$(date +%s%N)
      duration_ms=$(((end_time - start_time) / 1000000))
      if [ "$duration_ms" -gt 2000 ]; then
        slow_count=$((slow_count + 1))
      fi
    fi
  done

  if [ "$failed_count" -gt 0 ]; then
    add_issue "CRITICAL" "DNS - Resolution failures detected" \
"Failed to resolve ${failed_count} test domains:
${failed_domains}DNS issues cause slow loading and failures across all services" \
"Check DNS configuration:
  • cat /etc/resolv.conf
  • Test: dig google.com
  • Try alternate DNS: echo 'nameserver 8.8.8.8' >> /etc/resolv.conf
  • Restart networking: systemctl restart network" \
      94
  elif [ "$slow_count" -gt 1 ]; then
    add_issue "HIGH" "DNS - Slow resolution detected" \
"DNS queries taking >2 seconds
This slows down all network operations" \
"Check DNS servers:
  • cat /etc/resolv.conf
  • Consider faster DNS: 8.8.8.8, 1.1.1.1
  • Test: dig @8.8.8.8 google.com" \
      72
  fi
}

################################################################################
# Phase 19: Zombie Process Check
################################################################################

# Count zombie (defunct) processes and report at >10 and >50.
analyze_zombie_processes() {
  print_info "Analyzing zombie processes..."

  # One ps call feeds both the count and the sample; the columns are
  # state, PID, parent PID and command name.
  local zombie_rows zombie_count
  zombie_rows=$(ps -eo stat=,pid=,ppid=,comm= 2>/dev/null | awk '$1 ~ /^Z/')
  zombie_count=$(printf '%s' "$zombie_rows" | grep -c '')

  if [ "$zombie_count" -gt 50 ]; then
    local zombie_sample
    zombie_sample=$(printf '%s\n' "$zombie_rows" | head -5 \
      | awk '{print "  • " $4 " (PID " $2 ", PPID " $3 ")"}')

    add_issue "HIGH" "PROCESSES - Many zombie processes detected" \
"Zombie (defunct) processes: ${zombie_count}
Indicates parent processes not properly cleaning up children
Sample zombies:
${zombie_sample}" \
"Investigate parent processes:
  • ps aux | awk '\$8 ~ /Z/'
  • pstree -p | grep defunct
  • Kill parent process or reboot if persistent
  • Common causes: Apache, PHP-FPM, custom scripts" \
      78
  elif [ "$zombie_count" -gt 10 ]; then
    add_issue "MEDIUM" "PROCESSES - Zombie processes detected" \
"Zombie processes: ${zombie_count}
May indicate stuck parent processes" \
      "Review: ps aux | awk '\$8 ~ /Z/'" \
      58
  fi
}

################################################################################
# Phase 20: Firewall Status Check
################################################################################

# Detect an active firewall (CSF, then firewalld, then plain iptables) and
# report testing mode, very large deny lists, near-empty rule sets, or the
# absence of any firewall.
analyze_firewall() {
  print_info "Analyzing firewall status..."

  local firewall_active=0

  # Check for CSF (ConfigServer Security & Firewall)
  if [ -x "/usr/sbin/csf" ] && csf -l >/dev/null 2>&1; then
    firewall_active=1

    # Check if CSF is in testing mode
    if grep -q "TESTING = \"1\"" /etc/csf/csf.conf 2>/dev/null; then
      add_issue "MEDIUM" "FIREWALL - CSF in testing mode" \
"CSF firewall is in TESTING mode
Blocks will auto-expire - not suitable for production" \
"Disable testing mode:
  • Edit /etc/csf/csf.conf
  • Set TESTING = \"0\"
  • Restart: csf -r" \
        62
    fi

    # Check for high deny count (might indicate attack).
    # grep -c prints 0 by itself when there is no input.
    local deny_count
    deny_count=$(csf -d 2>/dev/null | grep -c "^")
    if [ "${deny_count:-0}" -gt 1000 ]; then
      add_issue "MEDIUM" "FIREWALL - Many blocked IPs" \
"CSF has ${deny_count} denied IPs
Server may be under attack or CSF may need tuning" \
"Review blocked IPs:
  • csf -d | less
  • Check for false positives
  • Consider: Main Menu → Security → Bot Analyzer" \
        65
    fi
  fi

  # Check firewalld before the generic iptables probe: "iptables -L" succeeds
  # whenever the binary works, which would otherwise hide firewalld.
  if [ "$firewall_active" -eq 0 ] \
    && systemctl is-active --quiet firewalld 2>/dev/null; then
    firewall_active=1
  fi

  # Check iptables
  if [ "$firewall_active" -eq 0 ]; then
    if systemctl is-active --quiet iptables 2>/dev/null || iptables -L >/dev/null 2>&1; then
      firewall_active=1

      # Count real rules ("-A ..." lines); chain and column headers are
      # present even in an empty rule set and say nothing about protection.
      local rule_count
      rule_count=$(iptables -S 2>/dev/null | grep -c '^-A')
      if [ "${rule_count:-0}" -lt 5 ]; then
        add_issue "MEDIUM" "FIREWALL - iptables active but minimal rules" \
"iptables is running but has very few rules
Server may not be properly protected" \
          "Review firewall rules: iptables -L -n -v" \
          55
      fi
    fi
  fi

  # Warn if no firewall detected
  if [ "$firewall_active" -eq 0 ]; then
    add_issue "HIGH" "FIREWALL - No active firewall detected" \
"No firewall found (CSF, iptables, firewalld)
Server is exposed to attacks" \
"Install and configure a firewall:
  • CSF (recommended for cPanel):
    cd /usr/src && wget https://download.configserver.com/csf.tgz
    tar -xzf csf.tgz && cd csf && sh install.sh
  • Or enable firewalld: systemctl enable --now firewalld" \
      82
  fi
}

################################################################################
# Phase 21: Network Connectivity Check
################################################################################

# Ping two public resolvers and attempt one HTTPS request to verify outbound
# connectivity.
analyze_network_connectivity() {
  print_info "Analyzing network connectivity..."

  # Test outbound connectivity
  local connectivity_failed=0
  local test_ips=("8.8.8.8" "1.1.1.1")
  local ip

  for ip in "${test_ips[@]}"; do
    if ! ping -c 1 -W 2 "$ip" >/dev/null 2>&1; then
      connectivity_failed=$((connectivity_failed + 1))
    fi
  done

  if [ "$connectivity_failed" -eq "${#test_ips[@]}" ]; then
    add_issue "CRITICAL" "NETWORK - No internet connectivity" \
"Cannot reach external IPs (${test_ips[*]})
Server has no internet access - critical services affected" \
"Check network:
  • ip link show (check interfaces)
  • ip route show (check routing)
  • systemctl status network
  • Check gateway: ping gateway IP
  • Contact hosting provider if persistent" \
      96
  elif [ "$connectivity_failed" -gt 0 ]; then
    add_issue "HIGH" "NETWORK - Intermittent connectivity issues" \
"Some external IPs unreachable
Network may be unstable" \
"Check network stability:
  • ping -c 10 8.8.8.8
  • mtr 8.8.8.8 (install: yum install mtr)
  • Check: ip route show" \
      74
  fi

  # Test HTTP/HTTPS connectivity (can the server download updates?)
  # Skipped when curl is absent so a missing tool is not reported as an outage.
  if command -v curl >/dev/null 2>&1 \
    && ! timeout 5 curl -s -o /dev/null https://google.com 2>/dev/null; then
    add_issue "MEDIUM" "NETWORK - HTTP/HTTPS connectivity issues" \
"Cannot establish HTTPS connections
May affect updates, let's encrypt, external API calls" \
"Test connectivity:
  • curl -v https://google.com
  • Check firewall rules
  • Check proxy settings: echo \$http_proxy" \
      68
  fi
}

################################################################################
# Phase 22: CloudLinux Specific Checks
################################################################################

# On CloudLinux hosts (SYS_CLOUDLINUX=yes) with lvectl installed, report the
# first lines of the fault-sorted LVE listing.
analyze_cloudlinux() {
  if [ "$SYS_CLOUDLINUX" != "yes" ]; then
    return
  fi

  print_info "Analyzing CloudLinux LVE limits..."

  # Check if lvectl exists
  if ! command -v lvectl >/dev/null 2>&1; then
    return
  fi

  # Get users hitting LVE limits
  local lve_faults
  lve_faults=$(lvectl list --by-fault 2>/dev/null | head -10)

  if [ -n "$lve_faults" ]; then
    local top_faults
    top_faults=$(echo "$lve_faults" | head -5 | sed 's/^/ • /')

    add_issue "HIGH" "CLOUDLINUX - Users hitting LVE limits" \
"Top users hitting resource limits:
${top_faults}
This causes 503 errors and slow websites" \
"Review limits: lvectl list
Increase limits for affected users or optimize their sites:
  • lvectl set [USER] --cpu=200 --pmem=2G" \
      78
  fi
}

################################################################################
# Main Analysis Function
################################################################################

# Run every analysis phase in order; each phase records its findings through
# add_issue.
run_analysis() {
  clear
  print_banner "System Health Check"

  echo ""
  print_info "Starting comprehensive system analysis..."
  echo ""

  # Run all analysis phases
  analyze_memory
  analyze_memory_config
  analyze_disk
  analyze_cpu
  analyze_mysql
  analyze_apache
  analyze_php_fpm
  analyze_security_logs
  analyze_messages_log
  analyze_cron
  analyze_network
  analyze_time
  analyze_updates
  analyze_file_limits
  analyze_email_queue
  analyze_iowait
  analyze_selinux
  analyze_control_panel_services
  analyze_dns
  analyze_zombie_processes
  analyze_firewall
  analyze_network_connectivity
  analyze_cloudlinux

  print_success "Analysis complete!"
  echo ""
}

################################################################################
# Report Generation
################################################################################

# Print one severity section of the report.
# Arguments: $1 - section heading; remaining arguments - the issue texts.
# Prints nothing when no issues are passed.
_hc_print_issue_section() {
  local heading=$1
  shift
  [ "$#" -gt 0 ] || return 0

  echo "=============================================================================="
  echo "$heading"
  echo "=============================================================================="
  echo ""

  local issue
  for issue in "$@"; do
    echo "$issue"
    echo "------------------------------------------------------------------------------"
    echo ""
  done
}

# Write the full report to stdout and, via tee, to $REPORT_FILE.
generate_report() {
  {
    echo "=============================================================================="
    echo "SERVER HEALTH CHECK - $(date '+%Y-%m-%d %H:%M:%S')"
    echo "=============================================================================="
    echo ""
    echo "System: $(hostname)"
    echo "Control Panel: ${SYS_CONTROL_PANEL:-none} ${SYS_CONTROL_PANEL_VERSION:-}"
    echo "OS: ${SYS_OS_TYPE:-unknown} ${SYS_OS_VERSION:-}"
    echo "Kernel: $(uname -r)"
    echo ""
    echo "SEVERITY SUMMARY:"
    echo " CRITICAL: ${#CRITICAL_ISSUES[@]} issues"
    echo " HIGH: ${#HIGH_ISSUES[@]} issues"
    echo " MEDIUM: ${#MEDIUM_ISSUES[@]} issues"
    echo " LOW: ${#LOW_ISSUES[@]} issues"
    echo ""

    # One section per severity, most severe first; empty ones are omitted.
    _hc_print_issue_section "CRITICAL ISSUES (Immediate Action Required)" "${CRITICAL_ISSUES[@]}"
    _hc_print_issue_section "HIGH ISSUES (Action Recommended)" "${HIGH_ISSUES[@]}"
    _hc_print_issue_section "MEDIUM ISSUES (Monitor Closely)" "${MEDIUM_ISSUES[@]}"
    _hc_print_issue_section "LOW ISSUES (Informational)" "${LOW_ISSUES[@]}"

    # Summary
    echo "=============================================================================="
    echo "NEXT STEPS"
    echo "=============================================================================="
    echo ""
    echo "Priority Actions:"
    echo " 1. Address all CRITICAL issues immediately"
    echo " 2. Plan fixes for HIGH issues"
    echo " 3. Monitor MEDIUM issues for trends"
    echo ""
    echo "Detailed Analysis Available:"
    echo " • Bot Analyzer (Menu → Security) for traffic/attack analysis"
    echo " • MySQL Query Analyzer (Menu → Performance) for database optimization"
    echo ""
    echo "Report saved to: $REPORT_FILE"
    echo ""
  } | tee "$REPORT_FILE"
}

################################################################################
# Save Health Baseline to Reference Database
################################################################################

# Replace the [HEALTH_BASELINE] section of $SYSREF_DB with freshly collected
# metrics, one "HEALTH|KEY|value|date" record per line. Does nothing when the
# reference database file does not exist.
save_health_baseline() {
  # Only save if reference database exists
  if [ ! -f "$SYSREF_DB" ]; then
    return
  fi

  print_info "Saving health baseline to reference database..."

  # Remove old health baseline section
  sed -i '/^\[HEALTH_BASELINE\]/,/^$/d' "$SYSREF_DB" 2>/dev/null

  # Collect current metrics
  local mem_total mem_used mem_percent=0
  read -r mem_total mem_used < <(free -m | awk '/^Mem:/ {print $2, $3}')
  # Guard the division against an empty or zero total.
  if [ "${mem_total:-0}" -gt 0 ] 2>/dev/null; then
    mem_percent=$((mem_used * 100 / mem_total))
  fi

  local cpu_load_1min
  cpu_load_1min=$(uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | xargs)

  local disk_percent
  disk_percent=$(df -h / | tail -1 | awk '{print $5}' | tr -d '%')

  local email_queue=0
  if command -v exim >/dev/null 2>&1; then
    email_queue=$(exim -bpc 2>/dev/null || echo "0")
  fi

  local httpd_status="stopped"
  if pgrep -x httpd >/dev/null || pgrep -x apache2 >/dev/null; then
    httpd_status="running"
  fi

  local mysql_status="stopped"
  if pgrep -x mysqld >/dev/null || pgrep -x mariadbd >/dev/null; then
    mysql_status="running"
  fi

  # Integer part of the I/O wait percentage; 0 when it cannot be read.
  local iowait
  iowait=$(_hc_iowait_percent | cut -d'.' -f1)
  [ -z "$iowait" ] && iowait=0

  local zombie_count
  zombie_count=$(ps aux | awk '$8 ~ /Z/' | wc -l)

  # Same precedence as analyze_firewall: CSF, firewalld, then plain iptables
  # (the iptables probe succeeds on any host where the binary works).
  local firewall_active="none"
  if [ -x "/usr/sbin/csf" ] && csf -l >/dev/null 2>&1; then
    firewall_active="csf"
  elif systemctl is-active --quiet firewalld 2>/dev/null; then
    firewall_active="firewalld"
  elif systemctl is-active --quiet iptables 2>/dev/null || iptables -L >/dev/null 2>&1; then
    firewall_active="iptables"
  fi

  local current_date current_datetime
  current_date=$(date '+%Y-%m-%d')
  current_datetime=$(date '+%Y-%m-%d %H:%M:%S')

  # Collect network metrics
  local network_interface network_mtu
  network_interface=$(ip route | grep default | awk '{print $5}' | head -1)
  network_mtu=$(ip link show "$network_interface" 2>/dev/null | grep mtu | awk '{print $5}')
  # The pipeline's status is awk's, so an "|| echo" fallback never fires;
  # default explicitly instead.
  network_mtu=${network_mtu:-unknown}

  local tcp_retrans tcp_out
  tcp_retrans=$(netstat -s 2>/dev/null | grep "segments retransmitted" | awk '{print $1}')
  tcp_retrans=$(echo "$tcp_retrans" | tr -d '\n\r' | grep -o '[0-9]*' | head -1)
  tcp_retrans=${tcp_retrans:-0}
  tcp_out=$(netstat -s 2>/dev/null | grep "segments sent out" | awk '{print $1}')
  tcp_out=$(echo "$tcp_out" | tr -d '\n\r' | grep -o '[0-9]*' | head -1)
  tcp_out=${tcp_out:-1}

  # The ratio is only computed once more than a million segments were sent.
  local tcp_retrans_percent="0"
  if [ "$tcp_out" -gt 1000000 ] 2>/dev/null; then
    tcp_retrans_percent=$(echo "scale=2; $tcp_retrans * 100 / $tcp_out" | bc 2>/dev/null || echo "0")
  fi

  local rx_errors=0
  local tx_errors=0
  local rx_dropped=0
  local tx_dropped=0
  local stats_dir="/sys/class/net/$network_interface/statistics"
  if [ -n "$network_interface" ] && [ -d "$stats_dir" ]; then
    rx_errors=$(cat "$stats_dir/rx_errors" 2>/dev/null || echo "0")
    tx_errors=$(cat "$stats_dir/tx_errors" 2>/dev/null || echo "0")
    rx_dropped=$(cat "$stats_dir/rx_dropped" 2>/dev/null || echo "0")
    tx_dropped=$(cat "$stats_dir/tx_dropped" 2>/dev/null || echo "0")
  fi

  # Collect hardware status
  local disk_smart_status="unknown"
  local disk_errors_count=0
  if command -v smartctl >/dev/null 2>&1; then
    local primary_disk
    primary_disk=$(lsblk -nd -o NAME,TYPE | awk '$2=="disk" {print "/dev/"$1; exit}')
    if [ -n "$primary_disk" ]; then
      disk_smart_status=$(smartctl -H "$primary_disk" 2>/dev/null \
        | grep "SMART overall-health" | awk '{print $NF}')
      disk_smart_status=${disk_smart_status:-unknown}
    fi
  fi
  disk_errors_count=$(dmesg 2>/dev/null | grep -ci "I/O error\|sector\|SMART\|Hardware Error")

  # Collect security metrics
  local ssh_failed_attempts=0
  local ssh_attacks_today=0
  if [ -f "/var/log/secure" ]; then
    ssh_failed_attempts=$(grep -c "Failed password" /var/log/secure 2>/dev/null)
    ssh_attacks_today=$(grep "Failed password" /var/log/secure 2>/dev/null \
      | grep -c "$(date '+%b %e')")
    ssh_failed_attempts=${ssh_failed_attempts:-0}
    ssh_attacks_today=${ssh_attacks_today:-0}
  fi

  local cphulk_status="not_installed"
  if [ -x "/usr/local/cpanel/bin/cphulk_pam_ctl" ]; then
    if /usr/local/cpanel/bin/cphulk_pam_ctl --status 2>/dev/null | grep -qi "enabled"; then
      cphulk_status="enabled"
    else
      cphulk_status="disabled"
    fi
  fi

  # Append new health baseline section
  {
    echo ""
    echo "[HEALTH_BASELINE]"
    echo "HEALTH|TIMESTAMP|$current_datetime|"

    # System resources
    echo "HEALTH|MEMORY_TOTAL_MB|$mem_total|$current_date"
    echo "HEALTH|MEMORY_USED_PERCENT|$mem_percent|$current_date"
    echo "HEALTH|CPU_LOAD_1MIN|$cpu_load_1min|$current_date"
    echo "HEALTH|CPU_CORES|$CPU_CORES|$current_date"
    echo "HEALTH|DISK_USED_PERCENT|$disk_percent|$current_date"
    echo "HEALTH|IOWAIT_PERCENT|$iowait|$current_date"

    # Services
    echo "HEALTH|EMAIL_QUEUE_SIZE|$email_queue|$current_date"
    echo "HEALTH|ZOMBIE_PROCESSES|$zombie_count|$current_date"
    echo "HEALTH|HTTPD_STATUS|$httpd_status|$current_date"
    echo "HEALTH|MYSQL_STATUS|$mysql_status|$current_date"
    echo "HEALTH|FIREWALL_STATUS|$firewall_active|$current_date"

    # Network status
    echo "HEALTH|NETWORK_INTERFACE|$network_interface|$current_date"
    echo "HEALTH|NETWORK_MTU|$network_mtu|$current_date"
    echo "HEALTH|NETWORK_RX_ERRORS|$rx_errors|$current_date"
    echo "HEALTH|NETWORK_TX_ERRORS|$tx_errors|$current_date"
    echo "HEALTH|NETWORK_RX_DROPPED|$rx_dropped|$current_date"
    echo "HEALTH|NETWORK_TX_DROPPED|$tx_dropped|$current_date"
    echo "HEALTH|TCP_RETRANS_PERCENT|$tcp_retrans_percent|$current_date"

    # Hardware status
    echo "HEALTH|DISK_SMART_STATUS|$disk_smart_status|$current_date"
    echo "HEALTH|HARDWARE_ERRORS|$disk_errors_count|$current_date"

    # Security status
    echo "HEALTH|SSH_FAILED_ATTEMPTS_TOTAL|$ssh_failed_attempts|$current_date"
    echo "HEALTH|SSH_ATTACKS_TODAY|$ssh_attacks_today|$current_date"
    echo "HEALTH|CPHULK_STATUS|$cphulk_status|$current_date"

    # Issue counts
    echo "HEALTH|CRITICAL_ISSUES|${#CRITICAL_ISSUES[@]}|$current_date"
    echo "HEALTH|HIGH_ISSUES|${#HIGH_ISSUES[@]}|$current_date"
    echo "HEALTH|MEDIUM_ISSUES|${#MEDIUM_ISSUES[@]}|$current_date"
    echo "HEALTH|LOW_ISSUES|${#LOW_ISSUES[@]}|$current_date"
    echo ""
  } >> "$SYSREF_DB"
}

################################################################################
# Display Report
################################################################################

# Show the report in a pager when issues exist, otherwise print a short
# success message. The report file is written in both cases so the
# "saved to" message below always points at an existing file.
display_report() {
  local total_issues=$((${#CRITICAL_ISSUES[@]} + ${#HIGH_ISSUES[@]} \
    + ${#MEDIUM_ISSUES[@]} + ${#LOW_ISSUES[@]}))

  if [ "$total_issues" -eq 0 ]; then
    # generate_report is the only writer of $REPORT_FILE; run it quietly.
    generate_report >/dev/null
    echo ""
    print_success "No issues detected! System is healthy."
    echo ""
  else
    generate_report | less -R
  fi

  echo ""
  print_info "Full report saved to: $REPORT_FILE"
  echo ""
  read -r -p "Press Enter to continue..."
}

################################################################################
# Main
################################################################################

run_analysis
save_health_baseline
display_report

# Cleanup (skipped if TEMP_DIR is empty so rm never receives a blank path)
if [ -n "${TEMP_DIR:-}" ]; then
  rm -rf -- "$TEMP_DIR"
fi