Files
Linux-Server-Management-Too…/modules/diagnostics/system-health-check.sh
T
cschantz a51d968185 Initial commit: Server Management Toolkit v2.0
- Complete security menu restructure (3-mode: Analysis/Actions/Live)
- Intelligent cPHulk enablement with CSF whitelist import
- Live network security monitoring dashboard
- Multi-source threat detection and classification
- 50+ organized security tools across 4-level menu hierarchy
- System health diagnostics with cPanel/WHM integration
- Reference database for cross-module intelligence sharing
2025-11-03 18:21:40 -05:00

1791 lines
70 KiB
Bash
Executable File

#!/bin/bash
################################################################################
# System Health Check - Universal Diagnostics
################################################################################
# Purpose: Comprehensive server health analysis with severity-based reporting
# Supports: cPanel, Plesk, InterWorx, CloudLinux, AlmaLinux
# Author: Server Toolkit
# Version: 1.0.0
################################################################################
# Load common functions
# SCRIPT_DIR resolves to the directory two levels above the directory that
# holds this script; both shared libraries are sourced from its lib/ folder.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "$SCRIPT_DIR/lib/common-functions.sh"
source "$SCRIPT_DIR/lib/system-detect.sh"
# Output files
# REPORT_FILE is a timestamped path under /tmp. TEMP_DIR is a fresh scratch
# directory created by mktemp; the analyzers below write their top-consumer
# listings (top_memory.txt, top_cpu.txt) into it.
REPORT_FILE="/tmp/system_health_report_$(date +%Y%m%d_%H%M%S).txt"
TEMP_DIR=$(mktemp -d /tmp/health-check-XXXXXX)
# Issue tracking
# One array per severity level; add_issue appends formatted findings to these.
declare -a CRITICAL_ISSUES=()
declare -a HIGH_ISSUES=()
declare -a MEDIUM_ISSUES=()
declare -a LOW_ISSUES=()
################################################################################
# Helper Functions
################################################################################
#######################################
# Record one finding in the array that matches its severity.
# Globals:
#   CRITICAL_ISSUES, HIGH_ISSUES, MEDIUM_ISSUES, LOW_ISSUES (appended to)
# Arguments:
#   $1 - severity: CRITICAL | HIGH | MEDIUM | LOW
#   $2 - short title
#   $3 - details (may span several lines)
#   $4 - recommendation (may span several lines)
#   $5 - numeric score, rendered as "[SEVERITY: NN%]"
# Outputs:
#   A warning on stderr when the severity is not one of the four known values.
# Returns:
#   0 when the issue was stored, 1 when the severity was not recognised.
#######################################
add_issue() {
  local severity=$1
  local title=$2
  local details=$3
  local recommendation=$4
  local score=$5
  local issue="[SEVERITY: ${score}%] ${title}
${details}
Recommendation: ${recommendation}
"
  case "$severity" in
    CRITICAL)
      CRITICAL_ISSUES+=("$issue")
      ;;
    HIGH)
      HIGH_ISSUES+=("$issue")
      ;;
    MEDIUM)
      MEDIUM_ISSUES+=("$issue")
      ;;
    LOW)
      LOW_ISSUES+=("$issue")
      ;;
    *)
      # A misspelled severity used to make the finding vanish without a trace;
      # say so loudly instead so the caller can be fixed.
      printf 'add_issue: unknown severity "%s" - issue "%s" not recorded\n' \
        "$severity" "$title" >&2
      return 1
      ;;
  esac
}
################################################################################
# Phase 1: Memory Analysis
################################################################################
#######################################
# Phase 1: report RAM pressure, swap pressure and OOM-killer activity.
# Globals:
#   TEMP_DIR (read) - the top memory consumers are written to
#                     $TEMP_DIR/top_memory.txt
# Arguments:
#   None
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
#######################################
analyze_memory() {
  print_info "Analyzing memory usage..."

  # Take a single snapshot of free(1) so every figure describes the same
  # instant, and force the C locale because the "Mem:"/"Swap:" row labels are
  # translated in other locales.
  local free_out
  free_out=$(LC_ALL=C free -m 2>/dev/null)

  local mem_total mem_used mem_available swap_total swap_used
  mem_total=$(awk '/^Mem:/ {print $2; exit}' <<<"$free_out")
  mem_used=$(awk '/^Mem:/ {print $3; exit}' <<<"$free_out")
  mem_available=$(awk '/^Mem:/ {print $7; exit}' <<<"$free_out")
  swap_total=$(awk '/^Swap:/ {print $2; exit}' <<<"$free_out")
  swap_used=$(awk '/^Swap:/ {print $3; exit}' <<<"$free_out")

  # Anything that is not a plain integer (free missing, unparsable output)
  # becomes 0. Without this, the percentage maths below divides by zero, and
  # bash then abandons the whole calling command, not just this function.
  local int_re='^[0-9]+$'
  [[ "$mem_total" =~ $int_re ]] || mem_total=0
  [[ "$mem_used" =~ $int_re ]] || mem_used=0
  [[ "$swap_total" =~ $int_re ]] || swap_total=0
  [[ "$swap_used" =~ $int_re ]] || swap_used=0

  local mem_percent=0
  local swap_percent=0
  if [ "$mem_total" -gt 0 ]; then
    mem_percent=$((mem_used * 100 / mem_total))
  fi
  if [ "$swap_total" -gt 0 ]; then
    swap_percent=$((swap_used * 100 / swap_total))
  fi

  # Check for critical memory issues
  if [ "$mem_percent" -gt 90 ]; then
    add_issue "CRITICAL" "MEMORY - Critical memory usage" \
      "Memory usage: ${mem_used}MB / ${mem_total}MB (${mem_percent}%)
Available: ${mem_available}MB" \
      "Free up memory immediately. Check top memory consumers." \
      95
  elif [ "$mem_percent" -gt 80 ]; then
    add_issue "HIGH" "MEMORY - High memory usage" \
      "Memory usage: ${mem_used}MB / ${mem_total}MB (${mem_percent}%)
Available: ${mem_available}MB" \
      "Monitor memory usage. Consider adding RAM or optimizing services." \
      75
  fi

  # Check swap usage
  if [ "$swap_used" -gt 0 ] && [ "$swap_percent" -gt 50 ]; then
    add_issue "HIGH" "MEMORY - High swap usage" \
      "Swap usage: ${swap_used}MB / ${swap_total}MB (${swap_percent}%)
System is swapping - performance degradation likely" \
      "Identify memory-heavy processes. Reduce memory usage or add RAM." \
      80
  fi

  # Check for OOM events. dmesg is read once; it may print nothing when the
  # kernel log is restricted to root, in which case the count is simply 0.
  local oom_lines
  local oom_count=0
  oom_lines=$(dmesg 2>/dev/null | grep -i "killed process\|out of memory")
  if [ -n "$oom_lines" ]; then
    oom_count=$(grep -c '' <<<"$oom_lines")
  fi
  if [ "$oom_count" -gt 0 ]; then
    local recent_oom
    recent_oom=$(grep -i "killed process" <<<"$oom_lines" | tail -3 | sed 's/^/ • /')
    # Only "out of memory" lines matched: fall back to a placeholder.
    [ -n "$recent_oom" ] || recent_oom=" • Details not available"
    add_issue "CRITICAL" "MEMORY - OOM Killer Active" \
      "OOM killer invoked ${oom_count} times since boot
Recent events:
${recent_oom}" \
      "Critical memory pressure. Reduce MySQL/Apache memory limits or add RAM.
Check: dmesg | grep -i 'killed process'" \
      95
  fi

  # Get top memory consumers
  local top_mem
  top_mem=$(ps aux --sort=-%mem | head -6 | tail -5 \
    | awk '{printf " • %-15s %6s %s\n", $1, $4"%", $11}')
  echo "$top_mem" > "$TEMP_DIR/top_memory.txt"
}
################################################################################
# Phase 1.5: Memory Configuration Analysis
################################################################################
#######################################
# Phase 1.5: review kernel memory settings (swap presence, swappiness,
# overcommit mode, transparent huge pages).
# Arguments:
#   None
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
#######################################
analyze_memory_config() {
  print_info "Analyzing memory configuration..."

  local int_re='^[0-9]+$'

  # One free(1) snapshot in the C locale (the row labels are translated in
  # other locales).
  local free_out swap_total mem_total_mb
  free_out=$(LC_ALL=C free -m 2>/dev/null)
  swap_total=$(awk '/^Swap:/ {print $2; exit}' <<<"$free_out")
  mem_total_mb=$(awk '/^Mem:/ {print $2; exit}' <<<"$free_out")

  # If the swap size cannot be read, skip the swap checks rather than claiming
  # "no swap"; the previous code printed test(1) errors here instead.
  local swap_known=1
  if ! [[ "$swap_total" =~ $int_re ]]; then
    swap_known=0
    swap_total=0
  fi
  [[ "$mem_total_mb" =~ $int_re ]] || mem_total_mb=0

  # Check if swap exists
  if [ "$swap_known" -eq 1 ] && [ "$swap_total" -eq 0 ]; then
    local mem_total_gb=$((mem_total_mb / 1024))
    if [ "$mem_total_gb" -lt 4 ]; then
      add_issue "HIGH" "MEMORY CONFIG - No swap configured" \
        "System has no swap space
Total RAM: ${mem_total_gb}GB
For servers with <4GB RAM, swap is recommended" \
        "Create swap file:
• dd if=/dev/zero of=/swapfile bs=1G count=2
• chmod 600 /swapfile
• mkswap /swapfile
• swapon /swapfile
• Add to /etc/fstab: /swapfile none swap sw 0 0" \
        78
    else
      add_issue "MEDIUM" "MEMORY CONFIG - No swap configured" \
        "System has no swap space (${mem_total_gb}GB RAM)
Swap provides safety net for memory pressure" \
        "Consider adding swap even with sufficient RAM." \
        55
    fi
  fi

  # Check swappiness value (only meaningful when swap exists)
  local swappiness
  swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null || echo "60")
  [[ "$swappiness" =~ $int_re ]] || swappiness=60
  if [ "$swappiness" -gt 30 ] && [ "$swap_total" -gt 0 ]; then
    add_issue "MEDIUM" "MEMORY CONFIG - Swappiness too high" \
      "Current swappiness: ${swappiness}
Recommended for servers: 10-30
High swappiness causes aggressive swapping, degrading performance" \
      "Reduce swappiness:
• Temporary: sysctl vm.swappiness=10
• Permanent: echo 'vm.swappiness=10' >> /etc/sysctl.conf
• Apply: sysctl -p" \
      65
  fi

  # Check the overcommit policy (the unused oom_kill_allocating_task read
  # that used to sit here has been dropped).
  local overcommit_memory
  overcommit_memory=$(cat /proc/sys/vm/overcommit_memory 2>/dev/null || echo "0")
  [[ "$overcommit_memory" =~ $int_re ]] || overcommit_memory=0
  if [ "$overcommit_memory" -eq 2 ]; then
    add_issue "LOW" "MEMORY CONFIG - Overcommit disabled" \
      "vm.overcommit_memory = 2 (disabled)
OOM killer may not activate, risking system lockup" \
      "For most servers, vm.overcommit_memory=0 (heuristic) is recommended.
Only use strict accounting (2) if you understand the implications." \
      45
  fi

  # Check for huge pages (can cause memory fragmentation). The active mode is
  # the bracketed word in the sysfs file, e.g. "[always] madvise never".
  local transparent_hugepage
  transparent_hugepage=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null \
    | grep -oP '\[\K[^\]]+' || echo "unknown")
  if [ "$transparent_hugepage" = "always" ]; then
    add_issue "LOW" "MEMORY CONFIG - Transparent Huge Pages enabled" \
      "THP can cause memory fragmentation and latency spikes
Recommended: madvise or never for database servers" \
      "Disable THP:
• echo never > /sys/kernel/mm/transparent_hugepage/enabled
• Make permanent in /etc/rc.local or systemd" \
      48
  fi
}
################################################################################
# Phase 2: Disk Analysis
################################################################################
#######################################
# Phase 2: report full filesystems, inode exhaustion and disk errors that
# show up in the kernel log.
# Arguments:
#   None
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   df is run with -P so every filesystem stays on one line even when the
#   device name is long. Rows whose usage column is not a number (for example
#   "-" for filesystems without inodes) are skipped.
#######################################
analyze_disk() {
  print_info "Analyzing disk usage..."

  local int_re='^[0-9]+$'
  local nl=$'\n'
  local filesystem size used avail percent mount
  local top_dirs problem_areas analysis
  local large_logs queue_count backup_size old_backups core_dumps

  # Check all mounted filesystems. "mount" is the last read variable so a
  # mount point containing spaces is kept whole.
  while read -r filesystem size used avail percent mount; do
    percent=${percent%\%}
    # Skip tmpfs and other virtual filesystems
    [[ "$filesystem" =~ ^(tmpfs|devtmpfs|none) ]] && continue
    [[ "$percent" =~ $int_re ]] || continue

    if [ "$percent" -gt 95 ]; then
      # Analyze what's consuming space: top level directories first
      top_dirs=$(du -sh "$mount"/* 2>/dev/null | sort -rh | head -5 | sed 's/^/ • /')

      # Common problem areas - check each
      problem_areas=""

      # Large log files
      if [ -d "$mount/var/log" ]; then
        large_logs=$(find "$mount/var/log" -type f -size +100M 2>/dev/null | head -5)
        if [ -n "$large_logs" ]; then
          problem_areas+="Large log files found:${nl}$(printf '%s\n' "$large_logs" | sed 's/^/ • /' | head -3)${nl}${nl}"
        fi
      fi

      # Email queue (common issue)
      if [ -d "$mount/var/spool/exim" ]; then
        queue_count=$(find "$mount/var/spool/exim/input" -type f 2>/dev/null | wc -l)
        if [ "$queue_count" -gt 1000 ]; then
          problem_areas+="Large email queue: ${queue_count} messages${nl}${nl}"
        fi
      fi

      # Backups in home directories
      if [ -d "$mount/home" ]; then
        backup_size=$(find "$mount/home" -type f \
          \( -name "*.tar.gz" -o -name "*.zip" -o -name "*.sql.gz" -o -name "backup-*" \) \
          -size +100M 2>/dev/null | wc -l)
        if [ "$backup_size" -gt 0 ]; then
          problem_areas+="Found ${backup_size} large backup/archive files in /home${nl}${nl}"
        fi
      fi

      # Old backups in /backup
      if [ -d "$mount/backup" ]; then
        old_backups=$(find "$mount/backup" -type f -mtime +30 -size +100M 2>/dev/null | wc -l)
        if [ "$old_backups" -gt 0 ]; then
          problem_areas+="Found ${old_backups} old backups (>30 days, >100MB) in /backup${nl}${nl}"
        fi
      fi

      # Core dumps: regular files only (directories named "core" are common
      # and are not dumps) and only on this filesystem.
      core_dumps=$(find "$mount" -maxdepth 3 -xdev -type f \
        \( -name "core.*" -o -name "core" \) 2>/dev/null | wc -l)
      if [ "$core_dumps" -gt 5 ]; then
        problem_areas+="Found ${core_dumps} core dump files${nl}${nl}"
      fi

      # Real newlines are used here (the old text embedded a literal "\n"),
      # matching every other multi-line message in this file.
      analysis="Top directories by size:${nl}${top_dirs}${nl}${nl}${problem_areas}"

      add_issue "CRITICAL" "DISK - ${mount} critically full" \
        "Filesystem: ${filesystem}
Usage: ${used} / ${size} (${percent}%)
Available: ${avail}
${analysis}" \
        "Free up space immediately:
• Review top consumers above
• Clean old logs: find /var/log -type f -mtime +30 -delete
• Clean package cache: yum clean all
• Clean email queue: exim -bp | wc -l (check queue)
• Remove old backups: find /backup -mtime +30 -delete
• Find large files: find ${mount} -type f -size +100M -exec ls -lh {} \\;" \
        98
    elif [ "$percent" -gt 85 ]; then
      # Quick analysis for high usage
      top_dirs=$(du -sh "$mount"/* 2>/dev/null | sort -rh | head -3 | sed 's/^/ • /')
      add_issue "HIGH" "DISK - ${mount} high usage" \
        "Filesystem: ${filesystem}
Usage: ${used} / ${size} (${percent}%)
Available: ${avail}
Top directories:
${top_dirs}" \
        "Investigate disk usage:
• du -sh ${mount}/* | sort -rh | head -10
• Clean up unnecessary files or expand partition
• Check logs: ls -lh /var/log/*.log
• Check email queue: exim -bpc (if using Exim)" \
        75
    elif [ "$percent" -gt 75 ]; then
      add_issue "MEDIUM" "DISK - ${mount} approaching capacity" \
        "Usage: ${used} / ${size} (${percent}%)" \
        "Monitor disk usage. Plan for expansion." \
        65
    fi
  done < <(df -hP | tail -n +2)

  # Check inode usage (columns: Filesystem Inodes IUsed IFree IUse% Mounted on)
  local inodes iused ifree
  while read -r filesystem inodes iused ifree percent mount; do
    percent=${percent%\%}
    [[ "$filesystem" =~ ^(tmpfs|devtmpfs|none) ]] && continue
    [[ "$percent" =~ $int_re ]] || continue
    if [ "$percent" -gt 90 ]; then
      add_issue "HIGH" "DISK - ${mount} inode exhaustion" \
        "Inode usage: ${percent}%
Filesystem: ${filesystem}" \
        "Find directories with many small files:
• find ${mount} -xdev -type d -exec sh -c 'echo \$(ls -A {} | wc -l) {}' \\; | sort -rn | head -10" \
        85
    fi
  done < <(df -iP | tail -n +2)

  # Check for disk errors in the kernel log (read once)
  local error_lines
  local disk_errors=0
  error_lines=$(dmesg 2>/dev/null | grep -i "I/O error\|sector\|SMART")
  if [ -n "$error_lines" ]; then
    disk_errors=$(grep -c '' <<<"$error_lines")
  fi
  if [ "$disk_errors" -gt 0 ]; then
    local error_sample
    error_sample=$(tail -3 <<<"$error_lines" | sed 's/^/ • /')
    add_issue "CRITICAL" "DISK - Hardware errors detected" \
      "Found ${disk_errors} disk error messages
Recent errors:
${error_sample}" \
      "Check SMART status: smartctl -a /dev/sda
Backup data immediately if errors persist." \
      92
  fi
}
################################################################################
# Phase 3: CPU Analysis
################################################################################
#######################################
# Phase 3: compare the load average with thresholds scaled by core count.
# Globals:
#   TEMP_DIR (read) - the top CPU consumers are written to
#                     $TEMP_DIR/top_cpu.txt
# Arguments:
#   None
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   All floating point work is done in awk. The earlier version piped to bc,
#   which is not part of minimal RHEL-family installs; when bc was missing
#   every threshold test evaluated false and no CPU issue was ever raised.
#######################################
analyze_cpu() {
  print_info "Analyzing CPU usage..."

  local num_re='^[0-9]+([.][0-9]+)?$'

  local cpu_cores
  cpu_cores=$(nproc 2>/dev/null)
  [[ "$cpu_cores" =~ ^[1-9][0-9]*$ ]] || cpu_cores=1

  # uptime is run once, in the C locale so the decimal separator is a dot
  # and the three averages can be split on ", ".
  local loads load_1min load_5min load_15min
  loads=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}')
  IFS=', ' read -r load_1min load_5min load_15min <<<"$loads"
  [[ "$load_1min" =~ $num_re ]] || load_1min=0
  [[ "$load_5min" =~ $num_re ]] || load_5min=0
  [[ "$load_15min" =~ $num_re ]] || load_15min=0

  # Calculate load per core and the healthy/warning/critical thresholds
  local load_per_core healthy_load warning_load critical_load
  load_per_core=$(LC_ALL=C awk -v l="$load_1min" -v c="$cpu_cores" 'BEGIN {printf "%.2f", l / c}')
  healthy_load=$(LC_ALL=C awk -v c="$cpu_cores" 'BEGIN {printf "%.1f", c * 0.7}')
  warning_load=$(LC_ALL=C awk -v c="$cpu_cores" 'BEGIN {printf "%.1f", c * 1.0}')
  critical_load=$(LC_ALL=C awk -v c="$cpu_cores" 'BEGIN {printf "%.1f", c * 2.0}')

  # Detect load trend (increasing, stable, decreasing) from 1 min vs 5 min
  local load_trend
  load_trend=$(LC_ALL=C awk -v a="$load_1min" -v b="$load_5min" 'BEGIN {
    a += 0; b += 0
    if (a > b * 1.2) print "increasing rapidly"
    else if (a > b) print "increasing"
    else if (a < b * 0.8) print "decreasing"
    else print "stable"
  }')

  # Classify the 1 minute load against the thresholds
  local level
  level=$(LC_ALL=C awk -v l="$load_1min" -v h="$healthy_load" \
    -v w="$warning_load" -v c="$critical_load" 'BEGIN {
    l += 0
    if (l > c + 0) print "critical"
    else if (l > w + 0) print "warning"
    else if (l > h + 0) print "elevated"
    else print "ok"
  }')

  local top_cpu
  case "$level" in
    critical)
      top_cpu=$(ps aux --sort=-%cpu | head -6 | tail -5 \
        | awk '{printf " • %-15s %6s %s\n", $1, $3"%", $11}')
      add_issue "CRITICAL" "CPU - Extreme load" \
        "Load average: ${load_1min} / ${load_5min} / ${load_15min}
CPU cores: ${cpu_cores}
Load per core: ${load_per_core}
Trend: ${load_trend}
Healthy load for this server: < ${healthy_load}
Warning threshold: ${warning_load}
Critical threshold: ${critical_load}
Top CPU consumers:
${top_cpu}" \
        "Immediate action required:
1. Identify runaway processes: ps aux --sort=-%cpu | head -20
2. Kill if necessary: kill -9 [PID]
3. Check if under attack: Main Menu → Security → Bot Analyzer" \
        92
      ;;
    warning)
      top_cpu=$(ps aux --sort=-%cpu | head -4 | tail -3 \
        | awk '{printf " • %-15s %6s %s\n", $1, $3"%", $11}')
      add_issue "HIGH" "CPU - High load" \
        "Load average: ${load_1min} / ${load_5min} / ${load_15min}
CPU cores: ${cpu_cores}
Load per core: ${load_per_core}
Trend: ${load_trend}
Healthy load: < ${healthy_load}
Current: ${load_1min} (above warning threshold of ${warning_load})
Top CPU consumers:
${top_cpu}" \
        "Monitor and optimize:
• Check: ps aux --sort=-%cpu | head -20
• Review high-CPU processes and optimize if possible" \
        76
      ;;
    elevated)
      add_issue "MEDIUM" "CPU - Elevated load" \
        "Load average: ${load_1min} / ${load_5min} / ${load_15min}
Healthy threshold: < ${healthy_load}
Trend: ${load_trend}" \
        "Monitor trends. Load is elevated but not critical yet." \
        62
      ;;
  esac

  # Get top CPU consumers
  ps aux --sort=-%cpu | head -11 | tail -10 > "$TEMP_DIR/top_cpu.txt"
}
################################################################################
# Phase 4: MySQL/MariaDB Health
################################################################################
#######################################
# Phase 4: check that MySQL/MariaDB is running, how close it is to its
# connection limit, and whether the InnoDB buffer pool leaves RAM for the OS.
# Arguments:
#   None
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   The statistics are skipped silently when the mysql client cannot connect
#   (its output is empty).
#######################################
analyze_mysql() {
  print_info "Analyzing MySQL/MariaDB..."

  # Check if MySQL is installed
  if ! command -v mysql >/dev/null 2>&1; then
    add_issue "LOW" "MYSQL - Not installed" \
      "MySQL/MariaDB not found on system" \
      "No action needed if not using databases." \
      20
    return
  fi

  # Check if the server process is running
  if ! pgrep -x "mysqld|mariadbd" >/dev/null; then
    add_issue "CRITICAL" "MYSQL - Service not running" \
      "MySQL/MariaDB process not found" \
      "Start service: systemctl start mysql / mariadb" \
      95
    return
  fi

  # Get MySQL stats (if we can connect)
  local mysql_stats
  mysql_stats=$(mysql -e "SHOW GLOBAL STATUS LIKE 'Threads_connected'; SHOW GLOBAL STATUS LIKE 'Max_used_connections'; SHOW VARIABLES LIKE 'max_connections';" 2>/dev/null)
  [ -n "$mysql_stats" ] || return 0

  local int_re='^[0-9]+$'
  local threads_connected max_used max_connections
  threads_connected=$(awk '$1 == "Threads_connected" {print $2; exit}' <<<"$mysql_stats")
  max_used=$(awk '$1 == "Max_used_connections" {print $2; exit}' <<<"$mysql_stats")
  max_connections=$(awk '$1 == "max_connections" {print $2; exit}' <<<"$mysql_stats")
  [[ "$threads_connected" =~ $int_re ]] || threads_connected=0
  [[ "$max_used" =~ $int_re ]] || max_used=0
  [[ "$max_connections" =~ $int_re ]] || max_connections=0

  # Only compute the percentage when the limit is known: dividing by an
  # empty or zero max_connections is a fatal arithmetic error that would
  # abandon the rest of the health check.
  if [ "$max_connections" -gt 0 ]; then
    local connection_percent=$((threads_connected * 100 / max_connections))
    if [ "$connection_percent" -gt 80 ]; then
      add_issue "HIGH" "MYSQL - Connection limit approaching" \
        "Current connections: ${threads_connected} / ${max_connections} (${connection_percent}%)
Max used: ${max_used}" \
        "Investigate and fix:
1. Find slow queries: Main Menu → Performance → MySQL Query Analyzer
2. Increase max_connections in /etc/my.cnf if needed
3. Check for connection leaks: show processlist;" \
        78
    fi
  fi

  # Check InnoDB buffer pool vs RAM (both truncated to whole GB)
  local innodb_buffer mem_total_mb
  innodb_buffer=$(mysql -e "SHOW VARIABLES LIKE 'innodb_buffer_pool_size';" 2>/dev/null \
    | awk '$1 == "innodb_buffer_pool_size" {print $2; exit}')
  mem_total_mb=$(LC_ALL=C free -m 2>/dev/null | awk '/^Mem:/ {print $2; exit}')
  [[ "$innodb_buffer" =~ $int_re ]] || innodb_buffer=0
  [[ "$mem_total_mb" =~ $int_re ]] || mem_total_mb=0

  local innodb_buffer_gb=$((innodb_buffer / 1024 / 1024 / 1024))
  local mem_total_gb=$((mem_total_mb / 1024))
  if [ "$innodb_buffer_gb" -gt 0 ] && [ "$innodb_buffer_gb" -gt $((mem_total_gb - 2)) ]; then
    add_issue "HIGH" "MYSQL - InnoDB buffer pool too large" \
      "innodb_buffer_pool_size: ${innodb_buffer_gb}GB
Total RAM: ${mem_total_gb}GB
System has insufficient free RAM for OS and other services" \
      "Reduce innodb_buffer_pool_size to ~${mem_total_gb}GB * 0.6 = ~$((mem_total_gb * 6 / 10))GB
Edit /etc/my.cnf and restart MySQL" \
      82
  fi
}
################################################################################
# Phase 5: Apache Health
################################################################################
#######################################
# Phase 5: check that Apache is running, scan its error log for worker-limit
# hits and segfaults, and flag an unusually high process count.
# Arguments:
#   $1 - (optional) error log to scan. When omitted, the first existing file
#        of /var/log/httpd/error_log and /var/log/apache2/error.log is used.
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   "grep -c" and "pgrep -c" print 0 AND exit non-zero when nothing matches,
#   so the old "$(cmd -c ... || echo 0)" idiom produced the two-line value
#   "0<newline>0" and the integer tests failed with "integer expression
#   expected". The counts are now captured without a fallback command and
#   validated instead.
#######################################
analyze_apache() {
  print_info "Analyzing Apache..."

  # Check if Apache is running
  if ! pgrep -x "httpd|apache2" >/dev/null; then
    add_issue "CRITICAL" "APACHE - Service not running" \
      "Apache process not found" \
      "Start service: systemctl start httpd / apache2" \
      95
    return
  fi

  local int_re='^[0-9]+$'

  # Locate the error log
  local apache_error_log=${1:-}
  local candidate
  if [ -z "$apache_error_log" ]; then
    for candidate in "/var/log/httpd/error_log" "/var/log/apache2/error.log"; do
      if [ -f "$candidate" ]; then
        apache_error_log=$candidate
        break
      fi
    done
  elif [ ! -f "$apache_error_log" ]; then
    apache_error_log=""
  fi

  if [ -n "$apache_error_log" ]; then
    # Check for MaxRequestWorkers limit hits
    local max_workers_hits
    max_workers_hits=$(grep -c "server reached MaxRequestWorkers" "$apache_error_log" 2>/dev/null)
    [[ "$max_workers_hits" =~ $int_re ]] || max_workers_hits=0
    if [ "$max_workers_hits" -gt 20 ]; then
      add_issue "CRITICAL" "APACHE - MaxRequestWorkers limit hit frequently" \
        "Server reached MaxRequestWorkers limit ${max_workers_hits} times
This causes connection refusal and 'server busy' errors" \
        "Increase MaxRequestWorkers in Apache config (if RAM allows)
OR investigate slow PHP scripts / database queries causing workers to hang
Check: apachectl -M | grep mpm" \
        88
    elif [ "$max_workers_hits" -gt 5 ]; then
      add_issue "HIGH" "APACHE - MaxRequestWorkers limit reached" \
        "Limit hit ${max_workers_hits} times" \
        "Monitor and consider increasing MaxRequestWorkers." \
        72
    fi

    # Check for segfaults
    local segfaults
    segfaults=$(grep -c "segfault" "$apache_error_log" 2>/dev/null)
    [[ "$segfaults" =~ $int_re ]] || segfaults=0
    if [ "$segfaults" -gt 0 ]; then
      add_issue "HIGH" "APACHE - Segmentation faults detected" \
        "Found ${segfaults} segfault events
May indicate corrupted modules or memory issues" \
        "Check error log: tail -100 ${apache_error_log} | grep segfault
Update/reinstall problematic Apache modules" \
        78
    fi
  fi

  # Check Apache process count
  local apache_procs
  apache_procs=$(pgrep -c "httpd|apache2" 2>/dev/null)
  [[ "$apache_procs" =~ $int_re ]] || apache_procs=0
  if [ "$apache_procs" -gt 200 ]; then
    add_issue "MEDIUM" "APACHE - High process count" \
      "Apache processes: ${apache_procs}
May indicate connection buildup or slow backend" \
      "Check Apache status: apachectl fullstatus
Review MaxRequestWorkers setting" \
      65
  fi
}
################################################################################
# Phase 6: PHP-FPM Health (cPanel/Plesk)
################################################################################
#######################################
# Phase 6: flag users that own an unusually large number of PHP-FPM pool
# workers.
# Arguments:
#   $1 - (optional) user/domain map to consult, default /etc/trueuserdomains.
#        Expected line format: "domain: user".
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   The map is keyed by domain, not by user, so the owner is matched on the
#   second field and the domain is taken from the first. The previous lookup
#   searched for "^user:" and could never return a domain; its "unknown"
#   fallback was also unreachable because cut always succeeds.
#######################################
analyze_php_fpm() {
  print_info "Analyzing PHP-FPM..."

  local domain_map=${1:-/etc/trueuserdomains}

  # Check if PHP-FPM is running
  if ! pgrep -f "php-fpm" >/dev/null; then
    add_issue "LOW" "PHP-FPM - Not running or not installed" \
      "PHP-FPM processes not found
May be using mod_php instead" \
      "No action needed if using Apache mod_php." \
      30
    return
  fi

  # Count PHP-FPM pool workers per user (ten busiest users)
  local fpm_user_counts
  fpm_user_counts=$(ps aux | grep "php-fpm: pool" | grep -v grep \
    | awk '{print $1}' | sort | uniq -c | sort -rn | head -10)

  local count user user_domain
  while read -r count user; do
    [ -n "$count" ] || continue
    [[ "$count" =~ ^[0-9]+$ ]] || continue
    [ "$count" -gt 10 ] || continue

    # Try to find which domain belongs to this user
    user_domain=""
    if [ -r "$domain_map" ]; then
      user_domain=$(awk -F': *' -v u="$user" '$2 == u {print $1; exit}' "$domain_map" 2>/dev/null)
    fi
    [ -n "$user_domain" ] || user_domain="unknown"

    add_issue "HIGH" "PHP-FPM - User '${user}' has many processes" \
      "PHP-FPM processes: ${count}
User: ${user}
Domain: ${user_domain}
Possible causes: Stuck processes, heavy traffic, slow scripts, bot attacks, database issues" \
      "Investigate with specialized tools:
1. Check for bot attacks: Main Menu → Security → Bot Analyzer (analyze ${user_domain})
2. Check for slow MySQL queries: Main Menu → Performance → MySQL Query Analyzer
3. Manual check: ps aux | grep ${user} | grep php-fpm
4. Kill if stuck: pkill -9 -u ${user} php-fpm" \
      76
  done <<< "$fpm_user_counts"
}
################################################################################
# Phase 7: Log Analysis - Security
################################################################################
#######################################
# Phase 7: scan the authentication log for SSH brute forcing, root logins and
# heavy sudo use, then look for processes started via relative paths.
# Globals:
#   SYS_CONTROL_PANEL (read) - "cpanel" selects the cPHulk advice
# Arguments:
#   $1 - (optional) authentication log to scan. When omitted, /var/log/secure
#        is used if present, otherwise /var/log/auth.log.
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   - Source addresses are taken from the word after the last "from" on the
#     line. A fixed offset from the end of the line is wrong for public-key
#     logins, whose lines carry the key type and fingerprint after "ssh2:".
#   - Root logins are matched as "Accepted <method> for root from", so user
#     names that merely contain "root" are not counted.
#   - The sudo user is the word after the "sudo:" tag; the fifth field used
#     before is the tag itself.
#######################################
analyze_security_logs() {
  print_info "Analyzing security logs..."

  local secure_log=${1:-}
  if [ -z "$secure_log" ]; then
    if [ -f "/var/log/secure" ]; then
      secure_log="/var/log/secure"
    elif [ -f "/var/log/auth.log" ]; then
      secure_log="/var/log/auth.log"
    fi
  elif [ ! -f "$secure_log" ]; then
    secure_log=""
  fi

  local int_re='^[0-9]+$'
  # awk action: print the word that follows the last "from" on the line
  local ip_after_from='{ for (i = NF - 1; i >= 1; i--) if ($i == "from") { print $(i + 1); break } }'

  if [ -n "$secure_log" ]; then
    # Check for failed SSH attempts
    local failed_ssh
    failed_ssh=$(grep -c "Failed password" "$secure_log" 2>/dev/null)
    [[ "$failed_ssh" =~ $int_re ]] || failed_ssh=0

    if [ "$failed_ssh" -gt 100 ]; then
      local top_ips
      top_ips=$(grep "Failed password" "$secure_log" 2>/dev/null | awk "$ip_after_from" \
        | sort | uniq -c | sort -rn | head -5 | sed 's/^/ • /')

      # Check if cPHulkd is available (cPanel)
      local protection_cmd=""
      if [ "${SYS_CONTROL_PANEL:-}" = "cpanel" ] && [ -x "/usr/local/cpanel/bin/cphulk_pam_ctl" ]; then
        # Check if cPHulkd is enabled
        local cphulk_enabled
        cphulk_enabled=$(/usr/local/cpanel/bin/cphulk_pam_ctl --status 2>/dev/null \
          | grep -i "enabled" || echo "disabled")
        if echo "$cphulk_enabled" | grep -qi "disabled"; then
          protection_cmd="Enable cPHulk (cPanel's brute force protection):
• Via menu: Main Menu → Security → Authentication Security → Enable cPHulk Protection
• Via command: bash modules/security/enable-cphulk.sh
• Manual enable: /usr/local/cpanel/bin/cphulk_pam_ctl --enable
• The setup wizard will automatically import your CSF whitelist to cPHulk"
        else
          protection_cmd="cPHulk is enabled. Ensure trusted IPs are whitelisted:
• Via menu: Main Menu → Security → Authentication Security → Enable cPHulk Protection
• Via command: bash modules/security/enable-cphulk.sh
• Manual whitelist: whmapi1 cphulkd_add_whitelist ip=YOUR_IP
• View blocked IPs: whmapi1 cphulkd_list_blocks"
        fi
      else
        protection_cmd="Install automatic blocking: yum install fail2ban"
      fi

      add_issue "HIGH" "SECURITY - High SSH brute force attempts" \
        "Failed SSH login attempts: ${failed_ssh}
Top attacking IPs:
${top_ips}" \
        "Investigate and block:
1. Analyze attack patterns: Main Menu → Security → Bot Analyzer
2. Block IPs manually: csf -d [IP]
3. ${protection_cmd}" \
        80
    elif [ "$failed_ssh" -gt 50 ]; then
      add_issue "MEDIUM" "SECURITY - Moderate SSH brute force" \
        "Failed attempts: ${failed_ssh}" \
        "Monitor and consider IP blocking if it increases." \
        60
    fi

    # Check for successful root logins
    local root_pattern='Accepted [^ ]+ for root from '
    local root_logins
    root_logins=$(grep -cE "$root_pattern" "$secure_log" 2>/dev/null)
    [[ "$root_logins" =~ $int_re ]] || root_logins=0
    if [ "$root_logins" -gt 0 ]; then
      local root_login_ips root_login_times
      root_login_ips=$(grep -E "$root_pattern" "$secure_log" | awk "$ip_after_from" \
        | sort -u | sed 's/^/ • /')
      root_login_times=$(grep -E "$root_pattern" "$secure_log" | awk '{print $1, $2, $3}' \
        | tail -5 | sed 's/^/ • /')
      add_issue "MEDIUM" "SECURITY - Root SSH logins detected" \
        "Successful root logins: ${root_logins}
Source IPs:
${root_login_ips}
Recent logins:
${root_login_times}" \
        "Review if these IPs are authorized:
• Disable root SSH: Set 'PermitRootLogin no' in /etc/ssh/sshd_config
• Use SSH keys instead of passwords
• Check: last | grep root" \
        68
    fi

    # Check for suspicious sudo usage
    local sudo_attempts
    sudo_attempts=$(grep -c "sudo.*COMMAND" "$secure_log" 2>/dev/null)
    [[ "$sudo_attempts" =~ $int_re ]] || sudo_attempts=0
    if [ "$sudo_attempts" -gt 100 ]; then
      local top_sudo_users
      top_sudo_users=$(grep "sudo.*COMMAND" "$secure_log" \
        | awk '{ for (i = 1; i < NF; i++) if ($i ~ /^sudo(\[[0-9]+\])?:$/) { print $(i + 1); break } }' \
        | sort | uniq -c | sort -rn | head -5 | sed 's/^/ • /')
      add_issue "MEDIUM" "SECURITY - High sudo activity" \
        "Sudo command executions: ${sudo_attempts}
Top users:
${top_sudo_users}" \
        "Review sudo logs for unusual activity:
• grep sudo /var/log/secure | tail -50" \
        65
    fi
  fi

  # Check for potential rootkit indicators: process lines containing "../"
  # (or starting with "."). Same match set as the earlier pattern, written
  # without the "\/" escapes that newer GNU grep reports as stray
  # backslashes. The process list is captured once so the count and the
  # sample describe the same snapshot.
  local sus_lines
  local suspicious_processes=0
  sus_lines=$(ps aux | grep -E '^\.|\.\./' | grep -v grep)
  if [ -n "$sus_lines" ]; then
    suspicious_processes=$(grep -c '' <<<"$sus_lines")
  fi
  if [ "$suspicious_processes" -gt 0 ]; then
    local sus_proc_list
    sus_proc_list=$(head -5 <<<"$sus_lines" | awk '{print $11}' | sed 's/^/ • /')
    add_issue "CRITICAL" "SECURITY - Suspicious processes detected" \
      "Found ${suspicious_processes} processes with suspicious paths:
${sus_proc_list}
May indicate rootkit or malware" \
      "Run rootkit scanner immediately:
• Install chkrootkit: yum install chkrootkit
• Or rkhunter: yum install rkhunter
• Check processes: ps aux | grep -E '\.\./\.\.'" \
      95
  fi
}
################################################################################
# Phase 8: System Messages Log
################################################################################
#######################################
# Phase 8: scan the system message log for kernel panics, hardware error
# reports and network link flaps.
# Arguments:
#   $1 - (optional) log file to scan, default /var/log/messages.
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   - The panic search is case-insensitive: the kernel logs the message as
#     "Kernel panic - not syncing", which a lower-case pattern misses.
#   - Hardware keywords are matched as whole words so that "ECC" does not hit
#     unrelated tokens such as "SECCOMP".
#   - Link messages are matched case-insensitively ("NIC Link is Down").
#######################################
analyze_messages_log() {
  print_info "Analyzing system messages..."

  local messages_log=${1:-/var/log/messages}
  if [ ! -f "$messages_log" ]; then
    return
  fi

  local int_re='^[0-9]+$'

  # Check for kernel panics
  local kernel_panics
  kernel_panics=$(grep -ci "kernel panic\|Oops:" "$messages_log" 2>/dev/null)
  [[ "$kernel_panics" =~ $int_re ]] || kernel_panics=0
  if [ "$kernel_panics" -gt 0 ]; then
    add_issue "CRITICAL" "SYSTEM - Kernel panics detected" \
      "Found ${kernel_panics} kernel panic events
System stability compromised" \
      "Review: grep -i 'kernel panic' ${messages_log}
Update kernel or investigate hardware issues" \
      98
  fi

  # Check for hardware errors
  local hw_pattern='Hardware Error|MCE|ECC'
  local hw_errors
  hw_errors=$(grep -cwE "$hw_pattern" "$messages_log" 2>/dev/null)
  [[ "$hw_errors" =~ $int_re ]] || hw_errors=0
  if [ "$hw_errors" -gt 0 ]; then
    # Get actual error samples
    local error_samples
    error_samples=$(grep -wE "$hw_pattern" "$messages_log" 2>/dev/null | tail -5 | sed 's/^/ /')
    add_issue "HIGH" "SYSTEM - Hardware errors detected" \
      "Found ${hw_errors} hardware error messages
May indicate failing hardware (RAM, CPU, disk)
Recent errors:
${error_samples}" \
      "Investigate hardware health:
• Check RAM: dmidecode -t memory | grep -E 'Size|Type|Speed|Error'
• Check disk SMART: smartctl -a /dev/sda | grep -E 'Error|Failed|Health'
• Check dmesg: dmesg | grep -i 'error\|fail' | tail -20
• Run memtest86+ if RAM errors detected
• Contact hosting provider if persistent" \
      85
  fi

  # Check for network issues
  local net_errors
  net_errors=$(grep -ci "link is down\|no carrier" "$messages_log" 2>/dev/null)
  [[ "$net_errors" =~ $int_re ]] || net_errors=0
  if [ "$net_errors" -gt 5 ]; then
    add_issue "MEDIUM" "NETWORK - Connection issues" \
      "Found ${net_errors} network link issues" \
      "Check network cable/switch, review: ip link show" \
      62
  fi
}
################################################################################
# Phase 9: Cron Job Health
################################################################################
#######################################
# Phase 9: count cron log lines from 24 hours ago's calendar day that look
# like failures.
# Arguments:
#   $1 - (optional) cron log to scan, default /var/log/cron.
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   Traditional syslog timestamps pad the day of month with a space
#   ("Nov  2"), which is what date's %e produces. The previous %d produced
#   "Nov 02", so nothing matched on days 1-9 of any month. The month name is
#   generated in the C locale; syslog timestamps are assumed to use English
#   month abbreviations.
#######################################
analyze_cron() {
  print_info "Analyzing cron jobs..."

  local cron_log=${1:-/var/log/cron}
  if [ ! -f "$cron_log" ]; then
    return
  fi

  # Day stamp of "24 hours ago" in syslog format
  local day_stamp
  day_stamp=$(LC_ALL=C date -d '24 hours ago' '+%b %e')

  # One pattern for both the count and the sample so they always agree
  local failure_pattern="error\|failed\|No such file"

  local day_errors
  local cron_errors=0
  day_errors=$(grep -F -- "$day_stamp" "$cron_log" 2>/dev/null | grep "$failure_pattern")
  if [ -n "$day_errors" ]; then
    cron_errors=$(grep -c '' <<<"$day_errors")
  fi

  if [ "$cron_errors" -gt 20 ]; then
    local error_sample
    error_sample=$(tail -5 <<<"$day_errors" | sed 's/^/ • /')
    add_issue "HIGH" "CRON - Many failed jobs" \
      "Found ${cron_errors} cron errors in last 24h
Sample errors:
${error_sample}" \
      "Review cron logs: tail -100 /var/log/cron
Fix failing scripts or disable problematic jobs" \
      75
  elif [ "$cron_errors" -gt 5 ]; then
    add_issue "MEDIUM" "CRON - Some failed jobs" \
      "Found ${cron_errors} cron errors in last 24h" \
      "Review: grep error /var/log/cron | tail -20" \
      60
  fi
}
################################################################################
# Phase 10: Network Analysis
################################################################################
#######################################
# Phase 10: check connection-tracking table usage, the TCP retransmission
# rate and listen-queue overflows.
# Arguments:
#   None
# Outputs:
#   Findings are recorded through add_issue; progress goes through print_info.
# Notes:
#   - netstat -s is read once. When it is unavailable its counters are
#     treated as 0 and the related checks are skipped without test(1) errors.
#   - The retransmission percentage is computed with awk rather than bc,
#     which is not part of minimal installs.
#######################################
analyze_network() {
  print_info "Analyzing network..."

  local int_re='^[0-9]+$'

  # Check nf_conntrack usage
  if [ -f "/proc/sys/net/netfilter/nf_conntrack_count" ]; then
    local conntrack_count conntrack_max
    conntrack_count=$(cat /proc/sys/net/netfilter/nf_conntrack_count 2>/dev/null || echo "0")
    conntrack_max=$(cat /proc/sys/net/netfilter/nf_conntrack_max 2>/dev/null || echo "100000")
    [[ "$conntrack_count" =~ $int_re ]] || conntrack_count=0
    [[ "$conntrack_max" =~ $int_re ]] || conntrack_max=0
    # A zero or unreadable limit would make the division below fatal.
    if [ "$conntrack_max" -gt 0 ]; then
      local conntrack_percent=$((conntrack_count * 100 / conntrack_max))
      if [ "$conntrack_percent" -gt 90 ]; then
        add_issue "CRITICAL" "NETWORK - Connection tracking table near limit" \
          "nf_conntrack usage: ${conntrack_count} / ${conntrack_max} (${conntrack_percent}%)
New connections may be dropped" \
          "Increase limit:
• sysctl -w net.netfilter.nf_conntrack_max=$((conntrack_max * 2))
• Add to /etc/sysctl.conf: net.netfilter.nf_conntrack_max=$((conntrack_max * 2))
• Or investigate connection leaks" \
          88
      elif [ "$conntrack_percent" -gt 75 ]; then
        add_issue "HIGH" "NETWORK - Connection tracking table high" \
          "nf_conntrack usage: ${conntrack_count} / ${conntrack_max} (${conntrack_percent}%)" \
          "Monitor and consider increasing limit if it continues to grow." \
          72
      fi
    fi
  fi

  # Check for TCP retransmissions
  local netstat_out tcp_retrans tcp_out
  netstat_out=$(netstat -s 2>/dev/null)
  tcp_retrans=$(awk '/segments retransmitted/ {print $1; exit}' <<<"$netstat_out")
  tcp_out=$(awk '/segments sent out/ {print $1; exit}' <<<"$netstat_out")
  [[ "$tcp_retrans" =~ $int_re ]] || tcp_retrans=0
  [[ "$tcp_out" =~ $int_re ]] || tcp_out=0

  # Only judge the rate once enough traffic has been sent to be meaningful
  if [ "$tcp_out" -gt 1000000 ]; then
    local retrans_percent
    retrans_percent=$(LC_ALL=C awk -v r="$tcp_retrans" -v o="$tcp_out" \
      'BEGIN {printf "%.2f", r * 100 / o}')
    if LC_ALL=C awk -v p="$retrans_percent" 'BEGIN {exit !(p + 0 > 5)}'; then
      # Get current MTU of the interface that carries the default route
      local default_if
      local current_mtu=""
      default_if=$(ip route 2>/dev/null | awk '/^default/ {
        for (i = 1; i < NF; i++) if ($i == "dev") { print $(i + 1); exit }
      }')
      if [ -n "$default_if" ]; then
        current_mtu=$(ip link show "$default_if" 2>/dev/null | awk '{
          for (i = 1; i < NF; i++) if ($i == "mtu") { print $(i + 1); exit }
        }')
      fi
      [ -n "$current_mtu" ] || current_mtu="unknown"

      add_issue "HIGH" "NETWORK - High TCP retransmission rate" \
        "Retransmissions: ${retrans_percent}% of total segments (${tcp_retrans} of ${tcp_out})
This indicates network issues, congestion, or bandwidth problems
Common causes:
• Network congestion or packet loss
• MTU mismatch (current: ${current_mtu})
• Bandwidth saturation
• Faulty network hardware
• ISP/hosting provider issues" \
        "Diagnose and resolve network issues:
1. Test packet loss: ping -c 100 8.8.8.8 | grep loss
2. Check MTU: ip link show | grep mtu (should be 1500 for most networks)
3. Monitor bandwidth: install vnstat for tracking (yum install vnstat)
4. Real-time monitor: iftop -i eth0 (install: yum install iftop)
5. Check provider status: Contact hosting provider if persistent
6. Analyze traffic: Main Menu → Security → Bot Analyzer (check for attacks)
NOTE: For detailed bandwidth analysis, use: Main Menu → Network Diagnostics (when available)" \
        74
    fi
  fi

  # Check listen queue overflows
  local listen_overflows
  listen_overflows=$(awk '/times the listen queue of a socket overflowed/ {print $1; exit}' \
    <<<"$netstat_out")
  [[ "$listen_overflows" =~ $int_re ]] || listen_overflows=0
  if [ "$listen_overflows" -gt 100 ]; then
    add_issue "MEDIUM" "NETWORK - Listen queue overflows detected" \
      "Listen queue overflows: ${listen_overflows}
Applications may be dropping connections" \
      "Increase net.core.somaxconn:
• sysctl -w net.core.somaxconn=4096
• Add to /etc/sysctl.conf" \
      68
  fi
}
################################################################################
# Phase 11: Time/NTP Analysis
################################################################################
analyze_time() {
    print_info "Analyzing time synchronization..."

    # Find the first running time daemon; chronyd is probed before ntpd.
    local daemon=""
    local candidate
    for candidate in chronyd ntpd; do
        if pgrep -x "$candidate" >/dev/null; then
            daemon="$candidate"
            break
        fi
    done

    # Nothing is keeping the clock in sync: report it and stop.
    if [ -z "$daemon" ]; then
        add_issue "MEDIUM" "TIME - No NTP service running" \
"Neither chronyd nor ntpd is running
Time drift can cause SSL certificate errors and authentication issues" \
"Install and start NTP service:
• AlmaLinux/RHEL: yum install chrony && systemctl enable --now chronyd
• Check sync: chronyc tracking" \
            66
        return
    fi

    # The offset is only inspected for chronyd; ntpd gets no further checks.
    [ "$daemon" = "chronyd" ] || return 0

    # Fields 4 and 5 of the "System time" line of `chronyc tracking` are used
    # as the offset value and its unit.
    local tracking_line
    tracking_line=$(chronyc tracking 2>/dev/null | grep "System time")

    local drift_text drift_abs
    drift_text=$(printf '%s' "$tracking_line" | awk '{print $4, $5}')
    drift_abs=$(printf '%s' "$tracking_line" | awk '{print $4}')
    # Strip a leading minus so the comparison works on the magnitude
    drift_abs=${drift_abs#-}

    # bc prints 1 when the offset exceeds one second; fall back to 0 if bc fails.
    local exceeds
    exceeds=$(echo "$drift_abs > 1" | bc -l 2>/dev/null || echo "0")
    if (( exceeds )); then
        add_issue "HIGH" "TIME - Clock offset detected" \
"Time offset: ${drift_text}
Significant time drift detected" \
"Force time sync:
• chronyc -a makestep
• Check sources: chronyc sources" \
            75
    fi
}
################################################################################
# Phase 12: System Updates & Kernel
################################################################################
analyze_updates() {
    print_info "Analyzing system updates..."

    # Compare the running kernel with the most recently installed kernel package.
    local running_kernel
    running_kernel=$(uname -r)

    # Only trust rpm's answer when rpm exists AND the query succeeds. On failure
    # rpm prints "package kernel is not installed" on stdout, which must not be
    # mistaken for a kernel version.
    local installed_kernel=""
    local kernel_pkgs
    if command -v rpm >/dev/null 2>&1 && kernel_pkgs=$(rpm -q kernel --last 2>/dev/null); then
        installed_kernel=$(printf '%s\n' "$kernel_pkgs" | awk 'NR == 1 {print $1}' | sed 's/^kernel-//')
    fi
    # No usable answer: assume the running kernel is current (no false alarm).
    [ -n "$installed_kernel" ] || installed_kernel="$running_kernel"

    if [ "$running_kernel" != "$installed_kernel" ]; then
        add_issue "MEDIUM" "SYSTEM - Reboot required" \
"Running kernel: ${running_kernel}
Installed kernel: ${installed_kernel}
System needs reboot to use new kernel" \
"Schedule maintenance window and reboot:
• Check uptime: uptime
• Reboot when ready: reboot" \
            64
    fi

    # Check for available security updates (if yum/dnf)
    if command -v yum >/dev/null 2>&1; then
        # Count lines that start with an advisory ID. The vendor prefix varies
        # (RHSA, ALSA, CLSA, RLSA, ELSA, CESA, FEDORA, FEDORA-EPEL, ...), so match
        # the generic "<PREFIX>-<YEAR>:" / "<PREFIX>-<YEAR>-" shape.
        local security_updates
        security_updates=$(yum updateinfo list security 2>/dev/null \
            | grep -cE '^[A-Z]+(-[A-Z]+)*-[0-9]{4}[:-]')
        [[ "$security_updates" =~ ^[0-9]+$ ]] || security_updates=0

        if [ "$security_updates" -gt 10 ]; then
            add_issue "HIGH" "SYSTEM - Many security updates available" \
"Security updates available: ${security_updates}
System may be vulnerable" \
"Apply security updates:
• yum update --security
• Or schedule full update: yum update" \
                76
        elif [ "$security_updates" -gt 0 ]; then
            add_issue "MEDIUM" "SYSTEM - Security updates available" \
"Security updates: ${security_updates}" \
"Review and apply: yum updateinfo list security" \
                58
        fi
    fi

    # Check for cPanel updates (if cPanel)
    if [ -f "/usr/local/cpanel/version" ]; then
        local cpanel_version
        cpanel_version=$(cat /usr/local/cpanel/version)
        # Note: We can't easily check if update is available without WHM API
        # Just record the version
        echo "cPanel version: $cpanel_version" >> "$TEMP_DIR/system_info.txt"
    fi
}
################################################################################
# Phase 13: File Limits & Descriptors
################################################################################
analyze_file_limits() {
    print_info "Analyzing file descriptor limits..."

    # System-wide usage: the first and third numbers of /proc/sys/fs/file-nr are
    # treated as "in use" and "maximum"; a fixed fallback is used if it is unreadable.
    local fd_stats
    fd_stats=$(cat /proc/sys/fs/file-nr 2>/dev/null || echo "0 0 100000")

    local fds_open fds_limit
    fds_open=$(awk '{print $1}' <<<"$fd_stats")
    fds_limit=$(awk '{print $3}' <<<"$fd_stats")
    local usage_pct=$(( fds_open * 100 / fds_limit ))

    if (( usage_pct > 80 )); then
        # The recommendation suggests doubling the current maximum
        local doubled=$(( fds_limit * 2 ))
        add_issue "HIGH" "FILE DESCRIPTORS - System limit approaching" \
"Open file descriptors: ${fds_open} / ${fds_limit} (${usage_pct}%)
Applications may fail to open files/sockets" \
"Increase limit:
• sysctl -w fs.file-max=${doubled}
• Add to /etc/sysctl.conf: fs.file-max=${doubled}
• Check per-process limits: ulimit -n" \
            78
    fi

    # Per-service check: count lsof output lines for the first matching PID
    for service in httpd mysqld php-fpm; do
        pgrep -x "$service" >/dev/null || continue

        local first_pid open_count
        first_pid=$(pgrep -x "$service" | head -1)
        open_count=$(lsof -p $first_pid 2>/dev/null | wc -l || echo "0")

        if [ "$open_count" -gt 10000 ]; then
            add_issue "MEDIUM" "$service - High file descriptor usage" \
"Service: $service
Open file descriptors: ${open_count}
May indicate connection/file leaks" \
"Investigate $service:
• lsof -p \$(pgrep -x $service | head -1) | head -100
• Check for file/connection leaks" \
                65
        fi
    done
}
################################################################################
# Phase 14: Email Queue Analysis
################################################################################
analyze_email_queue() {
    print_info "Analyzing email queue..."

    local queue_count=0
    local mail_system=""

    # Probe MTAs in order: Exim, then Postfix, then anything providing mailq.
    if command -v exim >/dev/null 2>&1; then
        mail_system="Exim"
        queue_count=$(exim -bpc 2>/dev/null)
    elif command -v postqueue >/dev/null 2>&1; then
        mail_system="Postfix"
        # Fifth field of the last line of `postqueue -p` is taken as the count
        queue_count=$(postqueue -p 2>/dev/null | tail -1 | awk '{print $5}')
    elif command -v mailq >/dev/null 2>&1; then
        mail_system="Sendmail"
        # grep -c already prints 0 (and exits 1) when nothing matches. Do NOT add
        # "|| echo 0" here: that would produce "0<newline>0" and break the
        # integer comparisons below.
        queue_count=$(mailq 2>/dev/null | grep -c "^[A-Z]")
    fi

    # Anything that is not a plain non-negative integer (empty output, error
    # text from the MTA, ...) is treated as an empty queue.
    [[ "$queue_count" =~ ^[0-9]+$ ]] || queue_count=0

    if [ "$queue_count" -gt 5000 ]; then
        # Get sample of queued messages (only implemented for Exim)
        local queue_sample=""
        if [ "$mail_system" = "Exim" ]; then
            queue_sample=$(exim -bp 2>/dev/null | head -20 | sed 's/^/ /')
        fi
        add_issue "CRITICAL" "EMAIL - Massive mail queue" \
"Mail system: ${mail_system}
Queue size: ${queue_count} messages
This can consume disk space and cause slow mail delivery
Sample:
${queue_sample}" \
"Investigate and clear queue:
• Check for spam/compromised accounts
• Review: exim -bp | head -50 (for Exim)
• Clear specific messages: exim -Mrm [message-id]
• Force delivery attempts: exim -qff
• Check for frozen messages: exim -bp | grep frozen" \
            92
    elif [ "$queue_count" -gt 1000 ]; then
        add_issue "HIGH" "EMAIL - Large mail queue" \
"Mail system: ${mail_system}
Queue size: ${queue_count} messages" \
"Review mail queue:
• exim -bp | less (for Exim)
• mailq | less (for Postfix/Sendmail)
• Check for spam/compromised accounts
• Review /var/log/exim_mainlog for errors" \
            78
    elif [ "$queue_count" -gt 100 ]; then
        add_issue "MEDIUM" "EMAIL - Growing mail queue" \
"Queue size: ${queue_count} messages (${mail_system})" \
"Monitor queue. May indicate delivery issues or spam." \
            58
    fi
}
################################################################################
# Phase 15: I/O Wait Analysis
################################################################################
analyze_iowait() {
    print_info "Analyzing disk I/O performance..."

    # Extract the number immediately preceding "wa" on top's CPU summary line.
    # Matching by label instead of field position keeps this working when top
    # fuses columns (e.g. "id,100.0 wa," when a value reaches 100.0) and with
    # the older "7.3%wa," layout. LC_ALL=C forces "." as the decimal separator.
    local iowait
    iowait=$(LC_ALL=C top -bn1 2>/dev/null | grep "Cpu(s)" | head -1 \
        | sed -n 's/.*[^0-9.]\([0-9][0-9.]*\)[ %]*wa.*/\1/p')

    if [ -z "$iowait" ]; then
        # Try alternative method with iostat if available
        if command -v iostat >/dev/null 2>&1; then
            # Take column 4 of the LAST numeric row, i.e. the second sample;
            # the output ends with a blank line, so a plain `tail -1` is empty.
            iowait=$(LC_ALL=C iostat -c 1 2 2>/dev/null \
                | awk '$1 ~ /^[0-9.]+$/ {value = $4} END {print value}')
        else
            return 0
        fi
    fi

    # Integer part only, for the shell's integer comparisons
    local iowait_int=${iowait%%.*}
    [[ "$iowait_int" =~ ^[0-9]+$ ]] || iowait_int=0

    if [ "$iowait_int" -gt 30 ]; then
        add_issue "CRITICAL" "DISK I/O - Extremely high I/O wait" \
"Current I/O wait: ${iowait}%
System is waiting on disk operations - extreme performance impact" \
"Identify I/O-heavy processes:
• iotop -o (show only active I/O)
• iostat -x 1 (detailed disk stats)
• Check for failing drives: dmesg | grep -i error
• Check disk health: smartctl -a /dev/sda" \
            95
    elif [ "$iowait_int" -gt 15 ]; then
        add_issue "HIGH" "DISK I/O - High I/O wait" \
"I/O wait: ${iowait}%
System performance degraded by disk operations" \
"Monitor disk I/O:
• iostat -x 1 5 (watch for 5 seconds)
• iotop (if installed: yum install iotop)
• Check for large file operations
• Review disk usage and fragmentation" \
            76
    elif [ "$iowait_int" -gt 5 ]; then
        add_issue "MEDIUM" "DISK I/O - Elevated I/O wait" \
"I/O wait: ${iowait}%" \
"Monitor disk performance. May indicate heavy disk activity." \
            55
    fi
}
################################################################################
# Phase 16: SELinux Analysis
################################################################################
# Arguments: $1 - (optional) audit log to scan; defaults to /var/log/audit/audit.log
analyze_selinux() {
    local audit_log="${1:-/var/log/audit/audit.log}"

    print_info "Analyzing SELinux status..."

    # Nothing to do on systems without SELinux tooling
    if ! command -v getenforce >/dev/null 2>&1; then
        return 0
    fi

    local selinux_status
    selinux_status=$(getenforce 2>/dev/null || echo "Disabled")
    # Denials are only counted while SELinux is actually enforcing
    [ "$selinux_status" = "Enforcing" ] || return 0

    # Audit records are stamped as "msg=audit(<epoch>.<ms>:<serial>)", not with a
    # syslog-style "Mon DD" date, so today's entries are selected by comparing
    # that epoch value with local midnight.
    local todays_denials=""
    if [ -r "$audit_log" ]; then
        local midnight
        midnight=$(date -d 'today 00:00' +%s 2>/dev/null)
        # If date cannot compute midnight, fall back to counting every denial
        [[ "$midnight" =~ ^[0-9]+$ ]] || midnight=0

        todays_denials=$(awk -v since="$midnight" '
            /denied/ && match($0, /audit\([0-9]+/) {
                # RSTART points at "audit(" (6 characters); the digits follow it
                if (substr($0, RSTART + 6, RLENGTH - 6) + 0 >= since) print
            }' "$audit_log" 2>/dev/null)
    fi

    local denials_count=0
    if [ -n "$todays_denials" ]; then
        denials_count=$(printf '%s\n' "$todays_denials" | grep -c '')
    fi

    if [ "$denials_count" -gt 50 ]; then
        # Show the five most recent denials, indented by one space
        local denial_sample
        denial_sample=$(printf '%s\n' "$todays_denials" | tail -5 | sed 's/^/ /')
        add_issue "HIGH" "SELINUX - Many denials detected" \
"SELinux denials today: ${denials_count}
SELinux is blocking operations - may cause application failures
Recent denials:
${denial_sample}" \
"Review and fix SELinux policies:
• ausearch -m avc -ts today
• audit2allow -a (generate policy)
• audit2why -a (explain denials)
• Temporarily: setenforce 0 (NOT recommended for production)" \
            82
    elif [ "$denials_count" -gt 10 ]; then
        add_issue "MEDIUM" "SELINUX - Some denials detected" \
"SELinux denials today: ${denials_count}" \
"Review: ausearch -m avc -ts today" \
            62
    fi
}
################################################################################
# Phase 17: Control Panel Services
################################################################################
analyze_control_panel_services() {
    print_info "Analyzing control panel services..."

    local panel_issues=""
    local service
    # Real newline used to join entries. A literal "\n" inside a double-quoted
    # string is never expanded and would show up verbatim in the report.
    local nl=$'\n'

    # NOTE(review): generate_report reads SYS_CONTROL_PANEL while this function
    # reads SYS_PANEL - confirm which variable system-detect.sh actually sets.

    # Check cPanel services
    if [ "$SYS_PANEL" = "cpanel" ]; then
        # Key cPanel services to check
        local cpanel_services=("cpanel" "whostmgrd" "cpsrvd" "tailwatchd" "dnsadmin")
        for service in "${cpanel_services[@]}"; do
            if ! systemctl is-active --quiet "$service" 2>/dev/null; then
                panel_issues="${panel_issues:+${panel_issues}${nl}}${service} is not running"
            fi
        done

        if [ -n "$panel_issues" ]; then
            add_issue "HIGH" "CPANEL - Critical services down" \
"The following cPanel services are not running:
${panel_issues}
This affects control panel functionality" \
"Restart services:
• systemctl restart cpanel
• /scripts/restartsrv_cpanel
• Check logs: tail -50 /usr/local/cpanel/logs/error_log" \
                85
        fi
    # Check Plesk services
    elif [ "$SYS_PANEL" = "plesk" ]; then
        if ! systemctl is-active --quiet psa 2>/dev/null; then
            add_issue "HIGH" "PLESK - Panel service down" \
"Plesk service is not running" \
"Restart Plesk: systemctl restart psa" \
                85
        fi
    fi

    # Check web server (either process name counts as running)
    if ! pgrep -x httpd >/dev/null && ! pgrep -x apache2 >/dev/null; then
        add_issue "CRITICAL" "WEB SERVER - Apache/httpd not running" \
"Web server process not found
All websites are down" \
"Start web server:
• systemctl restart httpd (CentOS/AlmaLinux)
• systemctl restart apache2 (Debian/Ubuntu)
• Check logs: tail -50 /var/log/httpd/error_log" \
            98
    fi

    # Check database server (either process name counts as running)
    if ! pgrep -x mysqld >/dev/null && ! pgrep -x mariadbd >/dev/null; then
        add_issue "CRITICAL" "DATABASE - MySQL/MariaDB not running" \
"Database server process not found
Database-driven sites are down" \
"Start database:
• systemctl restart mariadb
• systemctl restart mysql
• Check logs: tail -50 /var/log/mariadb/mariadb.log
• Check disk space in /var/lib/mysql" \
            98
    fi
}
################################################################################
# Phase 18: DNS Resolution Check
################################################################################
analyze_dns() {
    print_info "Analyzing DNS resolution..."

    # Test resolving a few critical domains (the IP entry triggers a reverse lookup)
    local test_domains=("google.com" "cloudflare.com" "8.8.8.8")
    local failed_count=0
    local slow_count=0
    local failed_domains=""
    # Real newline used to join entries; a literal "\n" would be printed verbatim
    local nl=$'\n'

    # Pick a lookup tool. Without this, a server that merely lacks the `host`
    # binary would have every lookup "fail" and raise a false CRITICAL.
    local -a resolver
    if command -v host >/dev/null 2>&1; then
        resolver=(host)
    elif command -v getent >/dev/null 2>&1; then
        resolver=(getent hosts)
    else
        # No way to test resolution here; stay silent instead of guessing
        return 0
    fi

    local domain start_time end_time duration_ms
    for domain in "${test_domains[@]}"; do
        start_time=$(date +%s%N)
        if ! "${resolver[@]}" "$domain" >/dev/null 2>&1; then
            failed_count=$((failed_count + 1))
            failed_domains="${failed_domains:+${failed_domains}${nl}}${domain}"
        else
            end_time=$(date +%s%N)
            # Nanoseconds -> milliseconds
            duration_ms=$(( (end_time - start_time) / 1000000 ))
            if [ "$duration_ms" -gt 2000 ]; then
                slow_count=$((slow_count + 1))
            fi
        fi
    done

    if [ "$failed_count" -gt 0 ]; then
        add_issue "CRITICAL" "DNS - Resolution failures detected" \
"Failed to resolve ${failed_count} test domains:
${failed_domains}
DNS issues cause slow loading and failures across all services" \
"Check DNS configuration:
• cat /etc/resolv.conf
• Test: dig google.com
• Try alternate DNS: echo 'nameserver 8.8.8.8' >> /etc/resolv.conf
• Restart networking: systemctl restart network" \
            94
    elif [ "$slow_count" -gt 1 ]; then
        # Only reported when at least two lookups were slow
        add_issue "HIGH" "DNS - Slow resolution detected" \
"DNS queries taking >2 seconds
This slows down all network operations" \
"Check DNS servers:
• cat /etc/resolv.conf
• Consider faster DNS: 8.8.8.8, 1.1.1.1
• Test: dig @8.8.8.8 google.com" \
            72
    fi
}
################################################################################
# Phase 19: Zombie Process Check
################################################################################
analyze_zombie_processes() {
    print_info "Analyzing zombie processes..."

    # A process counts as a zombie when column 8 of `ps aux` contains "Z"
    local defunct_total
    defunct_total=$(ps aux | awk '$8 ~ /Z/ {n++} END {print n + 0}')

    if (( defunct_total > 50 )); then
        # List the first five zombies with their command (column 11) and PID (column 2)
        local defunct_preview
        defunct_preview=$(ps aux | awk '$8 ~ /Z/ && ++shown <= 5 {print " • " $11 " (PID " $2 ", PPID via pstree)"}')
        add_issue "HIGH" "PROCESSES - Many zombie processes detected" \
"Zombie (defunct) processes: ${defunct_total}
Indicates parent processes not properly cleaning up children
Sample zombies:
${defunct_preview}" \
"Investigate parent processes:
• ps aux | awk '\$8 ~ /Z/'
• pstree -p | grep defunct
• Kill parent process or reboot if persistent
• Common causes: Apache, PHP-FPM, custom scripts" \
            78
        return
    fi

    if (( defunct_total > 10 )); then
        add_issue "MEDIUM" "PROCESSES - Zombie processes detected" \
"Zombie processes: ${defunct_total}
May indicate stuck parent processes" \
"Review: ps aux | awk '\$8 ~ /Z/'" \
            58
    fi
}
################################################################################
# Phase 20: Firewall Status Check
################################################################################
analyze_firewall() {
    print_info "Analyzing firewall status..."

    # Set to 1 as soon as one firewall layer is found; later probes are skipped
    local firewall_active=0

    # Check for CSF (ConfigServer Security & Firewall)
    if [ -x "/usr/sbin/csf" ] && csf -l >/dev/null 2>&1; then
        firewall_active=1

        # Check if CSF is in testing mode
        if grep -q "TESTING = \"1\"" /etc/csf/csf.conf 2>/dev/null; then
            add_issue "MEDIUM" "FIREWALL - CSF in testing mode" \
"CSF firewall is in TESTING mode
Blocks will auto-expire - not suitable for production" \
"Disable testing mode:
• Edit /etc/csf/csf.conf
• Set TESTING = \"0\"
• Restart: csf -r" \
                62
        fi

        # Check for high deny count (might indicate attack).
        # `csf -d` is the "deny this IP" command and lists nothing without an
        # argument, so count the active (non-comment, non-blank) entries of the
        # permanent deny list instead.
        local deny_count=0
        if [ -r /etc/csf/csf.deny ]; then
            deny_count=$(grep -cvE '^[[:space:]]*(#|$)' /etc/csf/csf.deny 2>/dev/null)
        fi
        [[ "$deny_count" =~ ^[0-9]+$ ]] || deny_count=0

        if [ "$deny_count" -gt 1000 ]; then
            add_issue "MEDIUM" "FIREWALL - Many blocked IPs" \
"CSF has ${deny_count} denied IPs
Server may be under attack or CSF may need tuning" \
"Review blocked IPs:
• less /etc/csf/csf.deny
• Check for false positives
• Consider: Main Menu → Security → Bot Analyzer" \
                65
        fi
    fi

    # Check firewalld BEFORE raw iptables: `iptables -L` succeeds on almost any
    # host, so probing it first would hide a running firewalld.
    if [ "$firewall_active" -eq 0 ] && systemctl is-active --quiet firewalld 2>/dev/null; then
        firewall_active=1
    fi

    # Check iptables
    if [ "$firewall_active" -eq 0 ]; then
        if systemctl is-active --quiet iptables 2>/dev/null || iptables -L -n >/dev/null 2>&1; then
            firewall_active=1

            # Count real rules ("-A ..." lines of `iptables -S`). Counting the
            # "Chain"/"target" header lines of `iptables -L` always yields at
            # least 6, even for a completely empty ruleset.
            local rule_count
            rule_count=$(iptables -S 2>/dev/null | grep -c '^-A')
            [[ "$rule_count" =~ ^[0-9]+$ ]] || rule_count=0

            if [ "$rule_count" -lt 5 ]; then
                add_issue "MEDIUM" "FIREWALL - iptables active but minimal rules" \
"iptables is running but has very few rules
Server may not be properly protected" \
"Review firewall rules: iptables -L -n -v" \
                    55
            fi
        fi
    fi

    # Warn if no firewall detected
    if [ "$firewall_active" -eq 0 ]; then
        add_issue "HIGH" "FIREWALL - No active firewall detected" \
"No firewall found (CSF, iptables, firewalld)
Server is exposed to attacks" \
"Install and configure a firewall:
• CSF (recommended for cPanel):
cd /usr/src && wget https://download.configserver.com/csf.tgz
tar -xzf csf.tgz && cd csf && sh install.sh
• Or enable firewalld: systemctl enable --now firewalld" \
            82
    fi
}
################################################################################
# Phase 21: Network Connectivity Check
################################################################################
analyze_network_connectivity() {
    print_info "Analyzing network connectivity..."

    # Probe each public resolver IP with a single ping (2 second wait)
    local test_ips=("8.8.8.8" "1.1.1.1")
    local unreachable=0
    for ip in "${test_ips[@]}"; do
        ping -c 1 -W 2 "$ip" >/dev/null 2>&1 || unreachable=$((unreachable + 1))
    done

    if (( unreachable == ${#test_ips[@]} )); then
        # Every probe failed
        add_issue "CRITICAL" "NETWORK - No internet connectivity" \
"Cannot reach external IPs (${test_ips[*]})
Server has no internet access - critical services affected" \
"Check network:
• ip link show (check interfaces)
• ip route show (check routing)
• systemctl status network
• Check gateway: ping gateway IP
• Contact hosting provider if persistent" \
            96
    elif (( unreachable > 0 )); then
        # Some, but not all, probes failed
        add_issue "HIGH" "NETWORK - Intermittent connectivity issues" \
"Some external IPs unreachable
Network may be unstable" \
"Check network stability:
• ping -c 10 8.8.8.8
• mtr 8.8.8.8 (install: yum install mtr)
• Check: ip route show" \
            74
    fi

    # HTTPS probe with a 5 second ceiling; success ends the check
    if timeout 5 curl -s -o /dev/null https://google.com 2>/dev/null; then
        return 0
    fi

    add_issue "MEDIUM" "NETWORK - HTTP/HTTPS connectivity issues" \
"Cannot establish HTTPS connections
May affect updates, let's encrypt, external API calls" \
"Test connectivity:
• curl -v https://google.com
• Check firewall rules
• Check proxy settings: echo \$http_proxy" \
        68
}
################################################################################
# Phase 22: CloudLinux Specific Checks
################################################################################
analyze_cloudlinux() {
    # Only relevant when the platform was flagged as CloudLinux
    [ "$SYS_CLOUDLINUX" = "yes" ] || return 0

    print_info "Analyzing CloudLinux LVE limits..."

    # The check depends on the lvectl tool
    command -v lvectl >/dev/null 2>&1 || return 0

    # Keep at most ten rows of lvectl's fault listing; no output means no finding
    local fault_rows
    fault_rows=$(lvectl list --by-fault 2>/dev/null | head -10)
    [ -n "$fault_rows" ] || return 0

    # Report the first five rows as a bulleted list
    local bulleted
    bulleted=$(head -5 <<<"$fault_rows" | sed 's/^/ • /')

    add_issue "HIGH" "CLOUDLINUX - Users hitting LVE limits" \
"Top users hitting resource limits:
${bulleted}
This causes 503 errors and slow websites" \
"Review limits: lvectl list
Increase limits for affected users or optimize their sites:
• lvectl set [USER] --cpu=200 --pmem=2G" \
        78
}
################################################################################
# Main Analysis Function
################################################################################
run_analysis() {
    clear
    print_banner "System Health Check"
    echo ""
    print_info "Starting comprehensive system analysis..."
    echo ""

    # Analysis phases, executed strictly in this order
    local _health_phases=(
        analyze_memory
        analyze_memory_config
        analyze_disk
        analyze_cpu
        analyze_mysql
        analyze_apache
        analyze_php_fpm
        analyze_security_logs
        analyze_messages_log
        analyze_cron
        analyze_network
        analyze_time
        analyze_updates
        analyze_file_limits
        analyze_email_queue
        analyze_iowait
        analyze_selinux
        analyze_control_panel_services
        analyze_dns
        analyze_zombie_processes
        analyze_firewall
        analyze_network_connectivity
        analyze_cloudlinux
    )

    local _health_phase
    for _health_phase in "${_health_phases[@]}"; do
        "$_health_phase"
    done

    print_success "Analysis complete!"
    echo ""
}
################################################################################
# Report Generation
################################################################################
generate_report() {
    # Separator lines used throughout the report
    local heavy_rule="=============================================================================="
    local light_rule="------------------------------------------------------------------------------"

    {
        # Header: host facts and per-severity totals
        cat <<EOF
$heavy_rule
SERVER HEALTH CHECK - $(date '+%Y-%m-%d %H:%M:%S')
$heavy_rule

System: $(hostname)
Control Panel: ${SYS_CONTROL_PANEL:-none} ${SYS_CONTROL_PANEL_VERSION:-}
OS: ${SYS_OS_TYPE:-unknown} ${SYS_OS_VERSION:-}
Kernel: $(uname -r)

SEVERITY SUMMARY:
 CRITICAL: ${#CRITICAL_ISSUES[@]} issues
 HIGH: ${#HIGH_ISSUES[@]} issues
 MEDIUM: ${#MEDIUM_ISSUES[@]} issues
 LOW: ${#LOW_ISSUES[@]} issues

EOF

        # One section per severity, highest first; empty severities are omitted.
        # Each entry is "<array name>|<section heading>".
        local section array_ref heading issue
        local -a entries
        for section in \
            "CRITICAL_ISSUES|CRITICAL ISSUES (Immediate Action Required)" \
            "HIGH_ISSUES|HIGH ISSUES (Action Recommended)" \
            "MEDIUM_ISSUES|MEDIUM ISSUES (Monitor Closely)" \
            "LOW_ISSUES|LOW ISSUES (Informational)"; do
            array_ref="${section%%|*}[@]"
            heading=${section#*|}
            # Indirect expansion copies the named array's elements
            entries=("${!array_ref}")
            [ ${#entries[@]} -gt 0 ] || continue

            printf '%s\n' "$heavy_rule" "$heading" "$heavy_rule" ""
            for issue in "${entries[@]}"; do
                echo "$issue"
                printf '%s\n' "$light_rule" ""
            done
        done

        # Closing guidance
        cat <<EOF
$heavy_rule
NEXT STEPS
$heavy_rule

Priority Actions:
 1. Address all CRITICAL issues immediately
 2. Plan fixes for HIGH issues
 3. Monitor MEDIUM issues for trends

Detailed Analysis Available:
 • Bot Analyzer (Menu → Security) for traffic/attack analysis
 • MySQL Query Analyzer (Menu → Performance) for database optimization

Report saved to: $REPORT_FILE

EOF
    } | tee "$REPORT_FILE"    # shown on stdout and persisted at the same time
}
################################################################################
# Save Health Baseline to Reference Database
################################################################################
# Print $1 when it is a plain non-negative integer, otherwise print the fallback $2.
_baseline_int() {
    case "$1" in
        ''|*[!0-9]*) printf '%s\n' "$2" ;;
        *) printf '%s\n' "$1" ;;
    esac
}

# Replace the [HEALTH_BASELINE] section of $SYSREF_DB with freshly collected
# metrics. Does nothing when the reference database file does not exist.
# Globals: SYSREF_DB, CPU_CORES and the four *_ISSUES arrays (all read)
save_health_baseline() {
    # Only save if reference database exists
    if [ -z "${SYSREF_DB:-}" ] || [ ! -f "$SYSREF_DB" ]; then
        return 0
    fi

    print_info "Saving health baseline to reference database..."

    # Remove old health baseline section (header through the next blank line).
    # Blank lines are held back and only re-emitted when more content follows,
    # so the separator written in front of the section on each run cannot pile
    # up at the end of the file.
    local scratch
    if scratch=$(mktemp "${TMPDIR:-/tmp}/sysref-baseline-XXXXXX" 2>/dev/null); then
        if awk '
            skipping { if ($0 == "") skipping = 0; next }
            /^\[HEALTH_BASELINE\]/ { skipping = 1; next }
            $0 == "" { held++; next }
            {
                while (held > 0) { print ""; held-- }
                print
            }
        ' "$SYSREF_DB" > "$scratch" 2>/dev/null; then
            # Rewrite in place so ownership and permissions stay untouched
            cat "$scratch" > "$SYSREF_DB"
        fi
        rm -f -- "$scratch"
    else
        sed -i '/^\[HEALTH_BASELINE\]/,/^$/d' "$SYSREF_DB" 2>/dev/null
    fi

    # --- System resources -------------------------------------------------
    local mem_total mem_used mem_percent=0
    read -r mem_total mem_used < <(free -m 2>/dev/null | awk '/^Mem:/ {print $2, $3}')
    mem_total=$(_baseline_int "$mem_total" 0)
    mem_used=$(_baseline_int "$mem_used" 0)
    # Guard against division by zero when free gave no usable numbers
    if [ "$mem_total" -gt 0 ]; then
        mem_percent=$((mem_used * 100 / mem_total))
    fi

    # First value after "load average:", with surrounding whitespace removed
    local cpu_load_1min
    cpu_load_1min=$(uptime 2>/dev/null \
        | awk -F'load average:' '{split($2, part, ","); gsub(/[[:space:]]/, "", part[1]); print part[1]}')

    # -P keeps each filesystem on one line even with long device names
    local disk_percent
    disk_percent=$(df -P / 2>/dev/null | awk 'NR == 2 {gsub(/%/, "", $5); print $5}')

    # Number right before "wa" on top's CPU line, integer part only. Matching by
    # label survives fused columns such as "id,100.0 wa,".
    local iowait
    iowait=$(LC_ALL=C top -bn1 2>/dev/null | grep "Cpu(s)" | head -1 \
        | sed -n 's/.*[^0-9.]\([0-9][0-9.]*\)[ %]*wa.*/\1/p')
    iowait=$(_baseline_int "${iowait%%.*}" 0)

    # --- Services ---------------------------------------------------------
    local email_queue=0
    if command -v exim >/dev/null 2>&1; then
        email_queue=$(_baseline_int "$(exim -bpc 2>/dev/null)" 0)
    fi

    local httpd_status="stopped"
    if pgrep -x httpd >/dev/null || pgrep -x apache2 >/dev/null; then
        httpd_status="running"
    fi

    local mysql_status="stopped"
    if pgrep -x mysqld >/dev/null || pgrep -x mariadbd >/dev/null; then
        mysql_status="running"
    fi

    # Processes whose `ps aux` state column (8) contains "Z"
    local zombie_count
    zombie_count=$(ps aux 2>/dev/null | awk '$8 ~ /Z/ {n++} END {print n + 0}')

    local firewall_active="none"
    if [ -x "/usr/sbin/csf" ] && csf -l >/dev/null 2>&1; then
        firewall_active="csf"
    elif systemctl is-active --quiet iptables 2>/dev/null || iptables -L >/dev/null 2>&1; then
        firewall_active="iptables"
    elif systemctl is-active --quiet firewalld 2>/dev/null; then
        firewall_active="firewalld"
    fi

    local current_date current_datetime
    current_date=$(date '+%Y-%m-%d')
    current_datetime=$(date '+%Y-%m-%d %H:%M:%S')

    # --- Network ----------------------------------------------------------
    # Interface of the default route: the token following "dev"
    local network_interface
    network_interface=$(ip route 2>/dev/null \
        | awk '/^default/ { for (i = 1; i < NF; i++) if ($i == "dev") { print $(i + 1); exit } }')

    local network_mtu="unknown"
    if [ -n "$network_interface" ]; then
        network_mtu=$(ip link show "$network_interface" 2>/dev/null \
            | awk '{ for (i = 1; i < NF; i++) if ($i == "mtu") { print $(i + 1); exit } }')
        [ -n "$network_mtu" ] || network_mtu="unknown"
    fi

    # netstat may be absent (net-tools is optional); empty values become the
    # defaults 0 / 1 so the integer test below cannot error out.
    local netstat_out tcp_retrans tcp_out
    netstat_out=$(netstat -s 2>/dev/null)
    tcp_retrans=$(printf '%s\n' "$netstat_out" | awk '/segments retransmit/ {print $1; exit}')
    tcp_out=$(printf '%s\n' "$netstat_out" | awk '/segments sent out/ {print $1; exit}')
    tcp_retrans=$(_baseline_int "$tcp_retrans" 0)
    tcp_out=$(_baseline_int "$tcp_out" 1)

    local tcp_retrans_percent="0"
    # Only computed once more than a million segments have been sent
    if [ "$tcp_out" -gt 1000000 ]; then
        tcp_retrans_percent=$(echo "scale=2; $tcp_retrans * 100 / $tcp_out" | bc 2>/dev/null || echo "0")
        [ -n "$tcp_retrans_percent" ] || tcp_retrans_percent="0"
    fi

    local rx_errors=0
    local tx_errors=0
    local rx_dropped=0
    local tx_dropped=0
    local stats_dir="/sys/class/net/$network_interface/statistics"
    if [ -n "$network_interface" ] && [ -d "$stats_dir" ]; then
        rx_errors=$(_baseline_int "$(cat "$stats_dir/rx_errors" 2>/dev/null)" 0)
        tx_errors=$(_baseline_int "$(cat "$stats_dir/tx_errors" 2>/dev/null)" 0)
        rx_dropped=$(_baseline_int "$(cat "$stats_dir/rx_dropped" 2>/dev/null)" 0)
        tx_dropped=$(_baseline_int "$(cat "$stats_dir/tx_dropped" 2>/dev/null)" 0)
    fi

    # --- Hardware ---------------------------------------------------------
    local disk_smart_status="unknown"
    if command -v smartctl >/dev/null 2>&1; then
        # First device of type "disk" reported by lsblk
        local primary_disk
        primary_disk=$(lsblk -nd -o NAME,TYPE 2>/dev/null | awk '$2=="disk" {print "/dev/"$1; exit}')
        if [ -n "$primary_disk" ]; then
            disk_smart_status=$(smartctl -H "$primary_disk" 2>/dev/null \
                | grep "SMART overall-health" | awk '{print $NF}')
            [ -n "$disk_smart_status" ] || disk_smart_status="unknown"
        fi
    fi

    local disk_errors_count
    disk_errors_count=$(dmesg 2>/dev/null | grep -ci "I/O error\|sector\|SMART\|Hardware Error")
    disk_errors_count=$(_baseline_int "$disk_errors_count" 0)

    # --- Security ---------------------------------------------------------
    local ssh_failed_attempts=0
    local ssh_attacks_today=0
    if [ -f "/var/log/secure" ]; then
        ssh_failed_attempts=$(grep -c "Failed password" /var/log/secure 2>/dev/null)
        ssh_attacks_today=$(grep "Failed password" /var/log/secure 2>/dev/null \
            | grep -c "$(date '+%b %e')")
        ssh_failed_attempts=$(_baseline_int "$ssh_failed_attempts" 0)
        ssh_attacks_today=$(_baseline_int "$ssh_attacks_today" 0)
    fi

    local cphulk_status="not_installed"
    if [ -x "/usr/local/cpanel/bin/cphulk_pam_ctl" ]; then
        if /usr/local/cpanel/bin/cphulk_pam_ctl --status 2>/dev/null | grep -qi "enabled"; then
            cphulk_status="enabled"
        else
            cphulk_status="disabled"
        fi
    fi

    # Append new health baseline section (blank line before and after)
    {
        echo ""
        echo "[HEALTH_BASELINE]"
        echo "HEALTH|TIMESTAMP|$current_datetime|"
        # System resources
        echo "HEALTH|MEMORY_TOTAL_MB|$mem_total|$current_date"
        echo "HEALTH|MEMORY_USED_PERCENT|$mem_percent|$current_date"
        echo "HEALTH|CPU_LOAD_1MIN|$cpu_load_1min|$current_date"
        echo "HEALTH|CPU_CORES|$CPU_CORES|$current_date"
        echo "HEALTH|DISK_USED_PERCENT|$disk_percent|$current_date"
        echo "HEALTH|IOWAIT_PERCENT|$iowait|$current_date"
        # Services
        echo "HEALTH|EMAIL_QUEUE_SIZE|$email_queue|$current_date"
        echo "HEALTH|ZOMBIE_PROCESSES|$zombie_count|$current_date"
        echo "HEALTH|HTTPD_STATUS|$httpd_status|$current_date"
        echo "HEALTH|MYSQL_STATUS|$mysql_status|$current_date"
        echo "HEALTH|FIREWALL_STATUS|$firewall_active|$current_date"
        # Network status
        echo "HEALTH|NETWORK_INTERFACE|$network_interface|$current_date"
        echo "HEALTH|NETWORK_MTU|$network_mtu|$current_date"
        echo "HEALTH|NETWORK_RX_ERRORS|$rx_errors|$current_date"
        echo "HEALTH|NETWORK_TX_ERRORS|$tx_errors|$current_date"
        echo "HEALTH|NETWORK_RX_DROPPED|$rx_dropped|$current_date"
        echo "HEALTH|NETWORK_TX_DROPPED|$tx_dropped|$current_date"
        echo "HEALTH|TCP_RETRANS_PERCENT|$tcp_retrans_percent|$current_date"
        # Hardware status
        echo "HEALTH|DISK_SMART_STATUS|$disk_smart_status|$current_date"
        echo "HEALTH|HARDWARE_ERRORS|$disk_errors_count|$current_date"
        # Security status
        echo "HEALTH|SSH_FAILED_ATTEMPTS_TOTAL|$ssh_failed_attempts|$current_date"
        echo "HEALTH|SSH_ATTACKS_TODAY|$ssh_attacks_today|$current_date"
        echo "HEALTH|CPHULK_STATUS|$cphulk_status|$current_date"
        # Issue counts
        echo "HEALTH|CRITICAL_ISSUES|${#CRITICAL_ISSUES[@]}|$current_date"
        echo "HEALTH|HIGH_ISSUES|${#HIGH_ISSUES[@]}|$current_date"
        echo "HEALTH|MEDIUM_ISSUES|${#MEDIUM_ISSUES[@]}|$current_date"
        echo "HEALTH|LOW_ISSUES|${#LOW_ISSUES[@]}|$current_date"
        echo ""
    } >> "$SYSREF_DB"
}
################################################################################
# Display Report
################################################################################
display_report() {
    if [ ${#CRITICAL_ISSUES[@]} -eq 0 ] && [ ${#HIGH_ISSUES[@]} -eq 0 ] && [ ${#MEDIUM_ISSUES[@]} -eq 0 ] && [ ${#LOW_ISSUES[@]} -eq 0 ]; then
        # Still write the report file (without showing it) so the
        # "Full report saved to" line below refers to a file that exists.
        generate_report >/dev/null
        echo ""
        print_success "No issues detected! System is healthy."
        echo ""
    elif command -v less >/dev/null 2>&1; then
        generate_report | less -R
    else
        # No pager available: print the report directly instead of losing it
        generate_report
    fi

    echo ""
    print_info "Full report saved to: $REPORT_FILE"
    echo ""
    # Pause until the user presses Enter
    read -r -p "Press Enter to continue..."
}
################################################################################
# Main
################################################################################
# Run the three stages in order: collect findings, persist the baseline to the
# reference database, then present the report to the user.
for _health_stage in run_analysis save_health_baseline display_report; do
    "$_health_stage"
done

# Cleanup: remove the scratch directory created at startup
rm -rf "$TEMP_DIR"