From 0a8cb302df2830f10e9a2f04905986a14f081f4c Mon Sep 17 00:00:00 2001 From: cschantz Date: Thu, 20 Nov 2025 20:35:16 -0500 Subject: [PATCH] Add Loadwatch Health Analyzer for system monitoring analysis NEW FEATURE: Loadwatch Health Analyzer - Comprehensive system health analysis from loadwatch monitoring logs - Time-range analysis: 1h, 6h, 24h, 7d, 30d options - Intelligent problem detection and trending CAPABILITIES: - Memory pressure detection (low available memory, high swap usage) - CPU saturation analysis (idle %, iowait, steal time) - Load average trending and threshold detection - Process issue detection (zombie processes, high CPU/MEM consumers) - MySQL performance monitoring (slow queries, thread counts) - Network connection analysis - Historical trending across snapshots (3-minute intervals) IMPLEMENTATION: - modules/diagnostics/loadwatch-analyzer.sh - Main analyzer script - Handles symlinked loadwatch directories - Parses 7 log sections: alerts, summary, memory, CPU, tasks, MySQL, network - Generates detailed reports with actionable recommendations - Saves reports to tmp/ directory for review INTEGRATION: - Added to Performance & Diagnostics menu (option 10) - Time range selection submenu for user-friendly access - Updated README.md with feature documentation and usage examples ANALYSIS FEATURES: - Swap threshold alerts (>= 50% usage) - CPU saturation detection (< 10% idle) - High I/O wait warnings (> 20%) - Zombie process tracking - Memory availability trending (avg/min/max) - Top CPU consumers aggregated across period Perfect for: - Post-incident investigation - Capacity planning - Performance trending - System health monitoring - Identifying resource bottlenecks Works with servers that have loadwatch monitoring enabled (logs in /root/loadwatch or /var/log/loadwatch) --- README.md | 32 +- launcher.sh | 40 +- modules/diagnostics/loadwatch-analyzer.sh | 666 ++++++++++++++++++++++ 3 files changed, 731 insertions(+), 7 deletions(-) create mode 100755 
modules/diagnostics/loadwatch-analyzer.sh diff --git a/README.md b/README.md index ac50c7a..1c6e2ca 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ server-toolkit/ │ ├── modules/ # Modular scripts organized by category │ │ +│ ├── diagnostics/ # 🔍 System Diagnostics +│ │ ├── system-health-check.sh # Comprehensive health analysis +│ │ └── loadwatch-analyzer.sh # System health from loadwatch monitoring logs +│ │ │ ├── security/ # 🛡️ Security & Threat Analysis │ │ ├── bot-analyzer.sh # Full bot/threat analysis │ │ ├── live-attack-monitor.sh # Real-time attack monitoring dashboard @@ -42,13 +46,15 @@ server-toolkit/ │ │ ├── website-error-analyzer.sh # Comprehensive website error analysis │ │ └── 500-error-tracker.sh # Track and analyze 500 errors │ │ -│ ├── diagnostics/ # 🔍 System Diagnostics -│ │ └── system-health-check.sh # Comprehensive health analysis +│ ├── diagnostics/ # 🔍 System Diagnostics & Log Analysis +│ │ ├── system-health-check.sh # Comprehensive health analysis +│ │ └── loadwatch-analyzer.sh # System health monitoring from loadwatch logs │ │ │ ├── performance/ # 📊 Performance Analysis │ │ ├── hardware-health-check.sh # Hardware diagnostics │ │ ├── mysql-query-analyzer.sh # MySQL performance analysis -│ │ └── network-bandwidth-analyzer.sh # Network analysis +│ │ ├── network-bandwidth-analyzer.sh # Network analysis +│ │ └── (other performance modules) │ │ │ └── maintenance/ # 🧹 System Maintenance │ └── cleanup-toolkit-data.sh # Clean temporary toolkit data @@ -110,10 +116,17 @@ source /root/server-toolkit/run.sh - **Log Integration**: Apache, PHP-FPM, cPanel error log analysis - **Smart Recommendations**: Context-aware suggestions for fixing issues -### 🔍 System Diagnostics +### 🔍 System Diagnostics & Performance Monitoring - **Comprehensive Health Checks**: Hardware, services, security posture +- **Loadwatch Health Analyzer**: Historical system health analysis from monitoring logs + - Time-range analysis: 1h, 6h, 24h, 7d, 30d + - Memory pressure 
detection and swap usage trending
+  - CPU saturation analysis (idle, iowait, steal time)
+  - Process issue detection (zombies, high CPU/MEM consumers)
+  - MySQL performance monitoring
+  - Actionable recommendations based on findings
 - **Smart Recommendations**: Context-aware suggestions based on findings
-- **cPanel/WHM Integration**: Native support for cPanel environments
+- **Multi-Panel Support**: cPanel, InterWorx, Plesk, standalone Apache
 
 ### 📊 Session Intelligence
 - **Reference Database**: Cross-module data sharing (.sysref)
@@ -168,6 +181,15 @@ bash launcher.sh
 # Select: System Health Check
 ```
 
+### Loadwatch System Health Analysis
+
+```bash
+bash launcher.sh
+# Select: Performance & Diagnostics
+# Select: Loadwatch Health Analyzer
+# Choose time range: 1h, 6h, 24h, 7d, or 30d
+```
+
 ## 🔧 Configuration
 
 Edit the configuration file:
diff --git a/launcher.sh b/launcher.sh
index 58cfa1a..58d403d 100755
--- a/launcher.sh
+++ b/launcher.sh
@@ -539,7 +539,8 @@ show_performance_menu() {
     echo ""
     echo -e "${BOLD}Logs & Diagnostics:${NC}"
     echo -e " ${MAGENTA}9)${NC} Log Analyzer - Parse and analyze system logs"
-    echo -e " ${MAGENTA}10)${NC} Email Queue Monitor - Mail queue analysis"
+    echo -e " ${MAGENTA}10)${NC} Loadwatch Health Analyzer - System health from monitoring logs"
+    echo -e " ${MAGENTA}11)${NC} Email Queue Monitor - Mail queue analysis"
     echo ""
     echo -e " ${RED}0)${NC} Back to Main Menu"
     echo ""
@@ -1346,6 +1347,40 @@ handle_wp_security_menu() {
     done
 }
 
+# Loadwatch analyzer handler: shows the time-range submenu, reads one choice and runs diagnostics/loadwatch-analyzer.sh with the matching -r value
+handle_loadwatch_analyzer() {
+    show_banner
+    echo -e "${MAGENTA}${BOLD}📊 Loadwatch Health Analyzer${NC}"
+    echo ""
+    echo -e "Select time range for analysis:"
+    echo ""
+    echo -e " ${CYAN}1)${NC} Last 1 Hour - Recent system activity"
+    echo -e " ${CYAN}2)${NC} Last 6 Hours - Mid-term trending"
+    echo -e " ${CYAN}3)${NC} Last 24 Hours - Full day analysis"
+    echo -e " ${CYAN}4)${NC} Last 7 Days - Weekly patterns"
+    echo -e " ${CYAN}5)${NC} Last 30 Days - Monthly overview"
+    echo ""
+    echo -e " ${RED}0)${NC} Back"
+    echo ""
+    echo -e "${CYAN}──────────────────────────────────────────────────────────────${NC}"
+    echo -n "Select time range: "
+
+    read -r range_choice
+
+    case $range_choice in
+        1) run_module "diagnostics" "loadwatch-analyzer.sh" "-r" "1h" ;;
+        2) run_module "diagnostics" "loadwatch-analyzer.sh" "-r" "6h" ;;
+        3) run_module "diagnostics" "loadwatch-analyzer.sh" "-r" "24h" ;;
+        4) run_module "diagnostics" "loadwatch-analyzer.sh" "-r" "7d" ;;
+        5) run_module "diagnostics" "loadwatch-analyzer.sh" "-r" "30d" ;;
+        0) return ;;
+        *)
+            echo -e "${RED}Invalid option${NC}"
+            sleep 1
+            ;;
+    esac
+}
+
 # Performance submenu handler
 handle_performance_menu() {
     while true; do
@@ -1362,7 +1397,8 @@ handle_performance_menu() {
             7) run_module "performance" "apache-performance.sh" ;;
             8) run_module "performance" "php-fpm-monitor.sh" ;;
             9) run_module "performance" "log-analyzer.sh" ;;
-            10) run_module "performance" "email-queue-monitor.sh" ;;
+            10) handle_loadwatch_analyzer ;;
+            11) run_module "performance" "email-queue-monitor.sh" ;;
             0) return ;;
             *) echo -e "${RED}Invalid option${NC}"; sleep 1 ;;
         esac
diff --git a/modules/diagnostics/loadwatch-analyzer.sh b/modules/diagnostics/loadwatch-analyzer.sh
new file mode 100755
index 0000000..f842e2d
--- /dev/null
+++ b/modules/diagnostics/loadwatch-analyzer.sh
@@ -0,0 +1,666 @@
+#!/bin/bash
+
+#############################################################################
+# Loadwatch Log Analyzer
+# Version: 1.0
+# Comprehensive system health analysis from loadwatch monitoring logs
+#
+# Features:
+# - Time-range analysis (1h, 6h, 24h, 7d, 30d, or any <N>h / <N>d)
+# - Memory pressure detection (low available memory, high swap usage)
+# - CPU saturation analysis (idle, iowait, steal time)
+# - Load average trending
+# - Process issue detection (zombies, high CPU/MEM consumers)
+# - MySQL performance monitoring
+# - Network connection analysis (this section is not parsed by this version)
+# - Historical trending and pattern detection
+# - Actionable recommendations based on findings
+#
+# Usage: loadwatch-analyzer.sh [-r RANGE] [-d DIRECTORY] [-o OUTPUT_FILE]
+#   RANGE      <N>h or <N>d; "day", "week" and "month" are aliases for
+#              24h, 7d and 30d. Default: 24h.
+#   DIRECTORY  Loadwatch log directory. Default: /root/loadwatch, falling
+#              back to /var/log/loadwatch when the former does not exist.
+#   OUTPUT     Report file. Default: a timestamped file under <toolkit>/tmp.
+#############################################################################
+
+# Load libraries
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+if [[ -r "$SCRIPT_DIR/lib/common-functions.sh" ]]; then
+    source "$SCRIPT_DIR/lib/common-functions.sh"
+else
+    # Keep going: every print_* helper used below has a local fallback.
+    echo "Warning: $SCRIPT_DIR/lib/common-functions.sh not found; using built-in output helpers" >&2
+fi
+
+# Configuration
+LOADWATCH_DIR="/root/loadwatch"
+FALLBACK_LOADWATCH_DIR="/var/log/loadwatch"
+LOADWATCH_DIR_EXPLICIT=0    # becomes 1 when -d/--directory is given
+TOOLKIT_TMP_DIR="$SCRIPT_DIR/tmp"
+mkdir -p "$TOOLKIT_TMP_DIR" 2>/dev/null
+
+TEMP_DIR="$TOOLKIT_TMP_DIR/loadwatch_analysis_$$"
+OUTPUT_FILE="$TOOLKIT_TMP_DIR/loadwatch_report_$(date +%Y%m%d_%H%M%S).txt"
+TIME_RANGE=""    # <N>h, <N>d, day, week or month; defaults to 24h below
+
+# Abort when an option that takes a value is the last word on the command
+# line. Without this check "shift 2" fails, consumes nothing, and the
+# parsing loop below never terminates.
+# $1 = option name, $2 = number of remaining arguments (including the option)
+require_value() {
+    if [[ "$2" -lt 2 ]]; then
+        echo "Option $1 requires an argument"
+        echo "Use -h or --help for usage information"
+        exit 1
+    fi
+}
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -r|--range)
+            require_value "$1" "$#"
+            TIME_RANGE="$2"
+            shift 2
+            ;;
+        -d|--directory)
+            require_value "$1" "$#"
+            LOADWATCH_DIR="$2"
+            LOADWATCH_DIR_EXPLICIT=1
+            shift 2
+            ;;
+        -o|--output)
+            require_value "$1" "$#"
+            OUTPUT_FILE="$2"
+            shift 2
+            ;;
+        -h|--help)
+            echo "Loadwatch Log Analyzer v1.0"
+            echo ""
+            echo "Usage: $0 [-r RANGE] [-d DIRECTORY] [-o OUTPUT_FILE]"
+            echo ""
+            echo "Options:"
+            echo " -r, --range RANGE Time range: <N>h or <N>d, e.g. 1h, 6h, 24h, 7d, 30d (default: 24h)"
+            echo " -d, --directory DIR Loadwatch log directory (default: /root/loadwatch, then /var/log/loadwatch)"
+            echo " -o, --output FILE Custom output file path"
+            echo " -h, --help Show this help message"
+            echo ""
+            echo "Examples:"
+            echo " $0 -r 1h # Analyze last hour"
+            echo " $0 -r 6h # Analyze last 6 hours"
+            echo " $0 -r 24h # Analyze last 24 hours"
+            echo " $0 -r 7d # Analyze last 7 days"
+            echo " $0 -r 30d # Analyze last 30 days"
+            echo ""
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use -h or --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# Set default time range if not specified
+TIME_RANGE="${TIME_RANGE:-24h}"
+
+# Without an explicit -d, use the fallback location when only it exists.
+if [[ "$LOADWATCH_DIR_EXPLICIT" -eq 0 && ! -d "$LOADWATCH_DIR" && -d "$FALLBACK_LOADWATCH_DIR" ]]; then
+    LOADWATCH_DIR="$FALLBACK_LOADWATCH_DIR"
+fi
+
+# Cleanup function: removes the per-run scratch directory on every exit path
+cleanup() {
+    rm -rf -- "${TEMP_DIR:?}" 2>/dev/null
+}
+trap cleanup EXIT
+
+# Color codes
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+BOLD='\033[1m'
+NC='\033[0m'
+
+# Helper functions for output: keep whatever common-functions.sh provided and
+# define a plain fallback for each helper that is still missing.
+type print_status >/dev/null 2>&1 || print_status() { echo -e "${BLUE}[STATUS]${NC} $1"; }
+type print_substatus >/dev/null 2>&1 || print_substatus() { echo -e " ${BLUE}→${NC} $1"; }
+type print_header >/dev/null 2>&1 || print_header() { echo -e "\n${BOLD}$1${NC}\n"; }
+type print_info >/dev/null 2>&1 || print_info() { echo -e "${CYAN}[INFO]${NC} $1"; }
+type print_success >/dev/null 2>&1 || print_success() { echo -e "${GREEN}[OK]${NC} $1"; }
+type print_error >/dev/null 2>&1 || print_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }
+
+# Create temp directory (and the directory of a custom -o path, best effort)
+if ! mkdir -p "$TEMP_DIR"; then
+    print_error "Cannot create temporary directory: $TEMP_DIR"
+    exit 1
+fi
+mkdir -p "$(dirname "$OUTPUT_FILE")" 2>/dev/null
+
+# Start analysis
+print_header "LOADWATCH SYSTEM HEALTH ANALYZER v1.0"
+print_info "Analysis Time: $(date '+%Y-%m-%d %H:%M:%S')"
+print_info "Time Range: $TIME_RANGE"
+print_info "Log Directory: $LOADWATCH_DIR"
+
+# Check if loadwatch directory exists
+if [ ! -d "$LOADWATCH_DIR" ]; then
+    print_error "Loadwatch directory not found: $LOADWATCH_DIR"
+    print_info "This server may not have loadwatch monitoring enabled"
+    exit 1
+fi
+
+# Count available logs (handle symlinked directories). Only the top level is
+# counted because only top-level *.txt files are collected for analysis.
+TOTAL_LOGS=$(find -L "$LOADWATCH_DIR" -maxdepth 1 -type f -name "*.txt" 2>/dev/null | wc -l)
+if [ "$TOTAL_LOGS" -eq 0 ]; then
+    print_error "No loadwatch logs found in $LOADWATCH_DIR"
+    exit 1
+fi
+print_info "Total logs available: $TOTAL_LOGS"
+
+# Print the log-name timestamp (YYYY-MM-DD.HH.MM) for COUNT units ago.
+# Tries GNU date first, then the BSD "-v" form.
+# $1 = count, $2 = GNU date unit ("hours"/"days"), $3 = BSD -v unit letter
+cutoff_stamp() {
+    date -d "$1 $2 ago" '+%Y-%m-%d.%H.%M' 2>/dev/null ||
+        date -v-"$1$3" '+%Y-%m-%d.%H.%M' 2>/dev/null
+}
+
+# Calculate time cutoff based on range
+case "$TIME_RANGE" in
+    day)   RANGE_SPEC="24h" ;;
+    week)  RANGE_SPEC="7d" ;;
+    month) RANGE_SPEC="30d" ;;
+    *)     RANGE_SPEC="$TIME_RANGE" ;;
+esac
+
+CUTOFF_TIME=""
+RANGE_DESC=""
+if [[ "$RANGE_SPEC" =~ ^([0-9]+)([hd])$ ]]; then
+    RANGE_COUNT=$((10#${BASH_REMATCH[1]}))    # 10# keeps "08h" from being read as octal
+    RANGE_UNIT="${BASH_REMATCH[2]}"
+    if [ "$RANGE_COUNT" -gt 0 ]; then
+        if [ "$RANGE_UNIT" = "h" ]; then
+            CUTOFF_TIME=$(cutoff_stamp "$RANGE_COUNT" hours H)
+            RANGE_DESC="Last $RANGE_COUNT Hour"
+        else
+            CUTOFF_TIME=$(cutoff_stamp "$RANGE_COUNT" days d)
+            RANGE_DESC="Last $RANGE_COUNT Day"
+        fi
+        if [ "$RANGE_COUNT" -ne 1 ]; then
+            RANGE_DESC="${RANGE_DESC}s"
+        fi
+    fi
+fi
+
+if [ -z "$RANGE_DESC" ]; then
+    print_error "Invalid time range: $TIME_RANGE"
+    echo "Valid options: <N>h or <N>d (e.g. 1h, 6h, 24h, 7d, 30d)"
+    exit 1
+fi
+if [ -z "$CUTOFF_TIME" ]; then
+    print_error "Unable to compute the cutoff time for range: $TIME_RANGE"
+    exit 1
+fi
+
+print_info "Analyzing: $RANGE_DESC"
+print_info "Cutoff time: $CUTOFF_TIME"
+
+# Find logs within time range
+print_status "Collecting logs in time range..."
+: > "$TEMP_DIR/logfiles.txt"
+
+# Log names are expected to start with a YYYY-MM-DD.HH.MM timestamp. Other
+# *.txt files are skipped: letters sort after digits, so such names would
+# otherwise always compare as newer than the cutoff.
+STAMP_GLOB='[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].[0-9][0-9].[0-9][0-9]*'
+
+for logfile in "$LOADWATCH_DIR"/*.txt; do
+    [ -f "$logfile" ] || continue
+    stamp=${logfile##*/}
+    stamp=${stamp%.txt}
+    [[ "$stamp" == $STAMP_GLOB ]] || continue
+
+    # Compare timestamps (simple string comparison works with YYYY-MM-DD.HH.MM format)
+    if [[ ! "$stamp" < "$CUTOFF_TIME" ]]; then
+        echo "$logfile" >> "$TEMP_DIR/logfiles.txt"
+    fi
+done
+
+ANALYZED_LOGS=$(wc -l < "$TEMP_DIR/logfiles.txt")
+if [ "$ANALYZED_LOGS" -eq 0 ]; then
+    print_error "No logs found in specified time range"
+    exit 1
+fi
+print_success "Found $ANALYZED_LOGS logs in range"
+
+#############################################################################
+# PHASE 1: Parse all logs and extract metrics
+#############################################################################
+
+print_status "Phase 1/4: Parsing logs and extracting metrics..."
+
+: > "$TEMP_DIR/metrics.txt"
+: > "$TEMP_DIR/alerts.txt"
+: > "$TEMP_DIR/top_processes.txt"
+
+# Print the word that directly precedes the word LABEL in LINE, treating
+# commas as separators (so "0.0 ni,100.0 id" still yields 100.0 for "id").
+# Looking values up by label instead of by position keeps the parser correct
+# when a column is missing or two columns run together.
+# $1 = line, $2 = label. Prints nothing and returns 1 when LABEL is absent.
+value_before_label() {
+    local -a words
+    local i
+    read -ra words <<< "${1//,/ }"
+    for ((i = 1; i < ${#words[@]}; i++)); do
+        if [[ "${words[i]}" == "$2" ]]; then
+            printf '%s\n' "${words[i - 1]}"
+            return 0
+        fi
+    done
+    return 1
+}
+
+# Succeeds when $1 is a plain decimal integer greater than zero.
+is_positive_int() {
+    [[ "$1" =~ ^[0-9]+$ ]] && [ "$1" -gt 0 ]
+}
+
+# Patterns for values that are easier to capture than to split.
+LOAD_RE='load average: ([0-9.]+), ([0-9.]+), ([0-9.]+)'
+HTTPD_RE='httpd\[([^]]*)\]'
+SLOW_RE='Slow queries: +([0-9]+)'
+QPS_RE='Queries per second avg: +([0-9.]+)'
+
+LOG_COUNT=0
+SWAP_ALERT_COUNT=0
+PREV_MYSQL_SLOW=""    # slow-query counter of the previous snapshot, if any
+
+while IFS= read -r logfile; do
+    LOG_COUNT=$((LOG_COUNT + 1))
+    TIMESTAMP=${logfile##*/}
+    TIMESTAMP=${TIMESTAMP%.txt}
+
+    # Start every snapshot from a clean slate so that a section missing from
+    # this log is recorded as empty instead of repeating the previous values.
+    HTTPD_COUNT=""
+    MEM_TOTAL=""; MEM_USED=""; MEM_AVAILABLE=""
+    SWAP_TOTAL=""; SWAP_USED=""
+    LOAD_1MIN=""; LOAD_5MIN=""; LOAD_15MIN=""
+    TASK_TOTAL=""; TASK_RUNNING=""; TASK_ZOMBIE=""
+    CPU_IDLE=""; CPU_IOWAIT=""; CPU_STEAL=""
+    MYSQL_QPS=""; MYSQL_SLOW=""
+
+    # Check for swap alert (first line)
+    FIRST_LINE=""
+    IFS= read -r FIRST_LINE < "$logfile"
+    if [[ "$FIRST_LINE" == *"swap threshold exceeded"* ]]; then
+        SWAP_ALERT_COUNT=$((SWAP_ALERT_COUNT + 1))
+        echo "$TIMESTAMP SWAP_THRESHOLD_EXCEEDED" >> "$TEMP_DIR/alerts.txt"
+    fi
+
+    # Parse summary line: timestamp load[X] mem[XX/YY] mysql[X/X] httpd[X]
+    SUMMARY=$(grep -A1 "## server overview" "$logfile" | tail -1)
+    if [[ "$SUMMARY" =~ $HTTPD_RE ]]; then
+        HTTPD_COUNT=${BASH_REMATCH[1]}
+    fi
+
+    # Parse memory stats (columns: label total used free shared buff/cache available)
+    MEM_LINE=$(grep -m1 "^Mem:" "$logfile")
+    if [ -n "$MEM_LINE" ]; then
+        read -r _ MEM_TOTAL MEM_USED _ _ _ MEM_AVAILABLE _ <<< "$MEM_LINE"
+    fi
+
+    SWAP_LINE=$(grep -m1 "^Swap:" "$logfile")
+    if [ -n "$SWAP_LINE" ]; then
+        read -r _ SWAP_TOTAL SWAP_USED _ <<< "$SWAP_LINE"
+    fi
+
+    # Parse load average
+    LOAD_LINE=$(grep -m1 "load average:" "$logfile")
+    if [[ "$LOAD_LINE" =~ $LOAD_RE ]]; then
+        LOAD_1MIN=${BASH_REMATCH[1]}
+        LOAD_5MIN=${BASH_REMATCH[2]}
+        LOAD_15MIN=${BASH_REMATCH[3]}
+    fi
+
+    # Parse task summary ("Tasks: N total, N running, ..., N zombie")
+    TASK_LINE=$(grep -m1 "^Tasks:" "$logfile")
+    if [ -n "$TASK_LINE" ]; then
+        TASK_TOTAL=$(value_before_label "$TASK_LINE" total)
+        TASK_RUNNING=$(value_before_label "$TASK_LINE" running)
+        TASK_ZOMBIE=$(value_before_label "$TASK_LINE" zombie)
+
+        if is_positive_int "$TASK_ZOMBIE"; then
+            echo "$TIMESTAMP ZOMBIE_PROCESSES count=$TASK_ZOMBIE" >> "$TEMP_DIR/alerts.txt"
+        fi
+    fi
+
+    # Parse CPU stats ("%Cpu(s): N us, N sy, N ni, N id, N wa, N hi, N si, N st")
+    CPU_LINE=$(grep -m1 "^%Cpu(s):" "$logfile")
+    if [ -n "$CPU_LINE" ]; then
+        CPU_IDLE=$(value_before_label "$CPU_LINE" id)
+        CPU_IOWAIT=$(value_before_label "$CPU_LINE" wa)
+        CPU_STEAL=$(value_before_label "$CPU_LINE" st)
+
+        # Check thresholds on the integer part of each percentage
+        CPU_IDLE_INT=${CPU_IDLE%%.*}
+        if [[ "$CPU_IDLE_INT" =~ ^[0-9]+$ ]] && [ "$CPU_IDLE_INT" -lt 10 ]; then
+            echo "$TIMESTAMP CPU_SATURATED idle=${CPU_IDLE}%" >> "$TEMP_DIR/alerts.txt"
+        fi
+
+        IOWAIT_INT=${CPU_IOWAIT%%.*}
+        if [[ "$IOWAIT_INT" =~ ^[0-9]+$ ]] && [ "$IOWAIT_INT" -gt 20 ]; then
+            echo "$TIMESTAMP HIGH_IOWAIT iowait=${CPU_IOWAIT}%" >> "$TEMP_DIR/alerts.txt"
+        fi
+    fi
+
+    # Parse MySQL stats. "Slow queries" is a counter that only grows while the
+    # server runs, so an alert is raised when it increased since the previous
+    # snapshot rather than whenever it is non-zero.
+    MYSQL_LINE=$(grep -m1 "^Uptime:" "$logfile")
+    if [ -n "$MYSQL_LINE" ]; then
+        if [[ "$MYSQL_LINE" =~ $QPS_RE ]]; then
+            MYSQL_QPS=${BASH_REMATCH[1]}
+        fi
+        if [[ "$MYSQL_LINE" =~ $SLOW_RE ]]; then
+            MYSQL_SLOW=$((10#${BASH_REMATCH[1]}))
+            if [ -n "$PREV_MYSQL_SLOW" ]; then
+                if [ "$MYSQL_SLOW" -ge "$PREV_MYSQL_SLOW" ]; then
+                    NEW_SLOW=$((MYSQL_SLOW - PREV_MYSQL_SLOW))
+                else
+                    # A smaller value means the counter restarted from zero.
+                    NEW_SLOW=$MYSQL_SLOW
+                fi
+                if [ "$NEW_SLOW" -gt 0 ]; then
+                    echo "$TIMESTAMP MYSQL_SLOW_QUERIES count=$NEW_SLOW total=$MYSQL_SLOW" >> "$TEMP_DIR/alerts.txt"
+                fi
+            fi
+            PREV_MYSQL_SLOW=$MYSQL_SLOW
+        fi
+    fi
+
+    # Extract top 3 CPU processes: the first three numeric-PID rows after the
+    # "PID USER ... COMMAND" header, stopping at the "USER ... TTY" header.
+    # Output columns: timestamp pid command %cpu %mem
+    awk -v ts="$TIMESTAMP" '
+        /PID USER.*COMMAND/ { in_top = 1; next }
+        /^USER.*TTY/        { if (in_top) exit }
+        in_top && $1 ~ /^[0-9]+$/ {
+            print ts, $1, $12, $9, $10
+            if (++rows == 3) exit
+        }
+    ' "$logfile" >> "$TEMP_DIR/top_processes.txt"
+
+    # Write metrics line
+    echo "$TIMESTAMP|$MEM_AVAILABLE|$MEM_USED|$MEM_TOTAL|$SWAP_USED|$SWAP_TOTAL|$LOAD_1MIN|$LOAD_5MIN|$LOAD_15MIN|$CPU_IDLE|$CPU_IOWAIT|$CPU_STEAL|$TASK_TOTAL|$TASK_RUNNING|$TASK_ZOMBIE|$MYSQL_QPS|$HTTPD_COUNT" >> "$TEMP_DIR/metrics.txt"
+
+    # Progress indicator
+    if [ $((LOG_COUNT % 20)) -eq 0 ]; then
+        echo -ne "\rProcessed $LOG_COUNT/$ANALYZED_LOGS logs..."
+    fi
+done < "$TEMP_DIR/logfiles.txt"
+
+echo -ne "\r"
+print_success "Parsed $LOG_COUNT logs successfully"
+
+#############################################################################
+# PHASE 2: Analyze metrics and detect issues
+#############################################################################
+
+print_status "Phase 2/4: Analyzing metrics and detecting issues..."
+
+# Memory analysis
+print_substatus "Analyzing memory usage..."
+# Summarize available memory (metrics field 2) over all snapshots that carry
+# a positive numeric value. Writes one line
+#   MEMORY_AVAILABLE avg=.. min=.. max=.. critical=.. warning=..
+# or nothing when no snapshot qualifies. Counters are printed with %d so that
+# a count of zero appears as "0" rather than as an empty string.
+awk -F'|' '
+    $2 ~ /^[0-9]+(\.[0-9]+)?$/ && $2 > 0 {
+        avail = $2 + 0
+        total += avail
+        if (++n == 1 || avail < low) low = avail
+        if (avail > high) high = avail
+        if (avail < 200) crit++
+        else if (avail < 500) warn++
+    }
+    END {
+        if (n > 0)
+            printf "MEMORY_AVAILABLE avg=%.1f min=%d max=%d critical=%d warning=%d\n", total / n, low, high, crit, warn
+    }
+' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/memory_analysis.txt"
+
+# Swap analysis: every snapshot with a configured swap area (field 6 > 0)
+# counts, including those with zero usage, so the average is not biased
+# towards the snapshots in which swap happened to be in use. Writes
+#   SWAP_USAGE avg_used=.. max_pct=.. high_count=..
+awk -F'|' '
+    $5 ~ /^[0-9]+(\.[0-9]+)?$/ && $6 ~ /^[0-9]+(\.[0-9]+)?$/ && $6 > 0 {
+        pct = ($5 / $6) * 100
+        used_total += $5
+        n++
+        if (pct > peak_pct) peak_pct = pct
+        if (pct >= 50) high++
+    }
+    END {
+        if (n > 0)
+            printf "SWAP_USAGE avg_used=%.1f max_pct=%.1f high_count=%d\n", used_total / n, peak_pct, high
+    }
+' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/swap_analysis.txt"
+
+# Load average analysis (1-minute load, metrics field 7). Writes
+#   LOAD_AVERAGE avg_1min=.. max_1min=.. high_count=..
+# where high_count is the number of snapshots with a load above 2.0.
+print_substatus "Analyzing load average..."
+awk -F'|' '
+    $7 ~ /^[0-9]+(\.[0-9]+)?$/ {
+        load = $7 + 0
+        load_total += load
+        n++
+        if (load > peak) peak = load
+        if (load > 2.0) high++
+    }
+    END {
+        if (n > 0)
+            printf "LOAD_AVERAGE avg_1min=%.2f max_1min=%.2f high_count=%d\n", load_total / n, peak, high
+    }
+' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/load_analysis.txt"
+
+# CPU analysis
+print_substatus "Analyzing CPU usage..."
+# Summarize CPU idle (metrics field 10) and I/O wait (field 11). Writes
+#   CPU_STATS avg_idle=.. min_idle=.. critical=.. warning=.. avg_iowait=.. high_iowait=..
+# or nothing when no snapshot has an idle value. The minimum is seeded from
+# the first sample, so a genuine 0% idle reading is kept, and the I/O wait
+# average is divided by the number of I/O wait samples.
+awk -F'|' '
+    $10 ~ /^[0-9]+(\.[0-9]+)?$/ {
+        idle = $10 + 0
+        idle_total += idle
+        if (++idle_n == 1 || idle < idle_min) idle_min = idle
+        if (int(idle) < 10) crit++
+        else if (int(idle) < 20) warn++
+    }
+    $11 ~ /^[0-9]+(\.[0-9]+)?$/ {
+        iowait_total += $11
+        iowait_n++
+        if (int($11) > 20) high_iowait++
+    }
+    END {
+        if (idle_n > 0) {
+            avg_iowait = (iowait_n > 0) ? iowait_total / iowait_n : 0
+            printf "CPU_STATS avg_idle=%.1f min_idle=%.1f critical=%d warning=%d avg_iowait=%.1f high_iowait=%d\n", idle_total / idle_n, idle_min, crit, warn, avg_iowait, high_iowait
+        }
+    }
+' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/cpu_analysis.txt"
+
+print_success "Metric analysis complete"
+
+#############################################################################
+# PHASE 3: Identify top resource consumers
+#############################################################################
+
+print_status "Phase 3/4: Identifying resource consumers..."
+
+# Top CPU processes (aggregate across all snapshots).
+# Input columns: timestamp pid command %cpu %mem. Rows with no CPU usage are
+# ignored. Output: "<average %cpu> <command> <occurrences>", ten highest
+# averages first; sorting runs in the C locale so "." is the decimal point.
+print_substatus "Analyzing top CPU consumers..."
+awk '
+    $4 > 0 {
+        cpu_total[$3] += $4
+        seen[$3]++
+    }
+    END {
+        for (cmd in cpu_total)
+            printf "%.2f %s %d\n", cpu_total[cmd] / seen[cmd], cmd, seen[cmd]
+    }
+' "$TEMP_DIR/top_processes.txt" | LC_ALL=C sort -k1,1nr | head -10 > "$TEMP_DIR/top_cpu_consumers.txt"
+
+print_success "Resource analysis complete"
+
+#############################################################################
+# PHASE 4: Generate report
+#############################################################################
+
+print_status "Phase 4/4: Generating report..."
+
+# Rule printed above and below every report section title.
+REPORT_RULE="================================================================================"
+
+# Print a section banner: rule, title ($1), rule, blank line.
+report_section() {
+    echo "$REPORT_RULE"
+    echo "$1"
+    echo "$REPORT_RULE"
+    echo ""
+}
+
+# Print how many alert lines contain the tag $1 (0 when there are none or
+# the alert file cannot be read).
+alert_total() {
+    local total
+    total=$(grep -c -F -- "$1" "$TEMP_DIR/alerts.txt" 2>/dev/null)
+    echo "${total:-0}"
+}
+
+# Print one alert group: a heading with the total, the first five entries
+# and a "... and N more" line when entries were left out. Prints nothing
+# when no alert carries the tag. $1 = heading, $2 = alert tag.
+report_alert_group() {
+    local heading=$1 tag=$2 total entry
+    total=$(alert_total "$tag")
+    [ "$total" -gt 0 ] || return 0
+    echo "$heading: $total occurrences"
+    grep -F -- "$tag" "$TEMP_DIR/alerts.txt" | head -5 | while IFS= read -r entry; do
+        echo " - $entry"
+    done
+    if [ "$total" -gt 5 ]; then
+        echo " ... and $((total - 5)) more"
+    fi
+    echo ""
+}
+
+# Print the value of a key=value word ($1).
+kv() {
+    printf '%s\n' "${1#*=}"
+}
+
+# Print the value of a key=value word ($1) without its fractional part.
+kv_int() {
+    local value=${1#*=}
+    printf '%s\n' "${value%%.*}"
+}
+
+# Print $1 when it is a plain non-negative integer, otherwise print $2.
+int_or() {
+    if [[ "$1" =~ ^[0-9]+$ ]]; then
+        printf '%s\n' "$1"
+    else
+        printf '%s\n' "$2"
+    fi
+}
+
+# The whole report is produced inside one group so that it can be shown on
+# the terminal and saved to $OUTPUT_FILE in a single pass. The group runs in
+# a subshell (it is a pipeline stage), so variables set inside are only used
+# inside.
+{
+    report_section "LOADWATCH SYSTEM HEALTH ANALYSIS REPORT"
+    echo "Report Generated: $(date '+%Y-%m-%d %H:%M:%S')"
+    echo "Analysis Period: $RANGE_DESC"
+    echo "Snapshots Analyzed: $ANALYZED_LOGS (@ 3-minute intervals)"
+    echo ""
+
+    report_section "CRITICAL ALERTS SUMMARY"
+
+    ALERT_COUNT=$(wc -l < "$TEMP_DIR/alerts.txt")
+    # Needed by the recommendations even when the alert list is not printed.
+    ZOMBIE_ALERTS=$(alert_total "ZOMBIE_PROCESSES")
+    if [ "$ALERT_COUNT" -eq 0 ]; then
+        echo "✓ No critical alerts detected"
+        echo ""
+    else
+        echo "⚠ Total Alerts: $ALERT_COUNT"
+        echo ""
+
+        # Swap alerts carry no detail, so only the first and last time are shown
+        SWAP_ALERTS=$(alert_total "SWAP_THRESHOLD_EXCEEDED")
+        if [ "$SWAP_ALERTS" -gt 0 ]; then
+            echo "SWAP THRESHOLD EXCEEDED: $SWAP_ALERTS occurrences"
+            echo " First: $(grep -F "SWAP_THRESHOLD_EXCEEDED" "$TEMP_DIR/alerts.txt" | head -1 | awk '{print $1}')"
+            echo " Last: $(grep -F "SWAP_THRESHOLD_EXCEEDED" "$TEMP_DIR/alerts.txt" | tail -1 | awk '{print $1}')"
+            echo ""
+        fi
+
+        report_alert_group "CPU SATURATION" "CPU_SATURATED"
+        report_alert_group "HIGH I/O WAIT" "HIGH_IOWAIT"
+        report_alert_group "ZOMBIE PROCESSES" "ZOMBIE_PROCESSES"
+        report_alert_group "MYSQL SLOW QUERIES" "MYSQL_SLOW_QUERIES"
+    fi
+
+    report_section "MEMORY ANALYSIS"
+
+    # -s rather than -f: an analysis file that exists but is empty means
+    # that no snapshot contained usable data for that metric.
+    if [ -s "$TEMP_DIR/memory_analysis.txt" ]; then
+        read -r _ avg min max critical warning < "$TEMP_DIR/memory_analysis.txt"
+        AVG_MEM=$(kv_int "$avg")
+        MIN_MEM=$(kv_int "$min")
+        MAX_MEM=$(kv_int "$max")
+        CRIT_COUNT=$(int_or "$(kv "$critical")" 0)
+        WARN_COUNT=$(int_or "$(kv "$warning")" 0)
+
+        echo "Available Memory:"
+        echo " Average: ${AVG_MEM} MiB"
+        echo " Minimum: ${MIN_MEM} MiB"
+        echo " Maximum: ${MAX_MEM} MiB"
+        echo ""
+        echo "Memory Pressure Events:"
+        echo " Critical (< 200 MiB): $CRIT_COUNT snapshots"
+        echo " Warning (< 500 MiB): $WARN_COUNT snapshots"
+        echo ""
+    else
+        echo "No memory data found in analyzed snapshots"
+        echo ""
+    fi
+
+    if [ -s "$TEMP_DIR/swap_analysis.txt" ]; then
+        read -r _ avg_used max_pct high_count < "$TEMP_DIR/swap_analysis.txt"
+        AVG_SWAP=$(kv_int "$avg_used")
+        MAX_SWAP_PCT=$(kv_int "$max_pct")
+        HIGH_SWAP=$(int_or "$(kv "$high_count")" 0)
+
+        echo "Swap Usage:"
+        echo " Average Used: ${AVG_SWAP} MiB"
+        echo " Maximum %: ${MAX_SWAP_PCT}%"
+        echo " High (>= 50%): $HIGH_SWAP snapshots"
+        echo ""
+    else
+        echo "No swap usage recorded in analyzed snapshots"
+        echo ""
+    fi
+
+    report_section "CPU & LOAD ANALYSIS"
+
+    if [ -s "$TEMP_DIR/load_analysis.txt" ]; then
+        read -r _ avg_load max_load high_count < "$TEMP_DIR/load_analysis.txt"
+        AVG_LOAD=$(kv "$avg_load")
+        MAX_LOAD=$(kv "$max_load")
+        HIGH_LOAD=$(int_or "$(kv "$high_count")" 0)
+
+        echo "Load Average (1-minute):"
+        echo " Average: $AVG_LOAD"
+        echo " Maximum: $MAX_LOAD"
+        echo " High (> 2.0): $HIGH_LOAD snapshots"
+        echo ""
+    else
+        echo "No load average data found in analyzed snapshots"
+        echo ""
+    fi
+
+    if [ -s "$TEMP_DIR/cpu_analysis.txt" ]; then
+        read -r _ avg_idle min_idle critical warning avg_iowait high_iowait < "$TEMP_DIR/cpu_analysis.txt"
+        AVG_IDLE=$(kv "$avg_idle")
+        MIN_IDLE=$(kv "$min_idle")
+        CRIT_CPU=$(int_or "$(kv "$critical")" 0)
+        WARN_CPU=$(int_or "$(kv "$warning")" 0)
+        AVG_IOWAIT=$(kv "$avg_iowait")
+        HIGH_IOWAIT=$(int_or "$(kv "$high_iowait")" 0)
+
+        echo "CPU Idle Time:"
+        echo " Average: ${AVG_IDLE}%"
+        echo " Minimum: ${MIN_IDLE}%"
+        echo ""
+        echo "CPU Pressure Events:"
+        echo " Critical (< 10% idle): $CRIT_CPU snapshots"
+        echo " Warning (< 20% idle): $WARN_CPU snapshots"
+        echo ""
+        echo "I/O Wait:"
+        echo " Average: ${AVG_IOWAIT}%"
+        echo " High (> 20%): $HIGH_IOWAIT snapshots"
+        echo ""
+    else
+        echo "No CPU data found in analyzed snapshots"
+        echo ""
+    fi
+
+    report_section "TOP CPU CONSUMERS (Averaged Across Period)"
+
+    if [ -s "$TEMP_DIR/top_cpu_consumers.txt" ]; then
+        printf "%-10s %-50s %s\n" "AVG CPU%" "PROCESS" "OCCURRENCES"
+        printf "%-10s %-50s %s\n" "--------" "------------------------------------------------" "-----------"
+        while read -r avg_cpu cmd occurrences; do
+            printf "%-10.1f %-50s %s\n" "$avg_cpu" "$cmd" "$occurrences"
+        done < "$TEMP_DIR/top_cpu_consumers.txt"
+    else
+        echo "No significant CPU consumers found"
+    fi
+    echo ""
+
+    report_section "RECOMMENDATIONS"
+
+    # Generate recommendations based on findings
+    RECOMMENDATIONS=0
+
+    # Metrics without data fall back to values that trigger no recommendation
+    CRIT_COUNT=$(int_or "$CRIT_COUNT" 0)
+    AVG_MEM=$(int_or "$AVG_MEM" 1000)
+    HIGH_SWAP=$(int_or "$HIGH_SWAP" 0)
+    CRIT_CPU=$(int_or "$CRIT_CPU" 0)
+    HIGH_IOWAIT=$(int_or "$HIGH_IOWAIT" 0)
+    ZOMBIE_ALERTS=$(int_or "$ZOMBIE_ALERTS" 0)
+
+    if [ "$CRIT_COUNT" -gt 0 ] || [ "$AVG_MEM" -lt 300 ]; then
+        echo "⚠ MEMORY: Critical memory pressure detected"
+        # Name the condition that actually fired instead of always blaming
+        # the average.
+        if [ "$AVG_MEM" -lt 300 ]; then
+            echo " - Average available memory is very low (${AVG_MEM} MiB)"
+        fi
+        if [ "$CRIT_COUNT" -gt 0 ]; then
+            echo " - Available memory fell below 200 MiB in $CRIT_COUNT snapshots"
+        fi
+        echo " - Consider adding more RAM to the server"
+        echo " - Review top memory consumers and optimize/limit resource usage"
+        echo " - Check for memory leaks in long-running processes"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
+
+    if [ "$HIGH_SWAP" -gt "$((ANALYZED_LOGS / 2))" ]; then
+        echo "⚠ SWAP: Excessive swap usage detected"
+        echo " - Swap usage exceeded 50% in ${HIGH_SWAP} of ${ANALYZED_LOGS} snapshots"
+        echo " - This indicates insufficient RAM for current workload"
+        echo " - Add more physical RAM or reduce memory-intensive processes"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
+
+    if [ "$CRIT_CPU" -gt 10 ]; then
+        echo "⚠ CPU: CPU saturation detected"
+        echo " - CPU was saturated (< 10% idle) in $CRIT_CPU snapshots"
+        echo " - Review top CPU consumers above"
+        echo " - Consider upgrading to more CPU cores"
+        echo " - Optimize or throttle CPU-intensive processes"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
+
+    if [ "$HIGH_IOWAIT" -gt 10 ]; then
+        echo "⚠ I/O: High disk I/O wait detected"
+        echo " - I/O wait exceeded 20% in $HIGH_IOWAIT snapshots"
+        echo " - Indicates disk bottleneck or slow storage"
+        echo " - Consider upgrading to SSD storage"
+        echo " - Check for disk-intensive processes or failing drives"
+        echo " - Run: iostat -x 1 5 to identify slow devices"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
+
+    if [ "$ZOMBIE_ALERTS" -gt 0 ]; then
+        echo "⚠ PROCESSES: Zombie processes detected"
+        echo " - Zombie processes found in $ZOMBIE_ALERTS snapshots"
+        echo " - Indicates parent processes not cleaning up children properly"
+        echo " - Review process management in affected applications"
+        echo " - May require application restart or code fixes"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
+
+    if [ "$RECOMMENDATIONS" -eq 0 ]; then
+        echo "✓ System appears healthy during analyzed period"
+        echo " - No critical issues detected"
+        echo " - Memory, CPU, and I/O metrics within normal ranges"
+        echo " - Continue regular monitoring"
+        echo ""
+    fi
+
+    report_section "ANALYSIS COMPLETE"
+    echo "Report saved to: $OUTPUT_FILE"
+    echo ""
+
+} | tee "$OUTPUT_FILE"
+REPORT_STATUS=("${PIPESTATUS[@]}")
+
+# Index 1 is tee: a non-zero status means the report file was not written.
+if [ "${REPORT_STATUS[1]}" -ne 0 ]; then
+    print_error "Failed to write report to: $OUTPUT_FILE"
+    exit 1
+fi
+
+print_success "Report generated successfully"
+print_info "Report location: $OUTPUT_FILE"
+
+# Display report path for easy access
+echo ""
+echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+echo -e "${BOLD}View full report:${NC} cat $OUTPUT_FILE"
+echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+echo ""