2461d972ce
lib/php-analyzer.sh: - Line 364: Initialize sum=0 in awk for request counting - Line 1374: Initialize sum=0 in awk for MySQL memory calculation modules/diagnostics/loadwatch-analyzer.sh: - Lines 748-752: Initialize i=0 for memory velocity parsing - Lines 794-797: Initialize i=0 for load trend parsing modules/performance/hardware-health-check.sh: - Lines 1243, 1244, 1247: Initialize sum=0 for network error metrics Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
1088 lines
41 KiB
Bash
Executable File
1088 lines
41 KiB
Bash
Executable File
#!/bin/bash
#############################################################################
# Loadwatch Log Analyzer
# Version: 1.0
# Comprehensive system health analysis from loadwatch monitoring logs
#
# Features:
# - Time-range analysis (1h, 6h, 24h, 7d, 30d)
# - Memory pressure detection (low available memory, high swap usage)
# - CPU saturation analysis (idle, iowait, steal time)
# - Load average trending
# - Process issue detection (zombies, high CPU/MEM consumers)
# - MySQL performance monitoring
# - Network connection analysis
# - Historical trending and pattern detection
# - Actionable recommendations based on findings
#############################################################################

# Load libraries
# SCRIPT_DIR resolves to the directory two levels above this script's own
# directory; the shared library and the tmp/ work area are located under it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "$SCRIPT_DIR/lib/common-functions.sh"

# Configuration
LOADWATCH_DIR="/root/loadwatch"  # default log directory; -d/--directory overrides
TOOLKIT_TMP_DIR="$SCRIPT_DIR/tmp"
mkdir -p "$TOOLKIT_TMP_DIR" 2>/dev/null

# Per-run scratch directory (suffixed with this shell's PID) and the default
# report path (suffixed with the start time); -o/--output overrides the latter.
TEMP_DIR="$TOOLKIT_TMP_DIR/loadwatch_analysis_$$"
OUTPUT_FILE="$TOOLKIT_TMP_DIR/loadwatch_report_$(date +%Y%m%d_%H%M%S).txt"
TIME_RANGE=""  # 1h, 6h, 24h|day, 7d|week, 30d|month (empty = default applied later)

# Parse command line arguments

#######################################
# Abort when an option that takes a value is the last word on the command
# line. Without this check `shift 2` fails, no argument is consumed, and the
# parsing loop below would spin forever.
# Arguments: the remaining command line, option first ("$@").
# Outputs:   error text on stderr; exits 1 when the value is missing.
#######################################
_lw_require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Option $1 requires an argument" >&2
    echo "Use -h or --help for usage information" >&2
    exit 1
  fi
}

#######################################
# Read the command line into the configuration globals.
# Globals:   TIME_RANGE, LOADWATCH_DIR, OUTPUT_FILE (written)
# Arguments: the script's command line ("$@")
# Outputs:   help text or an error message; exits 0 after --help and 1 on an
#            unknown option or a missing option value.
#######################################
_lw_parse_args() {
  while [[ $# -gt 0 ]]; do
    case $1 in
      -r|--range)
        _lw_require_value "$@"
        TIME_RANGE="$2"
        shift 2
        ;;
      -d|--directory)
        _lw_require_value "$@"
        LOADWATCH_DIR="$2"
        shift 2
        ;;
      -o|--output)
        _lw_require_value "$@"
        OUTPUT_FILE="$2"
        shift 2
        ;;
      -h|--help)
        echo "Loadwatch Log Analyzer v1.0"
        echo ""
        echo "Usage: $0 [-r RANGE] [-d DIRECTORY] [-o OUTPUT_FILE]"
        echo ""
        echo "Options:"
        echo " -r, --range RANGE Time range: 1h, 6h, 24h, 7d, 30d (default: 24h)"
        echo " -d, --directory DIR Loadwatch log directory (default: /root/loadwatch)"
        echo " -o, --output FILE Custom output file path"
        echo " -h, --help Show this help message"
        echo ""
        echo "Examples:"
        echo " $0 -r 1h # Analyze last hour"
        echo " $0 -r 6h # Analyze last 6 hours"
        echo " $0 -r 24h # Analyze last 24 hours"
        echo " $0 -r 7d # Analyze last 7 days"
        echo " $0 -r 30d # Analyze last 30 days"
        echo ""
        exit 0
        ;;
      *)
        echo "Unknown option: $1"
        echo "Use -h or --help for usage information"
        exit 1
        ;;
    esac
  done

  # Set default time range if not specified
  if [ -z "$TIME_RANGE" ]; then
    TIME_RANGE="24h"
  fi
}

_lw_parse_args "$@"

# Cleanup function
# Removes the per-run scratch directory named by TEMP_DIR. Installed as the
# EXIT trap below, so it runs on every exit path of the script.
cleanup() {
  # Nothing to remove when the path was never set.
  [ -n "$TEMP_DIR" ] || return 0
  rm -rf -- "$TEMP_DIR" 2>/dev/null
}
trap cleanup EXIT

# Create temp directory
mkdir -p "$TEMP_DIR"

# Color codes (escape sequences, interpreted by `echo -e`)
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Helper functions for output (if not already loaded from common-functions).
# Each helper is checked on its own so that a library providing only one of
# them still leaves the other available.
if ! type print_status >/dev/null 2>&1; then
  print_status() { echo -e "${BLUE}[STATUS]${NC} $1"; }
fi
if ! type print_substatus >/dev/null 2>&1; then
  print_substatus() { echo -e " ${BLUE}→${NC} $1"; }
fi

# Start analysis
print_header "LOADWATCH SYSTEM HEALTH ANALYZER v1.0"
print_info "Analysis Time: $(date '+%Y-%m-%d %H:%M:%S')"
print_info "Time Range: $TIME_RANGE"
print_info "Log Directory: $LOADWATCH_DIR"

# Check if loadwatch directory exists
if [ ! -d "$LOADWATCH_DIR" ]; then
  print_error "Loadwatch directory not found: $LOADWATCH_DIR"
  print_info "This server may not have loadwatch monitoring enabled"
  exit 1
fi

# Count available logs (handle symlinked directories).
# -maxdepth 1 keeps the count in step with the collection loop, which only
# looks at *.txt files directly inside the directory; tr strips the padding
# some wc implementations add.
TOTAL_LOGS=$(find -L "$LOADWATCH_DIR" -maxdepth 1 -name "*.txt" -type f 2>/dev/null | wc -l | tr -d '[:space:]')
if [ "$TOTAL_LOGS" -eq 0 ]; then
  print_error "No loadwatch logs found in $LOADWATCH_DIR"
  exit 1
fi
print_info "Total logs available: $TOTAL_LOGS"

# Calculate time cutoff based on range

#######################################
# Print the timestamp (YYYY-MM-DD.HH.MM) of a moment in the past.
# Tries the `date -d` form first and falls back to the `date -v` form when
# that fails.
# Arguments: $1 - relative expression for `date -d` (e.g. "6 hours ago")
#            $2 - adjustment for `date -v-` (e.g. "6H")
# Outputs:   the formatted timestamp on stdout
#######################################
_lw_cutoff_before() {
  date -d "$1" '+%Y-%m-%d.%H.%M' 2>/dev/null || date "-v-$2" '+%Y-%m-%d.%H.%M'
}

CUTOFF_TIME=""
case "$TIME_RANGE" in
  1h)
    CUTOFF_TIME=$(_lw_cutoff_before '1 hour ago' 1H)
    RANGE_DESC="Last 1 Hour"
    ;;
  6h)
    CUTOFF_TIME=$(_lw_cutoff_before '6 hours ago' 6H)
    RANGE_DESC="Last 6 Hours"
    ;;
  24h|day)
    CUTOFF_TIME=$(_lw_cutoff_before '24 hours ago' 24H)
    RANGE_DESC="Last 24 Hours"
    ;;
  7d|week)
    CUTOFF_TIME=$(_lw_cutoff_before '7 days ago' 7d)
    RANGE_DESC="Last 7 Days"
    ;;
  30d|month)
    CUTOFF_TIME=$(_lw_cutoff_before '30 days ago' 30d)
    RANGE_DESC="Last 30 Days"
    ;;
  *)
    print_error "Invalid time range: $TIME_RANGE"
    echo "Valid options: 1h, 6h, 24h, 7d, 30d"
    exit 1
    ;;
esac

print_info "Analyzing: $RANGE_DESC"
print_info "Cutoff time: $CUTOFF_TIME"

# Find logs within time range
print_status "Collecting logs in time range..."
> "$TEMP_DIR/logfiles.txt"

# Walk the *.txt files directly inside LOADWATCH_DIR and list those whose
# name is at or after CUTOFF_TIME, one path per line, in logfiles.txt.
ANALYZED_LOGS=0
for logfile in "$LOADWATCH_DIR"/*.txt; do
  [ -f "$logfile" ] || continue
  log_name=${logfile##*/}
  log_name=${log_name%.txt}

  # Only names that start with a YYYY-MM-DD. stamp can be ordered against the
  # cutoff; anything else (e.g. notes.txt) would sort after every date and be
  # picked up by mistake.
  [[ $log_name == [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].* ]] || continue

  # Compare timestamps (simple string comparison works with YYYY-MM-DD.HH.MM format)
  if [[ ! "$log_name" < "$CUTOFF_TIME" ]]; then
    echo "$logfile" >> "$TEMP_DIR/logfiles.txt"
    ANALYZED_LOGS=$((ANALYZED_LOGS + 1))
  fi
done

if [ "$ANALYZED_LOGS" -eq 0 ]; then
  print_error "No logs found in specified time range"
  exit 1
fi
print_success "Found $ANALYZED_LOGS logs in range"

#############################################################################
# PHASE 1: Parse all logs and extract metrics
#############################################################################

print_status "Phase 1/4: Parsing logs and extracting metrics..."

> "$TEMP_DIR/metrics.txt"
> "$TEMP_DIR/alerts.txt"
> "$TEMP_DIR/top_processes.txt"

LOG_COUNT=0
SWAP_ALERT_COUNT=0

# Succeeds when $1 is a non-empty string of decimal digits.
_lw_is_uint() {
  [[ $1 =~ ^[0-9]+$ ]]
}

#######################################
# Parse one loadwatch snapshot file and append what it contains to the
# per-run work files.
# Globals:   TEMP_DIR (read); TIMESTAMP and the per-snapshot metric variables
#            (reset, then written); SWAP_ALERT_COUNT (incremented)
# Arguments: $1 - path of the snapshot file (name without .txt = timestamp)
# Outputs:   appends to metrics.txt, alerts.txt, top_processes.txt,
#            top_mem_processes.txt, httpd_count.txt, rstate_count.txt and
#            network_states.txt under TEMP_DIR
#######################################
_lw_parse_snapshot() {
  local logfile=$1
  local summary mem_line swap_line load_line task_line cpu_line mysql_line
  local idle_int iowait_int steal_int i
  local -a fields=()

  TIMESTAMP=$(basename "$logfile" .txt)

  # Start every snapshot from a clean slate. Without this, a file that lacks a
  # section would silently report the previous snapshot's numbers.
  LOAD_ROUNDED="" MEM_PCT_CURRENT="" MEM_PCT_PREVIOUS=""
  MYSQL_THREADS_CURRENT="" MYSQL_THREADS_EXPECTED="" HTTPD_COUNT=""
  MEM_TOTAL="" MEM_USED="" MEM_AVAILABLE="" SWAP_TOTAL="" SWAP_USED=""
  LOAD_1MIN="" LOAD_5MIN="" LOAD_15MIN=""
  TASK_TOTAL="" TASK_RUNNING="" TASK_ZOMBIE=""
  CPU_USER="" CPU_SYSTEM="" CPU_IDLE="" CPU_IOWAIT="" CPU_STEAL=""
  MYSQL_QPS="" MYSQL_SLOW=""

  # Check for swap alert (first line)
  if head -1 "$logfile" | grep -q "swap threshold exceeded"; then
    SWAP_ALERT_COUNT=$((SWAP_ALERT_COUNT + 1))
    echo "$TIMESTAMP SWAP_THRESHOLD_EXCEEDED" >> "$TEMP_DIR/alerts.txt"
  fi

  # Parse summary line (the line after "## server overview"):
  #   timestamp load[X] mem[XX/YY] mysql[X/X] httpd[X]
  summary=$(grep -A1 "## server overview" "$logfile" | tail -1)
  if [ -n "$summary" ]; then
    read -r -a fields <<< "$summary"
    LOAD_ROUNDED=$(sed 's/load\[\(.*\)\]/\1/' <<< "${fields[1]}")
    MEM_PCT_CURRENT=$(sed 's/mem\[\(.*\)\/.*\]/\1/' <<< "${fields[2]}")
    MEM_PCT_PREVIOUS=$(sed 's/mem\[.*\/\(.*\)\]/\1/' <<< "${fields[2]}")
    MYSQL_THREADS_CURRENT=$(sed 's/mysql\[\(.*\)\/.*\]/\1/' <<< "${fields[3]}")
    MYSQL_THREADS_EXPECTED=$(sed 's/mysql\[.*\/\(.*\)\]/\1/' <<< "${fields[3]}")
    HTTPD_COUNT=$(sed 's/httpd\[\(.*\)\]/\1/' <<< "${fields[4]}")

    # MySQL thread anomaly detection: current more than 3x expected.
    # Only compared when both values are plain integers; 10# keeps values
    # with leading zeros from being read as octal.
    if _lw_is_uint "$MYSQL_THREADS_CURRENT" && _lw_is_uint "$MYSQL_THREADS_EXPECTED"; then
      if (( 10#$MYSQL_THREADS_CURRENT > 10#$MYSQL_THREADS_EXPECTED * 3 )); then
        ANOMALY=$(( 10#$MYSQL_THREADS_CURRENT - 10#$MYSQL_THREADS_EXPECTED ))
        echo "$TIMESTAMP MYSQL_THREAD_ANOMALY current=$MYSQL_THREADS_CURRENT expected=$MYSQL_THREADS_EXPECTED anomaly=+$ANOMALY" >> "$TEMP_DIR/alerts.txt"
      fi
    fi

    # Track httpd count for trending
    echo "$TIMESTAMP $HTTPD_COUNT" >> "$TEMP_DIR/httpd_count.txt"
  fi

  # Parse memory stats: fields 2, 3 and 7 of the "Mem:" line
  mem_line=$(grep -m1 "^Mem:" "$logfile")
  if [ -n "$mem_line" ]; then
    read -r _ MEM_TOTAL MEM_USED _ _ _ MEM_AVAILABLE _ <<< "$mem_line"
  fi

  swap_line=$(grep -m1 "^Swap:" "$logfile")
  if [ -n "$swap_line" ]; then
    read -r _ SWAP_TOTAL SWAP_USED _ <<< "$swap_line"
  fi

  # Parse load average
  load_line=$(grep -m1 "load average:" "$logfile")
  if [ -n "$load_line" ]; then
    LOAD_1MIN=$(sed 's/.*load average: \([0-9.]*\),.*/\1/' <<< "$load_line")
    LOAD_5MIN=$(sed 's/.*load average: [0-9.]*, \([0-9.]*\),.*/\1/' <<< "$load_line")
    LOAD_15MIN=$(sed 's/.*load average: [0-9.]*, [0-9.]*, \([0-9.]*\)/\1/' <<< "$load_line")
  fi

  # Parse task summary
  task_line=$(grep -m1 "^Tasks:" "$logfile")
  if [ -n "$task_line" ]; then
    read -r -a fields <<< "$task_line"
    TASK_TOTAL=${fields[1]}
    TASK_RUNNING=${fields[3]//,/}
    # The zombie count is the word in front of the "zombie" label. In the
    # usual "N total, N running, N sleeping, N stopped, N zombie" layout that
    # is field 10, so the former fixed field 12 came back empty. Field 12 is
    # kept as the fallback for lines without the label.
    TASK_ZOMBIE=${fields[11]}
    for ((i = 1; i < ${#fields[@]}; i++)); do
      if [[ ${fields[i]} == zombie* ]]; then
        TASK_ZOMBIE=${fields[i-1]}
        break
      fi
    done

    if [ "$TASK_ZOMBIE" -gt 0 ] 2>/dev/null; then
      echo "$TIMESTAMP ZOMBIE_PROCESSES count=$TASK_ZOMBIE" >> "$TEMP_DIR/alerts.txt"
    fi
  fi

  # Count R-state (runnable) processes for CPU pressure metric: rows of the
  # process table (from its "PID USER" header to the "USER ... TTY" header)
  # whose 8th column is exactly R.
  RSTATE_COUNT=$(awk '/PID USER.*COMMAND/,/^USER.*TTY/ {
    if ($8 == "R") count++
  } END {print count+0}' "$logfile")

  # Track R-state count (better CPU pressure metric than load average)
  echo "$TIMESTAMP $RSTATE_COUNT" >> "$TEMP_DIR/rstate_count.txt"

  # Parse CPU stats: values sit at fixed positions of the "%Cpu(s):" line
  cpu_line=$(grep -m1 "^%Cpu(s):" "$logfile")
  if [ -n "$cpu_line" ]; then
    read -r -a fields <<< "$cpu_line"
    CPU_USER=${fields[1]}
    CPU_SYSTEM=${fields[3]//,/}
    CPU_IDLE=${fields[7]//,/}
    CPU_IOWAIT=${fields[9]//,/}
    CPU_STEAL=${fields[15]}

    # Check thresholds on the integer part of each percentage
    idle_int=${CPU_IDLE%%.*}
    if [ "$idle_int" -lt 10 ] 2>/dev/null; then
      echo "$TIMESTAMP CPU_SATURATED idle=${CPU_IDLE}%" >> "$TEMP_DIR/alerts.txt"
    fi

    iowait_int=${CPU_IOWAIT%%.*}
    if [ "$iowait_int" -gt 20 ] 2>/dev/null; then
      echo "$TIMESTAMP HIGH_IOWAIT iowait=${CPU_IOWAIT}%" >> "$TEMP_DIR/alerts.txt"
    fi

    # CRITICAL: Check CPU steal time (VM resource contention)
    steal_int=${CPU_STEAL%%.*}
    if [ "$steal_int" -gt 10 ] 2>/dev/null; then
      echo "$TIMESTAMP HIGH_CPU_STEAL steal=${CPU_STEAL}%" >> "$TEMP_DIR/alerts.txt"
    fi
  fi

  # Parse MySQL stats
  mysql_line=$(grep -m1 "^Uptime:" "$logfile")
  if [ -n "$mysql_line" ]; then
    read -r -a fields <<< "$mysql_line"
    # Fixed positions are kept as the fallback, but when the line carries the
    # "Slow queries:" / "avg:" labels the value after the label wins.
    # NOTE(review): with a "Uptime: N Threads: N Questions: N Slow queries: N
    # ... Queries per second avg: N" layout, fields 8 and 18 are label words,
    # not numbers - confirm against a real loadwatch log.
    MYSQL_SLOW=${fields[7]}
    MYSQL_QPS=${fields[17]}
    for ((i = 1; i < ${#fields[@]} - 1; i++)); do
      if [[ ${fields[i]} == "queries:" && ${fields[i-1]} == "Slow" ]]; then
        MYSQL_SLOW=${fields[i+1]}
      elif [[ ${fields[i]} == "avg:" ]]; then
        MYSQL_QPS=${fields[i+1]}
      fi
    done

    if [ "$MYSQL_SLOW" -gt 0 ] 2>/dev/null; then
      echo "$TIMESTAMP MYSQL_SLOW_QUERIES count=$MYSQL_SLOW" >> "$TEMP_DIR/alerts.txt"
    fi
  fi

  # Extract top 3 CPU processes: the first three process rows in file order
  # (timestamp, PID, command, %CPU, %MEM), limited to the first 20 lines.
  # The timestamp is handed to awk with -v rather than spliced into the
  # program text, so an odd file name cannot alter the awk code.
  awk -v ts="$TIMESTAMP" '/PID USER.*COMMAND/,/^USER.*TTY/ {
    if ($1 ~ /^[0-9]+$/ && NR <= 20) {
      print ts, $1, $12, $9, $10
    }
  }' "$logfile" | head -3 >> "$TEMP_DIR/top_processes.txt"

  # CRITICAL: Check for kswapd0 in top processes (memory thrashing indicator)
  KSWAPD_CHECK=$(awk '/PID USER.*COMMAND/,/^USER.*TTY/ {
    if ($12 ~ /kswapd/ && $9 > 1.0) {
      print $9
    }
  }' "$logfile" | head -1)

  if [ -n "$KSWAPD_CHECK" ]; then
    echo "$TIMESTAMP MEMORY_THRASHING kswapd0_cpu=${KSWAPD_CHECK}%" >> "$TEMP_DIR/alerts.txt"
  fi

  # CRITICAL: Count D-state processes (I/O blocking): rows after the
  # "USER ... STAT ... COMMAND" header whose 8th column contains a D.
  DSTATE_COUNT=$(awk '/^USER.*STAT.*COMMAND/,/^## / {
    if ($8 ~ /D/) count++
  } END {print count+0}' "$logfile")

  if [ "$DSTATE_COUNT" -gt 0 ] 2>/dev/null; then
    echo "$TIMESTAMP IO_BLOCKED_PROCESSES count=$DSTATE_COUNT" >> "$TEMP_DIR/alerts.txt"
  fi

  # Extract top 3 memory consumers: same candidate rows as above, but ranked
  # by the %MEM column before the top three are taken. (Previously this was a
  # copy of the CPU extraction and kept the first three rows in file order.)
  awk -v ts="$TIMESTAMP" '/PID USER.*COMMAND/,/^USER.*TTY/ {
    if ($1 ~ /^[0-9]+$/ && NR <= 20) {
      print ts, $1, $12, $9, $10
    }
  }' "$logfile" | sort -k5,5 -rn | head -3 >> "$TEMP_DIR/top_mem_processes.txt"

  # Parse network connection states for attack/leak detection: within the
  # "## network stats" section, sum column 1 per state (column 3) on rows
  # whose column 2 is a dotted IPv4 address.
  awk -v ts="$TIMESTAMP" '/^## network stats/,/^$/ {
    if ($2 ~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ && $3 != "") {
      states[$3] += $1
    }
  }
  END {
    for (state in states) {
      print ts, state, states[state]
    }
  }' "$logfile" >> "$TEMP_DIR/network_states.txt"

  # Write metrics line (17 pipe-separated fields, read by the phase 2 analyses)
  echo "$TIMESTAMP|$MEM_AVAILABLE|$MEM_USED|$MEM_TOTAL|$SWAP_USED|$SWAP_TOTAL|$LOAD_1MIN|$LOAD_5MIN|$LOAD_15MIN|$CPU_IDLE|$CPU_IOWAIT|$CPU_STEAL|$TASK_TOTAL|$TASK_RUNNING|$TASK_ZOMBIE|$MYSQL_QPS|$HTTPD_COUNT" >> "$TEMP_DIR/metrics.txt"
}

while IFS= read -r logfile; do
  LOG_COUNT=$((LOG_COUNT + 1))
  _lw_parse_snapshot "$logfile"

  # Progress indicator
  if [ $((LOG_COUNT % 20)) -eq 0 ]; then
    echo -ne "\rProcessed $LOG_COUNT/$ANALYZED_LOGS logs..."
  fi
done < "$TEMP_DIR/logfiles.txt"

echo -ne "\r"
print_success "Parsed $LOG_COUNT logs successfully"

#############################################################################
# PHASE 2: Analyze metrics and detect issues
#############################################################################

print_status "Phase 2/4: Analyzing metrics and detecting issues..."

# Memory analysis
print_substatus "Analyzing memory usage..."

#######################################
# Summarise available memory (field 2 of the pipe-separated metrics file).
# Records whose field 2 is empty or not above zero are ignored.
# Arguments: $1 - metrics file
# Outputs:   one line "MEMORY_AVAILABLE avg=.. min=.. max=.. critical=..
#            warning=.." (critical: below 200, warning: 200 up to 499);
#            nothing when no record qualifies.
#######################################
_lw_analyze_memory() {
  awk -F'|' '
    # Counters start at zero so the report shows 0 rather than an empty value.
    BEGIN { max_avail = 0; critical = 0; warning = 0 }
    $2 != "" && $2 > 0 {
      sum_avail += $2
      count++
      if (count == 1 || $2 < min_avail) min_avail = $2
      if ($2 > max_avail) max_avail = $2
      if ($2 < 200) critical++
      else if ($2 < 500) warning++
    }
    END {
      if (count > 0) {
        print "MEMORY_AVAILABLE", "avg=" (sum_avail / count), "min=" min_avail, "max=" max_avail, "critical=" critical, "warning=" warning
      }
    }
  ' "$1"
}

_lw_analyze_memory "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/memory_analysis.txt"

# Swap analysis

#######################################
# Summarise swap usage from the metrics file (field 5 = used, field 6 =
# total). Only records with both values above zero are considered.
# Arguments: $1 - metrics file
# Outputs:   one line "SWAP_USAGE avg_used=.. max_pct=.. high_count=.."
#            (high: used/total at or above 50%); nothing when no record
#            qualifies.
#######################################
_lw_analyze_swap() {
  awk -F'|' '
    # Counters start at zero so the report shows 0 rather than an empty value.
    BEGIN { max_swap_pct = 0; high = 0 }
    $5 != "" && $5 > 0 && $6 > 0 {
      pct = ($5 / $6) * 100
      sum_swap += $5
      count++
      if (pct > max_swap_pct) max_swap_pct = pct
      if (pct >= 50) high++
    }
    END {
      if (count > 0) {
        print "SWAP_USAGE", "avg_used=" (sum_swap / count), "max_pct=" max_swap_pct, "high_count=" high
      }
    }
  ' "$1"
}

_lw_analyze_swap "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/swap_analysis.txt"

# Load average analysis
print_substatus "Analyzing load average..."

#######################################
# Summarise the 1-minute load average (field 7 of the metrics file).
# Arguments: $1 - metrics file
# Outputs:   one line "LOAD_AVERAGE avg_1min=.. max_1min=.. high_count=.."
#            (high: above 2.0); nothing when field 7 is empty everywhere.
#######################################
_lw_analyze_load() {
  awk -F'|' '
    # Counters start at zero so the report shows 0 rather than an empty value.
    BEGIN { max_load1 = 0; high = 0 }
    $7 != "" {
      sum_load1 += $7
      count++
      if ($7 > max_load1) max_load1 = $7
      if ($7 > 2.0) high++
    }
    END {
      if (count > 0) {
        print "LOAD_AVERAGE", "avg_1min=" (sum_load1 / count), "max_1min=" max_load1, "high_count=" high
      }
    }
  ' "$1"
}

_lw_analyze_load "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/load_analysis.txt"

# CPU analysis
print_substatus "Analyzing CPU usage..."

#######################################
# Summarise CPU idle (field 10) and I/O wait (field 11) from the metrics file.
# Arguments: $1 - metrics file
# Outputs:   one line "CPU_STATS avg_idle=.. min_idle=.. critical=..
#            warning=.. avg_iowait=.. high_iowait=.." (critical: idle below
#            10, warning: idle 10 up to 19, high_iowait: iowait above 20, all
#            on the integer part); nothing when field 10 is empty everywhere.
#######################################
_lw_analyze_cpu() {
  awk -F'|' '
    # Counters start at zero so the report shows 0 rather than an empty value.
    BEGIN { critical = 0; warning = 0; high_iowait = 0 }
    {
      if ($10 != "") {
        sum_idle += $10
        idle_count++
        # The first sample seeds the minimum. Using 0 as the "not set yet"
        # marker would throw away a genuine 0% idle reading.
        if (idle_count == 1 || $10 < min_idle) min_idle = $10
        idle_int = int($10)
        if (idle_int < 10) critical++
        else if (idle_int < 20) warning++
      }
      if ($11 != "") {
        sum_iowait += $11
        iowait_count++
        if (int($11) > 20) high_iowait++
      }
    }
    END {
      if (idle_count > 0) {
        # Average iowait over the records that actually carry an iowait value.
        avg_iowait = (iowait_count > 0) ? sum_iowait / iowait_count : 0
        print "CPU_STATS", "avg_idle=" (sum_idle / idle_count), "min_idle=" min_idle, "critical=" critical, "warning=" warning, "avg_iowait=" avg_iowait, "high_iowait=" high_iowait
      }
    }
  ' "$1"
}

_lw_analyze_cpu "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/cpu_analysis.txt"

# NEW: Load trend direction analysis
print_substatus "Analyzing load trend direction..."

#######################################
# Classify every snapshot by the ordering of its 1/5/15-minute load averages
# (fields 7, 8, 9) and report the dominant direction.
#   rising : 1min > 5min > 15min      falling: 1min < 5min < 15min
#   stable : anything else
# The overall direction is RISING or FALLING only when that class outnumbers
# both others; otherwise STABLE.
# Arguments: $1 - metrics file
# Outputs:   one line "LOAD_TREND direction=.. rising=.. falling=..
#            stable=.. last_1min=.. last_5min=.. last_15min=.." (always
#            printed; the last_* values come from the final complete record).
#######################################
_lw_analyze_load_trend() {
  awk -F'|' '
    # Counters start at zero so the report shows 0 rather than an empty value.
    BEGIN { rising = 0; falling = 0; stable = 0 }
    $7 != "" && $8 != "" && $9 != "" {
      load_1min = $7
      load_5min = $8
      load_15min = $9

      # Determine trend direction
      if (load_1min > load_5min && load_5min > load_15min) {
        rising++
      } else if (load_1min < load_5min && load_5min < load_15min) {
        falling++
      } else {
        stable++
      }

      last_1min = load_1min
      last_5min = load_5min
      last_15min = load_15min
    }
    END {
      if (rising > falling && rising > stable) {
        trend = "RISING"
      } else if (falling > rising && falling > stable) {
        trend = "FALLING"
      } else {
        trend = "STABLE"
      }
      print "LOAD_TREND", "direction=" trend, "rising=" rising, "falling=" falling, "stable=" stable, "last_1min=" last_1min, "last_5min=" last_5min, "last_15min=" last_15min
    }
  ' "$1"
}

_lw_analyze_load_trend "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/load_trend.txt"

# NEW: Memory growth velocity analysis
print_substatus "Analyzing memory growth velocity..."

#######################################
# Estimate how fast available memory (field 2) changes between the first and
# the last usable record, using the record number as the clock.
# Arguments: $1 - metrics file
#            $2 - snapshots per hour (optional, default 20)
# Outputs:   one line "MEMORY_VELOCITY first=.. last=.. delta=..
#            rate_per_hour=.. hours_to_oom=.." when at least two usable
#            records exist on different lines. hours_to_oom is last divided by
#            the hourly loss when memory is shrinking, otherwise 0.
#######################################
_lw_analyze_memory_velocity() {
  awk -F'|' -v per_hour="${2:-20}" '
    $2 != "" && $2 > 0 {
      if (first_avail == 0) {
        first_avail = $2
        first_time = NR
      }
      last_avail = $2
      last_time = NR
      count++
    }
    END {
      if (count > 1) {
        delta = last_avail - first_avail
        snapshots = last_time - first_time
        if (snapshots > 0) {
          rate_per_hour = (delta / snapshots) * per_hour

          # Predict time to OOM if declining
          if (rate_per_hour < 0 && last_avail > 0) {
            hours_to_oom = last_avail / (-1 * rate_per_hour)
          } else {
            hours_to_oom = 0
          }

          print "MEMORY_VELOCITY", "first=" first_avail, "last=" last_avail, "delta=" delta, "rate_per_hour=" rate_per_hour, "hours_to_oom=" hours_to_oom
        }
      }
    }
  ' "$1"
}

_lw_analyze_memory_velocity "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/memory_velocity.txt"

# NEW: R-state analysis (CPU pressure)
print_substatus "Analyzing R-state process pressure..."

#######################################
# Summarise the per-snapshot runnable-process counts ("timestamp count"
# lines). Snapshots with a count of zero are left out of the statistics.
# Arguments: $1 - R-state count file
# Outputs:   one line "RSTATE_ANALYSIS avg=.. max=.. high_count=.." (high:
#            count above 4); nothing when no snapshot has a positive count.
#######################################
_lw_analyze_rstate() {
  awk '
    # Counters start at zero so the report shows 0 rather than an empty value.
    BEGIN { max = 0; high = 0 }
    $2 != "" && $2 > 0 {
      sum += $2
      count++
      if ($2 > max) max = $2
      # More than 2x typical 2-core system.
      # NOTE(review): the report section labels this bucket "High (> 5)"
      # while the threshold applied here is 4 - confirm which is intended.
      if ($2 > 4) high++
    }
    END {
      if (count > 0) {
        print "RSTATE_ANALYSIS", "avg=" (sum / count), "max=" max, "high_count=" high
      }
    }
  ' "$1"
}

_lw_analyze_rstate "$TEMP_DIR/rstate_count.txt" > "$TEMP_DIR/rstate_analysis.txt"

# NEW: Network state analysis
print_substatus "Analyzing network connection states..."

#######################################
# Aggregate the per-snapshot connection counts ("timestamp state count"
# lines) by state.
# Arguments: $1 - network states file
# Outputs:   one line per state, "state average total", where the average is
#            taken over the snapshots in which the state appeared; sorted by
#            total, largest first.
#######################################
_lw_summarize_network_states() {
  awk '
    {
      state_totals[$2] += $3
      state_occurrences[$2]++
    }
    END {
      for (state in state_totals) {
        print state, state_totals[state] / state_occurrences[state], state_totals[state]
      }
    }
  ' "$1" | sort -k3 -rn
}

_lw_summarize_network_states "$TEMP_DIR/network_states.txt" > "$TEMP_DIR/network_state_summary.txt"

print_success "Metric analysis complete"

#############################################################################
# PHASE 3: Identify top resource consumers
#############################################################################

print_status "Phase 3/4: Identifying resource consumers..."

#######################################
# Average one numeric column per command across all snapshots and keep the
# ten highest averages. Input lines are "timestamp pid command cpu mem";
# rows whose value in the chosen column is not above zero are ignored.
# Arguments: $1 - process sample file
#            $2 - column to average (4 = CPU, 5 = memory)
# Outputs:   up to ten lines "average command occurrences", highest first.
#######################################
_lw_top_consumers() {
  awk -v col="$2" '
    {
      cmd = $3
      value = $col
      if (value > 0) {
        total[cmd] += value
        seen[cmd]++
      }
    }
    END {
      for (cmd in total) {
        avg = total[cmd] / seen[cmd]
        print avg, cmd, seen[cmd]
      }
    }
  ' "$1" | sort -rn | head -10
}

# Top CPU processes (aggregate across all snapshots)
print_substatus "Analyzing top CPU consumers..."
_lw_top_consumers "$TEMP_DIR/top_processes.txt" 4 > "$TEMP_DIR/top_cpu_consumers.txt"

# Top memory consumers (aggregate across all snapshots)
print_substatus "Analyzing top memory consumers..."
_lw_top_consumers "$TEMP_DIR/top_mem_processes.txt" 5 > "$TEMP_DIR/top_mem_consumers.txt"

print_success "Resource analysis complete"

#############################################################################
|
|
# PHASE 4: Generate report
|
|
#############################################################################
|
|
|
|
print_status "Phase 4/4: Generating report..."
|
|
|
|
{
|
|
echo "================================================================================"
|
|
echo "LOADWATCH SYSTEM HEALTH ANALYSIS REPORT"
|
|
echo "================================================================================"
|
|
echo ""
|
|
echo "Report Generated: $(date '+%Y-%m-%d %H:%M:%S')"
|
|
echo "Analysis Period: $RANGE_DESC"
|
|
echo "Snapshots Analyzed: $ANALYZED_LOGS (@ 3-minute intervals)"
|
|
echo ""
|
|
|
|
echo "================================================================================"
|
|
echo "CRITICAL ALERTS SUMMARY"
|
|
echo "================================================================================"
|
|
echo ""
|
|
|
|
ALERT_COUNT=$(wc -l < "$TEMP_DIR/alerts.txt")
|
|
if [ "$ALERT_COUNT" -eq 0 ]; then
|
|
echo "✓ No critical alerts detected"
|
|
else
|
|
echo "⚠ Total Alerts: $ALERT_COUNT"
|
|
echo ""
|
|
|
|
# Swap alerts
|
|
SWAP_ALERTS=$(grep "SWAP_THRESHOLD_EXCEEDED" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$SWAP_ALERTS" -gt 0 ]; then
|
|
echo "SWAP THRESHOLD EXCEEDED: $SWAP_ALERTS occurrences"
|
|
echo " First: $(grep "SWAP_THRESHOLD_EXCEEDED" "$TEMP_DIR/alerts.txt" | head -1 | awk '{print $1}')"
|
|
echo " Last: $(grep "SWAP_THRESHOLD_EXCEEDED" "$TEMP_DIR/alerts.txt" | tail -1 | awk '{print $1}')"
|
|
echo ""
|
|
fi
|
|
|
|
# CPU saturation
|
|
CPU_ALERTS=$(grep "CPU_SATURATED" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$CPU_ALERTS" -gt 0 ]; then
|
|
echo "CPU SATURATION: $CPU_ALERTS occurrences"
|
|
grep "CPU_SATURATED" "$TEMP_DIR/alerts.txt" | head -5 | while read line; do
|
|
echo " - $line"
|
|
done
|
|
[ "$CPU_ALERTS" -gt 5 ] && echo " ... and $((CPU_ALERTS - 5)) more"
|
|
echo ""
|
|
fi
|
|
|
|
# I/O wait
|
|
IO_ALERTS=$(grep "HIGH_IOWAIT" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$IO_ALERTS" -gt 0 ]; then
|
|
echo "HIGH I/O WAIT: $IO_ALERTS occurrences"
|
|
grep "HIGH_IOWAIT" "$TEMP_DIR/alerts.txt" | head -5 | while read line; do
|
|
echo " - $line"
|
|
done
|
|
[ "$IO_ALERTS" -gt 5 ] && echo " ... and $((IO_ALERTS - 5)) more"
|
|
echo ""
|
|
fi
|
|
|
|
# Zombie processes
|
|
ZOMBIE_ALERTS=$(grep "ZOMBIE_PROCESSES" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$ZOMBIE_ALERTS" -gt 0 ]; then
|
|
echo "ZOMBIE PROCESSES: $ZOMBIE_ALERTS occurrences"
|
|
grep "ZOMBIE_PROCESSES" "$TEMP_DIR/alerts.txt" | head -5 | while read line; do
|
|
echo " - $line"
|
|
done
|
|
[ "$ZOMBIE_ALERTS" -gt 5 ] && echo " ... and $((ZOMBIE_ALERTS - 5)) more"
|
|
echo ""
|
|
fi
|
|
|
|
# Memory thrashing (kswapd0)
|
|
THRASH_ALERTS=$(grep "MEMORY_THRASHING" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$THRASH_ALERTS" -gt 0 ]; then
|
|
echo "⚠️ MEMORY THRASHING (kswapd0 active): $THRASH_ALERTS occurrences"
|
|
grep "MEMORY_THRASHING" "$TEMP_DIR/alerts.txt" | head -5 | while read line; do
|
|
echo " - $line"
|
|
done
|
|
[ "$THRASH_ALERTS" -gt 5 ] && echo " ... and $((THRASH_ALERTS - 5)) more"
|
|
echo ""
|
|
fi
|
|
|
|
# I/O blocked processes
|
|
BLOCKED_ALERTS=$(grep "IO_BLOCKED_PROCESSES" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$BLOCKED_ALERTS" -gt 0 ]; then
|
|
echo "I/O BLOCKED PROCESSES (D-state): $BLOCKED_ALERTS occurrences"
|
|
grep "IO_BLOCKED_PROCESSES" "$TEMP_DIR/alerts.txt" | head -5 | while read line; do
|
|
echo " - $line"
|
|
done
|
|
[ "$BLOCKED_ALERTS" -gt 5 ] && echo " ... and $((BLOCKED_ALERTS - 5)) more"
|
|
echo ""
|
|
fi
|
|
|
|
# CPU steal time (VM contention)
|
|
STEAL_ALERTS=$(grep "HIGH_CPU_STEAL" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$STEAL_ALERTS" -gt 0 ]; then
|
|
echo "HIGH CPU STEAL TIME (VM resource contention): $STEAL_ALERTS occurrences"
|
|
grep "HIGH_CPU_STEAL" "$TEMP_DIR/alerts.txt" | head -5 | while read line; do
|
|
echo " - $line"
|
|
done
|
|
[ "$STEAL_ALERTS" -gt 5 ] && echo " ... and $((STEAL_ALERTS - 5)) more"
|
|
echo ""
|
|
fi
|
|
|
|
# MySQL thread anomaly (connection storm)
|
|
THREAD_ALERTS=$(grep "MYSQL_THREAD_ANOMALY" "$TEMP_DIR/alerts.txt" | wc -l)
|
|
if [ "$THREAD_ALERTS" -gt 0 ]; then
|
|
echo "⚠️ MYSQL THREAD ANOMALY (Connection Storm): $THREAD_ALERTS occurrences"
|
|
grep "MYSQL_THREAD_ANOMALY" "$TEMP_DIR/alerts.txt" | head -5 | while read line; do
|
|
echo " - $line"
|
|
done
|
|
[ "$THREAD_ALERTS" -gt 5 ] && echo " ... and $((THREAD_ALERTS - 5)) more"
|
|
echo ""
|
|
fi
|
|
fi
|
|
|
|
echo "================================================================================"
|
|
echo "MEMORY ANALYSIS"
|
|
echo "================================================================================"
|
|
echo ""
|
|
|
|
if [ -f "$TEMP_DIR/memory_analysis.txt" ]; then
|
|
read -r _ avg min max critical warning < "$TEMP_DIR/memory_analysis.txt"
|
|
AVG_MEM=$(echo "$avg" | cut -d= -f2 | cut -d. -f1)
|
|
MIN_MEM=$(echo "$min" | cut -d= -f2 | cut -d. -f1)
|
|
MAX_MEM=$(echo "$max" | cut -d= -f2 | cut -d. -f1)
|
|
CRIT_COUNT=$(echo "$critical" | cut -d= -f2)
|
|
WARN_COUNT=$(echo "$warning" | cut -d= -f2)
|
|
|
|
echo "Available Memory:"
|
|
echo " Average: ${AVG_MEM} MiB"
|
|
echo " Minimum: ${MIN_MEM} MiB"
|
|
echo " Maximum: ${MAX_MEM} MiB"
|
|
echo ""
|
|
echo "Memory Pressure Events:"
|
|
echo " Critical (< 200 MiB): $CRIT_COUNT snapshots"
|
|
echo " Warning (< 500 MiB): $WARN_COUNT snapshots"
|
|
echo ""
|
|
fi
|
|
|
|
if [ -f "$TEMP_DIR/swap_analysis.txt" ]; then
|
|
read -r _ avg_used max_pct high_count < "$TEMP_DIR/swap_analysis.txt"
|
|
AVG_SWAP=$(echo "$avg_used" | cut -d= -f2 | cut -d. -f1)
|
|
MAX_SWAP_PCT=$(echo "$max_pct" | cut -d= -f2 | cut -d. -f1)
|
|
HIGH_SWAP=$(echo "$high_count" | cut -d= -f2)
|
|
|
|
echo "Swap Usage:"
|
|
echo " Average Used: ${AVG_SWAP} MiB"
|
|
echo " Maximum %: ${MAX_SWAP_PCT}%"
|
|
echo " High (>= 50%): $HIGH_SWAP snapshots"
|
|
echo ""
|
|
fi
|
|
|
|
# Memory growth velocity
|
|
if [ -f "$TEMP_DIR/memory_velocity.txt" ]; then
|
|
read -r _ first_line < "$TEMP_DIR/memory_velocity.txt"
|
|
FIRST_AVAIL=$(echo "$first_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^first=/) print $i}' | cut -d= -f2)
|
|
LAST_AVAIL=$(echo "$first_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^last=/) print $i}' | cut -d= -f2)
|
|
DELTA=$(echo "$first_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^delta=/) print $i}' | cut -d= -f2)
|
|
RATE=$(echo "$first_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^rate_per_hour=/) print $i}' | cut -d= -f2)
|
|
HOURS_TO_OOM=$(echo "$first_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^hours_to_oom=/) print $i}' | cut -d= -f2)
|
|
|
|
echo "Memory Growth Velocity:"
|
|
echo " First Available: ${FIRST_AVAIL} MiB"
|
|
echo " Last Available: ${LAST_AVAIL} MiB"
|
|
echo " Delta: ${DELTA} MiB"
|
|
echo " Rate: ${RATE} MiB/hour"
|
|
|
|
# Check if memory is declining
|
|
RATE_INT=$(echo "$RATE" | cut -d. -f1 | sed 's/^-//')
|
|
if echo "$RATE" | grep -q "^-"; then
|
|
HOURS_INT=$(echo "$HOURS_TO_OOM" | cut -d. -f1)
|
|
if [ "$HOURS_INT" -gt 0 ] 2>/dev/null; then
|
|
echo " ⚠️ WARNING: Memory declining - OOM predicted in ${HOURS_INT} hours"
|
|
fi
|
|
else
|
|
echo " ✓ Memory stable or increasing"
|
|
fi
|
|
echo ""
|
|
fi
|
|
|
|
echo "================================================================================"
|
|
echo "CPU & LOAD ANALYSIS"
|
|
echo "================================================================================"
|
|
echo ""
|
|
|
|
if [ -f "$TEMP_DIR/load_analysis.txt" ]; then
|
|
read -r _ avg_load max_load high_count < "$TEMP_DIR/load_analysis.txt"
|
|
AVG_LOAD=$(echo "$avg_load" | cut -d= -f2)
|
|
MAX_LOAD=$(echo "$max_load" | cut -d= -f2)
|
|
HIGH_LOAD=$(echo "$high_count" | cut -d= -f2)
|
|
|
|
echo "Load Average (1-minute):"
|
|
echo " Average: $AVG_LOAD"
|
|
echo " Maximum: $MAX_LOAD"
|
|
echo " High (> 2.0): $HIGH_LOAD snapshots"
|
|
echo ""
|
|
fi
|
|
|
|
# Load trend direction
|
|
if [ -f "$TEMP_DIR/load_trend.txt" ]; then
|
|
read -r _ trend_line < "$TEMP_DIR/load_trend.txt"
|
|
TREND_DIR=$(echo "$trend_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^direction=/) print $i}' | cut -d= -f2)
|
|
RISING_COUNT=$(echo "$trend_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^rising=/) print $i}' | cut -d= -f2)
|
|
FALLING_COUNT=$(echo "$trend_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^falling=/) print $i}' | cut -d= -f2)
|
|
STABLE_COUNT=$(echo "$trend_line" | awk 'BEGIN {i=0} {for(i=1;i<=NF;i++) if($i ~ /^stable=/) print $i}' | cut -d= -f2)
|
|
|
|
echo "Load Trend Direction:"
|
|
case "$TREND_DIR" in
|
|
RISING)
|
|
echo " ⚠️ RISING - Problem getting worse (1min > 5min > 15min)"
|
|
;;
|
|
FALLING)
|
|
echo " ✓ FALLING - Problem resolving (1min < 5min < 15min)"
|
|
;;
|
|
STABLE)
|
|
echo " → STABLE - Load averages consistent"
|
|
;;
|
|
esac
|
|
echo " Rising snapshots: $RISING_COUNT"
|
|
echo " Falling snapshots: $FALLING_COUNT"
|
|
echo " Stable snapshots: $STABLE_COUNT"
|
|
echo ""
|
|
fi
|
|
|
|
if [ -f "$TEMP_DIR/cpu_analysis.txt" ]; then
|
|
read -r _ avg_idle min_idle critical warning avg_iowait high_iowait < "$TEMP_DIR/cpu_analysis.txt"
|
|
AVG_IDLE=$(echo "$avg_idle" | cut -d= -f2)
|
|
MIN_IDLE=$(echo "$min_idle" | cut -d= -f2)
|
|
CRIT_CPU=$(echo "$critical" | cut -d= -f2)
|
|
WARN_CPU=$(echo "$warning" | cut -d= -f2)
|
|
AVG_IOWAIT=$(echo "$avg_iowait" | cut -d= -f2)
|
|
HIGH_IOWAIT=$(echo "$high_iowait" | cut -d= -f2)
|
|
|
|
echo "CPU Idle Time:"
|
|
echo " Average: ${AVG_IDLE}%"
|
|
echo " Minimum: ${MIN_IDLE}%"
|
|
echo ""
|
|
echo "CPU Pressure Events:"
|
|
echo " Critical (< 10% idle): $CRIT_CPU snapshots"
|
|
echo " Warning (< 20% idle): $WARN_CPU snapshots"
|
|
echo ""
|
|
echo "I/O Wait:"
|
|
echo " Average: ${AVG_IOWAIT}%"
|
|
echo " High (> 20%): $HIGH_IOWAIT snapshots"
|
|
echo ""
|
|
fi
|
|
|
|
echo "================================================================================"
|
|
echo "TOP CPU CONSUMERS (Averaged Across Period)"
|
|
echo "================================================================================"
|
|
echo ""
|
|
|
|
if [ -f "$TEMP_DIR/top_cpu_consumers.txt" ] && [ -s "$TEMP_DIR/top_cpu_consumers.txt" ]; then
|
|
printf "%-10s %-50s %s\n" "AVG CPU%" "PROCESS" "OCCURRENCES"
|
|
printf "%-10s %-50s %s\n" "--------" "------------------------------------------------" "-----------"
|
|
while read avg_cpu cmd occurrences; do
|
|
printf "%-10.1f %-50s %s\n" "$avg_cpu" "$cmd" "$occurrences"
|
|
done < "$TEMP_DIR/top_cpu_consumers.txt"
|
|
else
|
|
echo "No significant CPU consumers found"
|
|
fi
|
|
echo ""
|
|
|
|
echo "================================================================================"
|
|
echo "TOP MEMORY CONSUMERS (Averaged Across Period)"
|
|
echo "================================================================================"
|
|
echo ""
|
|
|
|
if [ -f "$TEMP_DIR/top_mem_consumers.txt" ] && [ -s "$TEMP_DIR/top_mem_consumers.txt" ]; then
|
|
printf "%-10s %-50s %s\n" "AVG MEM%" "PROCESS" "OCCURRENCES"
|
|
printf "%-10s %-50s %s\n" "--------" "------------------------------------------------" "-----------"
|
|
while read avg_mem cmd occurrences; do
|
|
printf "%-10.1f %-50s %s\n" "$avg_mem" "$cmd" "$occurrences"
|
|
done < "$TEMP_DIR/top_mem_consumers.txt"
|
|
else
|
|
echo "No significant memory consumers found"
|
|
fi
|
|
echo ""
|
|
|
|
# ---------------------------------------------------------------------------
# Report section: CPU pressure derived from runnable (R-state) process counts.
#
# Reads the first line of $TEMP_DIR/rstate_analysis.txt, which is parsed as
# four whitespace-separated fields; the first is ignored and the remaining
# three are key=value pairs (average, maximum, count of high snapshots).
# A missing or empty file is reported as "no data" rather than as a healthy
# system, and a maximum that is not a number is never rated as "normal".
# ---------------------------------------------------------------------------
echo "================================================================================"
echo "CPU PRESSURE ANALYSIS (R-state Processes)"
echo "================================================================================"
echo ""

# R-state analysis
if [ -f "$TEMP_DIR/rstate_analysis.txt" ] && [ -s "$TEMP_DIR/rstate_analysis.txt" ]; then
    read -r _ avg_rstate max_rstate high_count < "$TEMP_DIR/rstate_analysis.txt"

    # Take the text between the first and second '=' (same result as
    # `cut -d= -f2`, without forking); the average is additionally truncated
    # to its integer part.
    AVG_RSTATE=${avg_rstate#*=}
    AVG_RSTATE=${AVG_RSTATE%%=*}
    AVG_RSTATE=${AVG_RSTATE%%.*}
    MAX_RSTATE=${max_rstate#*=}
    MAX_RSTATE=${MAX_RSTATE%%=*}
    HIGH_RSTATE=${high_count#*=}
    HIGH_RSTATE=${HIGH_RSTATE%%=*}

    echo "Runnable (R-state) Processes:"
    echo " Average: $AVG_RSTATE processes"
    echo " Maximum: $MAX_RSTATE processes"
    echo " High (> 5): $HIGH_RSTATE snapshots"
    echo ""
    echo "R-state Count Analysis:"
    echo " - R-state = processes waiting for CPU (runnable but not running)"
    echo " - Better CPU pressure metric than load average alone"
    echo " - R-state count > CPU cores = CPU contention"

    # Integer part of the maximum drives the rating below.
    MAX_RSTATE_INT=${MAX_RSTATE%%.*}
    if ! [[ "$MAX_RSTATE_INT" =~ ^[0-9]+$ ]]; then
        # Without a numeric maximum no rating can be given; do not claim "normal".
        echo " ⚠️ Unable to rate CPU pressure - maximum R-state count is not numeric"
    elif [ "$MAX_RSTATE_INT" -gt 10 ]; then
        echo " ⚠️ WARNING: High R-state count indicates severe CPU pressure"
    elif [ "$MAX_RSTATE_INT" -gt 5 ]; then
        echo " ⚠️ CAUTION: Moderate CPU contention detected"
    else
        echo " ✓ CPU pressure within normal range"
    fi
    echo ""
else
    echo "No R-state data available"
    echo ""
fi
|
|
|
|
# ---------------------------------------------------------------------------
# Report section: network connection states.
#
# Reads $TEMP_DIR/network_state_summary.txt, one row per connection state in
# the form "<state> <avg> <total>", prints the integer part of both numbers
# in a table and adds a warning under selected states whose TOTAL exceeds a
# fixed threshold.
# NOTE(review): the thresholds are compared against the TOTAL column; if that
# column is a sum over all snapshots, long time ranges will trip them easily -
# confirm whether AVG was intended.
# ---------------------------------------------------------------------------
echo "================================================================================"
echo "NETWORK CONNECTION ANALYSIS"
echo "================================================================================"
echo ""

# Network state breakdown
if [ -f "$TEMP_DIR/network_state_summary.txt" ] && [ -s "$TEMP_DIR/network_state_summary.txt" ]; then
    echo "Connection States (Aggregated):"
    echo ""
    printf "%-20s %15s %15s\n" "STATE" "AVG" "TOTAL"
    printf "%-20s %15s %15s\n" "--------------------" "---------------" "---------------"
    # -r: take the row literally; the trailing test keeps a last row that has
    # no terminating newline.
    while read -r state avg total || [ -n "$state" ]; do
        # Blank rows carry no state - skip them.
        [ -n "$state" ] || continue

        # Integer part only (text before the first '.'), no subshells needed.
        AVG_INT=${avg%%.*}
        TOTAL_INT=${total%%.*}
        printf "%-20s %15s %15s\n" "$state" "$AVG_INT" "$TOTAL_INT"

        # Add warnings for specific states
        case "$state" in
            SYN_RECV)
                warn_limit=100
                warn_text=" ⚠️ Possible SYN flood attack"
                ;;
            CLOSE_WAIT)
                warn_limit=100
                warn_text=" ⚠️ Connection leak - application not closing connections"
                ;;
            TIME_WAIT)
                warn_limit=1000
                warn_text=" ⚠️ Excessive TIME_WAIT - may need tuning"
                ;;
            *)
                warn_limit=""
                warn_text=""
                ;;
        esac
        # stderr is silenced on purpose: a non-numeric TOTAL simply means
        # "no warning" rather than an error message inside the report.
        if [ -n "$warn_limit" ] && [ "$TOTAL_INT" -gt "$warn_limit" ] 2>/dev/null; then
            echo "$warn_text"
        fi
    done < "$TEMP_DIR/network_state_summary.txt"
    echo ""
else
    echo "No network connection data available"
    echo ""
fi
|
|
|
|
# ---------------------------------------------------------------------------
# Report section: recommendations.
#
# Turns the counters gathered earlier in the script (CRIT_COUNT, AVG_MEM,
# HIGH_SWAP, CRIT_CPU, HIGH_IOWAIT, ZOMBIE_ALERTS, THRASH_ALERTS,
# BLOCKED_ALERTS, STEAL_ALERTS, ANALYZED_LOGS) into advice paragraphs.
# RECOMMENDATIONS counts the paragraphs printed; when it stays 0 a "healthy"
# summary is printed instead.
# ---------------------------------------------------------------------------
echo "================================================================================"
echo "RECOMMENDATIONS"
echo "================================================================================"
echo ""

# Generate recommendations based on findings
RECOMMENDATIONS=0

# Normalise every counter to a plain non-negative integer before it is used
# in a numeric test. Values such as "512.4", " 12" or a two-line "0<newline>0"
# would otherwise make `[ ... -gt ... ]` fail with "integer expression
# expected" and silently skip the check. Empty/unusable values fall back to 0,
# except AVG_MEM which falls back to 1000 so that missing data does not trip
# the "< 300" low-memory check.
for metric_name in CRIT_COUNT AVG_MEM HIGH_SWAP CRIT_CPU HIGH_IOWAIT \
    ZOMBIE_ALERTS THRASH_ALERTS BLOCKED_ALERTS STEAL_ALERTS ANALYZED_LOGS; do
    if [ "$metric_name" = "AVG_MEM" ]; then
        metric_default=1000
    else
        metric_default=0
    fi
    metric_value=${!metric_name:-}
    # Trim leading whitespace, then keep only the leading run of digits.
    metric_value=${metric_value#"${metric_value%%[![:space:]]*}"}
    metric_value=${metric_value%%[!0-9]*}
    printf -v "$metric_name" '%s' "${metric_value:-$metric_default}"
done

# CRITICAL: Memory thrashing
if [ "$THRASH_ALERTS" -gt 0 ]; then
    echo "🔴 CRITICAL - MEMORY THRASHING DETECTED"
    echo " - kswapd0 (kernel swap daemon) was consuming CPU in $THRASH_ALERTS snapshots"
    echo " - This is THE definitive indicator of severe memory pressure"
    echo " - System is thrashing - constantly swapping pages in/out of memory"
    echo " - IMMEDIATE ACTION REQUIRED:"
    echo " 1. Add more RAM to the server (most effective solution)"
    echo " 2. Kill/restart memory-intensive processes"
    echo " 3. Review top memory consumers above"
    echo " 4. Check for memory leaks in applications"
    echo " - Performance is severely degraded during thrashing"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

# CRITICAL: I/O blocking
if [ "$BLOCKED_ALERTS" -gt 5 ]; then
    echo "🔴 CRITICAL - I/O BLOCKING DETECTED"
    echo " - Processes stuck in D-state (uninterruptible sleep) in $BLOCKED_ALERTS snapshots"
    echo " - Processes are blocked waiting for I/O operations to complete"
    echo " - Indicates severe disk performance issues or hardware problems"
    echo " - IMMEDIATE ACTION REQUIRED:"
    echo " 1. Check disk health: smartctl -a /dev/sda"
    echo " 2. Check I/O performance: iostat -x 1 5"
    echo " 3. Look for failing drives in dmesg: dmesg | grep -i error"
    echo " 4. Consider upgrading to SSD storage"
    echo " 5. Check for network storage timeouts (NFS/iSCSI)"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

# CRITICAL: CPU steal time (VM resource contention)
if [ "$STEAL_ALERTS" -gt 10 ]; then
    echo "🔴 CRITICAL - VM RESOURCE CONTENTION"
    echo " - High CPU steal time detected in $STEAL_ALERTS snapshots"
    echo " - Hypervisor is stealing CPU cycles from this VM"
    echo " - Physical host is overcommitted or experiencing contention"
    echo " - ACTIONS REQUIRED:"
    echo " 1. Contact hosting provider about resource contention"
    echo " 2. Request move to less crowded physical host"
    echo " 3. Upgrade to dedicated/guaranteed CPU resources"
    echo " 4. Consider upgrading VM plan for better resource allocation"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

# Memory pressure: triggered by either signal; only the signal(s) that
# actually fired are quoted, so a healthy or defaulted average is never
# described as "very low".
if [ "$CRIT_COUNT" -gt 0 ] || [ "$AVG_MEM" -lt 300 ]; then
    echo "⚠ MEMORY: Critical memory pressure detected"
    if [ "$AVG_MEM" -lt 300 ]; then
        echo " - Average available memory is very low (${AVG_MEM} MiB)"
    fi
    if [ "$CRIT_COUNT" -gt 0 ]; then
        # NOTE(review): wording assumes CRIT_COUNT counts snapshots, like the
        # other counters in this section - confirm against where it is set.
        echo " - Critical memory pressure was flagged in $CRIT_COUNT snapshots"
    fi
    echo " - Consider adding more RAM to the server"
    echo " - Review top memory consumers and optimize/limit resource usage"
    echo " - Check for memory leaks in long-running processes"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

# Swap: flagged when HIGH_SWAP exceeds half of the analysed logs.
# 10# forces base 10 so a value with leading zeros is not read as octal.
if [ "$HIGH_SWAP" -gt "$((10#$ANALYZED_LOGS / 2))" ]; then
    echo "⚠ SWAP: Excessive swap usage detected"
    echo " - Swap usage exceeded 50% in ${HIGH_SWAP} of ${ANALYZED_LOGS} snapshots"
    echo " - This indicates insufficient RAM for current workload"
    echo " - Add more physical RAM or reduce memory-intensive processes"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

if [ "$CRIT_CPU" -gt 10 ]; then
    echo "⚠ CPU: CPU saturation detected"
    echo " - CPU was saturated (< 10% idle) in $CRIT_CPU snapshots"
    echo " - Review top CPU consumers above"
    echo " - Consider upgrading to more CPU cores"
    echo " - Optimize or throttle CPU-intensive processes"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

if [ "$HIGH_IOWAIT" -gt 10 ]; then
    echo "⚠ I/O: High disk I/O wait detected"
    echo " - I/O wait exceeded 20% in $HIGH_IOWAIT snapshots"
    echo " - Indicates disk bottleneck or slow storage"
    echo " - Consider upgrading to SSD storage"
    echo " - Check for disk-intensive processes or failing drives"
    echo " - Run: iostat -x 1 5 to identify slow devices"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

if [ "$ZOMBIE_ALERTS" -gt 0 ]; then
    echo "⚠ PROCESSES: Zombie processes detected"
    echo " - Zombie processes found in $ZOMBIE_ALERTS snapshots"
    echo " - Indicates parent processes not cleaning up children properly"
    echo " - Review process management in affected applications"
    echo " - May require application restart or code fixes"
    echo ""
    RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
fi

# Nothing fired: print the all-clear summary.
if [ "$RECOMMENDATIONS" -eq 0 ]; then
    echo "✓ System appears healthy during analyzed period"
    echo " - No critical issues detected"
    echo " - Memory, CPU, and I/O metrics within normal ranges"
    echo " - Continue regular monitoring"
    echo ""
fi
|
|
|
|
# Closing banner of the report: title between two rules, followed by the
# path stored in OUTPUT_FILE, each followed by a blank line.
printf '%s\n' \
    "================================================================================" \
    "ANALYSIS COMPLETE" \
    "================================================================================" \
    ""
printf '%s\n' "Report saved to: $OUTPUT_FILE" ""
|
|
|
|
} | tee "$OUTPUT_FILE"
|
|
|
|
# Status messages via the helpers used by the original code
# (print_success / print_info are not defined in this part of the file).
print_success "Report generated successfully"
print_info "Report location: $OUTPUT_FILE"

# Footer: a coloured rule above and below a ready-to-copy `cat` command for
# the report, padded by one blank line on each side. CYAN, BOLD and NC are
# expanded as-is; echo -e interprets any escape sequences they contain.
echo
for _lw_footer_line in \
    "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" \
    "${BOLD}View full report:${NC} cat $OUTPUT_FILE" \
    "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"; do
    echo -e "$_lw_footer_line"
done
unset _lw_footer_line
echo
|