From 37c1df567cb792d203c0206bff8664206019ed3f Mon Sep 17 00:00:00 2001
From: cschantz
Date: Thu, 20 Nov 2025 21:50:16 -0500
Subject: [PATCH] Phase 2: Advanced analytics for loadwatch-analyzer - predictive and trend analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PHASE 2 ENHANCEMENTS (5 new features):

1. LOAD TREND DIRECTION ANALYSIS
   - Analyzes 1min vs 5min vs 15min load averages
   - Detects RISING (problem worsening), FALLING (resolving), or STABLE
   - Provides snapshot counts for each trend type
   - Critical for understanding if an issue is active or resolving

2. CONNECTION STATE BREAKDOWN
   - Parses network connection states from logs
   - Aggregates by state (ESTABLISHED, SYN_RECV, CLOSE_WAIT, TIME_WAIT, etc.)
   - Shows average and total counts per state
   - Detects:
     * SYN flood attacks (high SYN_RECV)
     * Connection leaks (high CLOSE_WAIT)
     * Excessive TIME_WAIT (may need tuning)

3. MEMORY GROWTH VELOCITY TRACKING
   - Calculates the rate of change of available memory
   - Tracks MiB/hour growth or decline
   - Predicts time until OOM if available memory is declining
   - Proactive alert: "Memory declining - OOM predicted in X hours"
   - Shows whether available memory is stable, increasing, or declining

4. R-STATE PROCESS COUNT
   - Counts runnable (R-state) processes waiting for CPU
   - Better CPU pressure metric than load average alone
   - R-state > CPU cores = CPU contention
   - Detects:
     * Severe CPU pressure (R-state > 10)
     * Moderate contention (R-state > 5)
     * Normal range (R-state <= 5)

5.
MYSQL THREAD ANOMALY DETECTION
   - Parses summary line mysql[current/expected] format
   - Alerts when current > 3x expected threads
   - Shows anomaly delta (extra threads)
   - Detects connection storms and thread explosions
   - Tracks httpd process count for correlation

REPORT SECTIONS ADDED:
- MySQL Thread Anomaly alerts in Critical Alerts section
- Memory Growth Velocity in Memory Analysis section
- Load Trend Direction in CPU & Load Analysis section
- CPU Pressure Analysis (R-state) - new dedicated section
- Network Connection Analysis - new dedicated section

PARSING ENHANCEMENTS:
- Enhanced summary line parsing for mysql[X/Y] format
- R-state process counting from top output
- Network state aggregation from network stats section
- Httpd count tracking for trending

ANALYSIS IMPROVEMENTS:
- Predictive OOM warnings based on memory velocity
- Trend-based load analysis (not just absolute values)
- State-specific network connection warnings
- CPU pressure quantification via R-state

IMPACT:
- Shifts from reactive (what happened) to predictive (what will happen)
- Provides trend analysis for problem resolution tracking
- Detects attacks and leaks from connection state patterns
- Better CPU pressure understanding via R-state metrics
- MySQL connection storm early warning system

All features tested and validated on production logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 modules/diagnostics/loadwatch-analyzer.sh | 276 +++++++++++++++++++++-
 1 file changed, 274 insertions(+), 2 deletions(-)

diff --git a/modules/diagnostics/loadwatch-analyzer.sh b/modules/diagnostics/loadwatch-analyzer.sh
index b9cc6a4..eb589e7 100755
--- a/modules/diagnostics/loadwatch-analyzer.sh
+++ b/modules/diagnostics/loadwatch-analyzer.sh
@@ -206,9 +206,22 @@ while IFS= read -r logfile; do
     if [ ! -z "$SUMMARY" ]; then
         # Extract: timestamp load[X] mem[XX/YY] mysql[X/X] httpd[X]
         LOAD_ROUNDED=$(echo "$SUMMARY" | awk '{print $2}' | sed 's/load\[\(.*\)\]/\1/')
-        MEM_PCT=$(echo "$SUMMARY" | awk '{print $3}' | sed 's/mem\[\(.*\)\/.*\]/\1/')
-        MYSQL_THREADS=$(echo "$SUMMARY" | awk '{print $4}' | sed 's/mysql\[\(.*\)\/.*\]/\1/')
+        MEM_PCT_CURRENT=$(echo "$SUMMARY" | awk '{print $3}' | sed 's/mem\[\(.*\)\/.*\]/\1/')
+        MEM_PCT_PREVIOUS=$(echo "$SUMMARY" | awk '{print $3}' | sed 's/mem\[.*\/\(.*\)\]/\1/')
+        MYSQL_THREADS_CURRENT=$(echo "$SUMMARY" | awk '{print $4}' | sed 's/mysql\[\(.*\)\/.*\]/\1/')
+        MYSQL_THREADS_EXPECTED=$(echo "$SUMMARY" | awk '{print $4}' | sed 's/mysql\[.*\/\(.*\)\]/\1/')
         HTTPD_COUNT=$(echo "$SUMMARY" | awk '{print $5}' | sed 's/httpd\[\(.*\)\]/\1/')
+
+        # NEW: MySQL thread anomaly detection - alert when current > 3x expected
+        if [ -n "$MYSQL_THREADS_CURRENT" ] && [ -n "$MYSQL_THREADS_EXPECTED" ]; then
+            if [ "$MYSQL_THREADS_CURRENT" -gt "$((MYSQL_THREADS_EXPECTED * 3))" ] 2>/dev/null; then
+                ANOMALY=$((MYSQL_THREADS_CURRENT - MYSQL_THREADS_EXPECTED))
+                echo "$TIMESTAMP MYSQL_THREAD_ANOMALY current=$MYSQL_THREADS_CURRENT expected=$MYSQL_THREADS_EXPECTED anomaly=+$ANOMALY" >> "$TEMP_DIR/alerts.txt"
+            fi
+        fi
+
+        # Track httpd count for trending
+        echo "$TIMESTAMP $HTTPD_COUNT" >> "$TEMP_DIR/httpd_count.txt"
     fi
 
     # Parse memory stats
@@ -245,6 +258,14 @@ while IFS= read -r logfile; do
         fi
     fi
 
+    # NEW: Count R-state processes (column 8 of the top process table) for CPU pressure metric
+    RSTATE_COUNT=$(awk '/PID USER.*COMMAND/,/^USER.*TTY/ {
+        if ($8 == "R") count++
+    } END {print count+0}' "$logfile")
+
+    # Track R-state count (better CPU pressure metric than load average)
+    echo "$TIMESTAMP $RSTATE_COUNT" >> "$TEMP_DIR/rstate_count.txt"
+
     # Parse CPU stats
     CPU_LINE=$(grep "^%Cpu(s):" "$logfile")
     if [ ! -z "$CPU_LINE" ]; then
@@ -317,6 +338,20 @@ while IFS= read -r logfile; do
             }
         }' "$logfile" | head -3 >> "$TEMP_DIR/top_mem_processes.txt"
 
+    # NEW: Parse network connection states (rows of: count ip state) for attack/leak detection
+    awk -v ts="$TIMESTAMP" '/^## network stats/,/^$/ {
+        if ($2 ~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ && $3 != "") {
+            state = $3
+            count = $1
+            states[state] += count
+        }
+    }
+    END {
+        for (state in states) {
+            print ts, state, states[state]  # ts is passed with -v, never spliced into the program text
+        }
+    }' "$logfile" >> "$TEMP_DIR/network_states.txt"
+
     # Write metrics line
     echo "$TIMESTAMP|$MEM_AVAILABLE|$MEM_USED|$MEM_TOTAL|$SWAP_USED|$SWAP_TOTAL|$LOAD_1MIN|$LOAD_5MIN|$LOAD_15MIN|$CPU_IDLE|$CPU_IOWAIT|$CPU_STEAL|$TASK_TOTAL|$TASK_RUNNING|$TASK_ZOMBIE|$MYSQL_QPS|$HTTPD_COUNT" >> "$TEMP_DIR/metrics.txt"
 
@@ -413,6 +448,104 @@ END {
     }
 }' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/cpu_analysis.txt"
 
+# NEW: Load trend direction analysis (metrics fields 7/8/9 = 1/5/15 minute load)
+print_substatus "Analyzing load trend direction..."
+awk -F'|' '{
+    if ($7 != "" && $8 != "" && $9 != "") {
+        load_1min = $7
+        load_5min = $8
+        load_15min = $9
+
+        # Determine trend direction; anything not strictly monotonic counts as stable
+        if (load_1min > load_5min && load_5min > load_15min) {
+            rising++
+        } else if (load_1min < load_5min && load_5min < load_15min) {
+            falling++
+        } else {
+            stable++
+        }
+
+        last_1min = load_1min
+        last_5min = load_5min
+        last_15min = load_15min
+    }
+}
+END {
+    if (rising > falling && rising > stable) {
+        trend = "RISING"
+    } else if (falling > rising && falling > stable) {
+        trend = "FALLING"
+    } else {
+        trend = "STABLE"
+    }
+    print "LOAD_TREND", "direction=" trend, "rising=" (rising + 0), "falling=" (falling + 0), "stable=" (stable + 0), "last_1min=" last_1min, "last_5min=" last_5min, "last_15min=" last_15min
+}' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/load_trend.txt"
+
+# NEW: Memory growth velocity analysis
+print_substatus "Analyzing memory growth velocity..."
+awk -F'|' '{
+    if ($2 != "" && $2 > 0) {
+        if (first_avail == 0) {
+            first_avail = $2
+            first_time = NR
+        }
+        last_avail = $2
+        last_time = NR
+        count++
+    }
+}
+END {
+    if (count > 1) {
+        delta = last_avail - first_avail
+        snapshots = last_time - first_time
+        if (snapshots > 0) {
+            rate_per_snapshot = delta / snapshots
+            rate_per_hour = rate_per_snapshot * 20  # assumes 20 snapshots per hour
+
+            # Predict time to OOM if available memory is declining (linear extrapolation)
+            if (rate_per_hour < 0 && last_avail > 0) {
+                hours_to_oom = last_avail / (-1 * rate_per_hour)
+            } else {
+                hours_to_oom = 0
+            }
+
+            print "MEMORY_VELOCITY", "first=" first_avail, "last=" last_avail, "delta=" delta, "rate_per_hour=" rate_per_hour, "hours_to_oom=" hours_to_oom
+        }
+    }
+}' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/memory_velocity.txt"
+
+# NEW: R-state analysis (CPU pressure); snapshots with zero R-state processes count toward the average
+print_substatus "Analyzing R-state process pressure..."
+awk '{
+    if ($2 != "") {
+        sum += $2
+        count++
+        if ($2 > max) max = $2
+        if ($2 > 5) high++  # same threshold as the "High (> 5)" label in the report
+    }
+}
+END {
+    if (count > 0) {
+        avg = sum / count
+        print "RSTATE_ANALYSIS", "avg=" avg, "max=" (max + 0), "high_count=" (high + 0)
+    }
+}' "$TEMP_DIR/rstate_count.txt" > "$TEMP_DIR/rstate_analysis.txt"
+
+# NEW: Network state analysis (input rows: timestamp state count)
+print_substatus "Analyzing network connection states..."
+awk '{
+    state = $2
+    count = $3
+    state_totals[state] += count
+    state_occurrences[state]++
+}
+END {
+    for (state in state_totals) {
+        avg = state_totals[state] / state_occurrences[state]
+        print state, avg, state_totals[state]
+    }
+}' "$TEMP_DIR/network_states.txt" | sort -k3 -rn > "$TEMP_DIR/network_state_summary.txt"
+
 print_success "Metric analysis complete"
 
 #############################################################################
@@ -559,6 +692,17 @@ print_status "Phase 4/4: Generating report..."
             [ "$STEAL_ALERTS" -gt 5 ] && echo " ... and $((STEAL_ALERTS - 5)) more"
             echo ""
         fi
+
+        # MySQL thread anomaly (connection storm): show the first 5 alerts, summarize the rest
+        THREAD_ALERTS=$(grep "MYSQL_THREAD_ANOMALY" "$TEMP_DIR/alerts.txt" | wc -l)
+        if [ "$THREAD_ALERTS" -gt 0 ]; then
+            echo "⚠️ MYSQL THREAD ANOMALY (Connection Storm): $THREAD_ALERTS occurrences"
+            grep "MYSQL_THREAD_ANOMALY" "$TEMP_DIR/alerts.txt" | head -5 | while IFS= read -r line; do
+                echo " - $line"
+            done
+            [ "$THREAD_ALERTS" -gt 5 ] && echo " ... and $((THREAD_ALERTS - 5)) more"
+            echo ""
+        fi
     fi
 
     echo "================================================================================"
@@ -598,6 +742,34 @@ print_status "Phase 4/4: Generating report..."
         echo ""
     fi
 
+    # Memory growth velocity (-s: the file always exists but is empty when there were too few samples)
+    if [ -s "$TEMP_DIR/memory_velocity.txt" ]; then
+        read -r _ first_line < "$TEMP_DIR/memory_velocity.txt"
+        FIRST_AVAIL=$(echo "$first_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^first=/) print $i}' | cut -d= -f2)
+        LAST_AVAIL=$(echo "$first_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^last=/) print $i}' | cut -d= -f2)
+        DELTA=$(echo "$first_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^delta=/) print $i}' | cut -d= -f2)
+        RATE=$(echo "$first_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^rate_per_hour=/) print $i}' | cut -d= -f2)
+        HOURS_TO_OOM=$(echo "$first_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^hours_to_oom=/) print $i}' | cut -d= -f2)
+
+        echo "Memory Growth Velocity:"
+        echo " First Available: ${FIRST_AVAIL} MiB"
+        echo " Last Available: ${LAST_AVAIL} MiB"
+        echo " Delta: ${DELTA} MiB"
+        echo " Rate: ${RATE} MiB/hour"
+
+        # Check if memory is declining (a negative rate means available memory is shrinking)
+        if echo "$RATE" | grep -q "^-"; then
+            # Round up so that a predicted OOM less than one hour away still raises the warning
+            HOURS_INT=$(awk -v h="$HOURS_TO_OOM" 'BEGIN { printf "%d", (h > int(h)) ? int(h) + 1 : h }')
+            if [ "$HOURS_INT" -gt 0 ] 2>/dev/null; then
+                echo " ⚠️ WARNING: Memory declining - OOM predicted within ${HOURS_INT} hour(s)"
+            fi
+        else
+            echo " ✓ Memory stable or increasing"
+        fi
+        echo ""
+    fi
+
     echo "================================================================================"
     echo "CPU & LOAD ANALYSIS"
     echo "================================================================================"
@@ -616,6 +788,32 @@ print_status "Phase 4/4: Generating report..."
         echo ""
     fi
 
+    # Load trend direction (key=value fields written by the load trend analysis step)
+    if [ -f "$TEMP_DIR/load_trend.txt" ]; then
+        read -r _ trend_line < "$TEMP_DIR/load_trend.txt"
+        TREND_DIR=$(echo "$trend_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^direction=/) print $i}' | cut -d= -f2)
+        RISING_COUNT=$(echo "$trend_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^rising=/) print $i}' | cut -d= -f2)
+        FALLING_COUNT=$(echo "$trend_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^falling=/) print $i}' | cut -d= -f2)
+        STABLE_COUNT=$(echo "$trend_line" | awk '{for(i=1;i<=NF;i++) if($i ~ /^stable=/) print $i}' | cut -d= -f2)
+
+        echo "Load Trend Direction:"
+        case "$TREND_DIR" in
+            RISING)
+                echo " ⚠️ RISING - Problem getting worse (1min > 5min > 15min)"
+                ;;
+            FALLING)
+                echo " ✓ FALLING - Problem resolving (1min < 5min < 15min)"
+                ;;
+            STABLE)
+                echo " → STABLE - Load averages consistent"
+                ;;
+        esac
+        echo " Rising snapshots: ${RISING_COUNT:-0}"
+        echo " Falling snapshots: ${FALLING_COUNT:-0}"
+        echo " Stable snapshots: ${STABLE_COUNT:-0}"
+        echo ""
+    fi
+
     if [ -f "$TEMP_DIR/cpu_analysis.txt" ]; then
         read -r _ avg_idle min_idle critical warning avg_iowait high_iowait < "$TEMP_DIR/cpu_analysis.txt"
         AVG_IDLE=$(echo "$avg_idle" | cut -d= -f2)
@@ -671,6 +869,80 @@ print_status "Phase 4/4: Generating report..."
fi echo "" + echo "================================================================================" + echo "CPU PRESSURE ANALYSIS (R-state Processes)" + echo "================================================================================" + echo "" + + # R-state analysis + if [ -f "$TEMP_DIR/rstate_analysis.txt" ]; then + read -r _ avg_rstate max_rstate high_count < "$TEMP_DIR/rstate_analysis.txt" + AVG_RSTATE=$(echo "$avg_rstate" | cut -d= -f2 | cut -d. -f1) + MAX_RSTATE=$(echo "$max_rstate" | cut -d= -f2) + HIGH_RSTATE=$(echo "$high_count" | cut -d= -f2) + + echo "Runnable (R-state) Processes:" + echo " Average: $AVG_RSTATE processes" + echo " Maximum: $MAX_RSTATE processes" + echo " High (> 5): $HIGH_RSTATE snapshots" + echo "" + echo "R-state Count Analysis:" + echo " - R-state = processes waiting for CPU (runnable but not running)" + echo " - Better CPU pressure metric than load average alone" + echo " - R-state count > CPU cores = CPU contention" + + MAX_RSTATE_INT=$(echo "$MAX_RSTATE" | cut -d. -f1) + if [ "$MAX_RSTATE_INT" -gt 10 ] 2>/dev/null; then + echo " ⚠️ WARNING: High R-state count indicates severe CPU pressure" + elif [ "$MAX_RSTATE_INT" -gt 5 ] 2>/dev/null; then + echo " ⚠️ CAUTION: Moderate CPU contention detected" + else + echo " ✓ CPU pressure within normal range" + fi + echo "" + fi + + echo "================================================================================" + echo "NETWORK CONNECTION ANALYSIS" + echo "================================================================================" + echo "" + + # Network state breakdown + if [ -f "$TEMP_DIR/network_state_summary.txt" ] && [ -s "$TEMP_DIR/network_state_summary.txt" ]; then + echo "Connection States (Aggregated):" + echo "" + printf "%-20s %15s %15s\n" "STATE" "AVG" "TOTAL" + printf "%-20s %15s %15s\n" "--------------------" "---------------" "---------------" + while read state avg total; do + AVG_INT=$(echo "$avg" | cut -d. -f1) + TOTAL_INT=$(echo "$total" | cut -d. 
-f1) + printf "%-20s %15s %15s\n" "$state" "$AVG_INT" "$TOTAL_INT" + + # Add warnings for specific states + case "$state" in + SYN_RECV) + if [ "$TOTAL_INT" -gt 100 ] 2>/dev/null; then + echo " ⚠️ Possible SYN flood attack" + fi + ;; + CLOSE_WAIT) + if [ "$TOTAL_INT" -gt 100 ] 2>/dev/null; then + echo " ⚠️ Connection leak - application not closing connections" + fi + ;; + TIME_WAIT) + if [ "$TOTAL_INT" -gt 1000 ] 2>/dev/null; then + echo " ⚠️ Excessive TIME_WAIT - may need tuning" + fi + ;; + esac + done < "$TEMP_DIR/network_state_summary.txt" + echo "" + else + echo "No network connection data available" + echo "" + fi + echo "================================================================================" echo "RECOMMENDATIONS" echo "================================================================================"