diff --git a/modules/diagnostics/loadwatch-analyzer.sh b/modules/diagnostics/loadwatch-analyzer.sh
index b9cc6a4..eb589e7 100755
--- a/modules/diagnostics/loadwatch-analyzer.sh
+++ b/modules/diagnostics/loadwatch-analyzer.sh
@@ -206,9 +206,31 @@ while IFS= read -r logfile; do
     if [ ! -z "$SUMMARY" ]; then
         # Extract: timestamp load[X] mem[XX/YY] mysql[X/X] httpd[X]
         LOAD_ROUNDED=$(echo "$SUMMARY" | awk '{print $2}' | sed 's/load\[\(.*\)\]/\1/')
-        MEM_PCT=$(echo "$SUMMARY" | awk '{print $3}' | sed 's/mem\[\(.*\)\/.*\]/\1/')
-        MYSQL_THREADS=$(echo "$SUMMARY" | awk '{print $4}' | sed 's/mysql\[\(.*\)\/.*\]/\1/')
+        MEM_PCT_CURRENT=$(echo "$SUMMARY" | awk '{print $3}' | sed 's/mem\[\(.*\)\/.*\]/\1/')
+        MEM_PCT_PREVIOUS=$(echo "$SUMMARY" | awk '{print $3}' | sed 's/mem\[.*\/\(.*\)\]/\1/')
+        MYSQL_THREADS_CURRENT=$(echo "$SUMMARY" | awk '{print $4}' | sed 's/mysql\[\(.*\)\/.*\]/\1/')
+        MYSQL_THREADS_EXPECTED=$(echo "$SUMMARY" | awk '{print $4}' | sed 's/mysql\[.*\/\(.*\)\]/\1/')
         HTTPD_COUNT=$(echo "$SUMMARY" | awk '{print $5}' | sed 's/httpd\[\(.*\)\]/\1/')
+
+        # NEW: MySQL thread anomaly detection.
+        # An alert is recorded when the first mysql[] value is more than three
+        # times the second. Both must be plain digits before any arithmetic:
+        # sed passes the field through unchanged when its pattern does not
+        # match, and $(( )) on such text can report an error that a redirect
+        # on the test command does not silence. A zero baseline is skipped so
+        # that one or two threads are not reported as a connection storm.
+        MYSQL_THREADS_VALID=1
+        case "$MYSQL_THREADS_CURRENT" in '' | *[!0-9]*) MYSQL_THREADS_VALID=0 ;; esac
+        case "$MYSQL_THREADS_EXPECTED" in '' | *[!0-9]*) MYSQL_THREADS_VALID=0 ;; esac
+        if [ "$MYSQL_THREADS_VALID" -eq 1 ] && [ "$MYSQL_THREADS_EXPECTED" -gt 0 ]; then
+            if [ "$MYSQL_THREADS_CURRENT" -gt "$((MYSQL_THREADS_EXPECTED * 3))" ]; then
+                ANOMALY=$((MYSQL_THREADS_CURRENT - MYSQL_THREADS_EXPECTED))
+                echo "$TIMESTAMP MYSQL_THREAD_ANOMALY current=$MYSQL_THREADS_CURRENT expected=$MYSQL_THREADS_EXPECTED anomaly=+$ANOMALY" >> "$TEMP_DIR/alerts.txt"
+            fi
+        fi
+
+        # Track httpd count for trending
+        echo "$TIMESTAMP $HTTPD_COUNT" >> "$TEMP_DIR/httpd_count.txt"
     fi
 
     # Parse memory stats
@@ -245,6 +267,14 @@ while IFS= read -r logfile; do
         fi
     fi
 
+    # NEW: Count R-state (runnable) processes for CPU pressure metric
+    RSTATE_COUNT=$(awk '/PID USER.*COMMAND/,/^USER.*TTY/ {
+        if ($8 == "R") count++
+    } END {print count+0}' "$logfile")
+
+    # Track R-state count (better CPU pressure metric than load average)
+    echo "$TIMESTAMP $RSTATE_COUNT" >> "$TEMP_DIR/rstate_count.txt"
+
     # Parse CPU stats
     CPU_LINE=$(grep "^%Cpu(s):" "$logfile")
     if [ ! -z "$CPU_LINE" ]; then
@@ -317,6 +347,22 @@ while IFS= read -r logfile; do
         }
     }' "$logfile" | head -3 >> "$TEMP_DIR/top_mem_processes.txt"
 
+    # NEW: Parse network connection states for attack/leak detection.
+    # The timestamp is handed to awk with -v rather than spliced into the
+    # program text, so its contents can never be parsed as awk code.
+    awk -v ts="$TIMESTAMP" '/^## network stats/,/^$/ {
+        if ($2 ~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ && $3 != "") {
+            state = $3
+            count = $1
+            states[state] += count
+        }
+    }
+    END {
+        for (state in states) {
+            print ts, state, states[state]
+        }
+    }' "$logfile" >> "$TEMP_DIR/network_states.txt"
+
     # Write metrics line
     echo "$TIMESTAMP|$MEM_AVAILABLE|$MEM_USED|$MEM_TOTAL|$SWAP_USED|$SWAP_TOTAL|$LOAD_1MIN|$LOAD_5MIN|$LOAD_15MIN|$CPU_IDLE|$CPU_IOWAIT|$CPU_STEAL|$TASK_TOTAL|$TASK_RUNNING|$TASK_ZOMBIE|$MYSQL_QPS|$HTTPD_COUNT" >> "$TEMP_DIR/metrics.txt"
 
@@ -413,6 +459,121 @@ END {
     }
 }' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/cpu_analysis.txt"
 
+# NEW: Load trend direction analysis
+# Each metrics row with all three load averages is classed as rising, falling
+# or stable; the reported direction is the class whose count exceeds both of
+# the others, and STABLE when no class does.
+print_substatus "Analyzing load trend direction..."
+awk -F'|' '{
+    if ($7 != "" && $8 != "" && $9 != "") {
+        load_1min = $7
+        load_5min = $8
+        load_15min = $9
+
+        # Determine trend direction
+        if (load_1min > load_5min && load_5min > load_15min) {
+            rising++
+        } else if (load_1min < load_5min && load_5min < load_15min) {
+            falling++
+        } else {
+            stable++
+        }
+
+        last_1min = load_1min
+        last_5min = load_5min
+        last_15min = load_15min
+    }
+}
+END {
+    if (rising > falling && rising > stable) {
+        trend = "RISING"
+    } else if (falling > rising && falling > stable) {
+        trend = "FALLING"
+    } else {
+        trend = "STABLE"
+    }
+    # Adding 0 makes a counter that was never incremented print as 0 instead
+    # of as an empty string.
+    print "LOAD_TREND", "direction=" trend, "rising=" (rising + 0), "falling=" (falling + 0), "stable=" (stable + 0), "last_1min=" last_1min, "last_5min=" last_5min, "last_15min=" last_15min
+}' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/load_trend.txt"
+
+# NEW: Memory growth velocity analysis
+# Uses the first and last positive values of field 2 in metrics.txt. Nothing is
+# printed unless at least two such rows exist.
+print_substatus "Analyzing memory growth velocity..."
+awk -F'|' '{
+    if ($2 != "" && $2 > 0) {
+        if (first_avail == 0) {
+            first_avail = $2
+            first_time = NR
+        }
+        last_avail = $2
+        last_time = NR
+        count++
+    }
+}
+END {
+    if (count > 1) {
+        delta = last_avail - first_avail
+        snapshots = last_time - first_time
+        if (snapshots > 0) {
+            rate_per_snapshot = delta / snapshots
+            rate_per_hour = rate_per_snapshot * 20  # 20 snapshots per hour
+
+            # Predict time to OOM if declining
+            if (rate_per_hour < 0 && last_avail > 0) {
+                hours_to_oom = last_avail / (-1 * rate_per_hour)
+            } else {
+                hours_to_oom = 0
+            }
+
+            # Fixed-point formatting for the two computed rates: the default
+            # number format switches to exponent notation for large values,
+            # which the report cannot split at the decimal point.
+            printf "MEMORY_VELOCITY first=%s last=%s delta=%s rate_per_hour=%.2f hours_to_oom=%.2f\n", first_avail, last_avail, delta, rate_per_hour, hours_to_oom
+        }
+    }
+}' "$TEMP_DIR/metrics.txt" > "$TEMP_DIR/memory_velocity.txt"
+
+# NEW: R-state analysis (CPU pressure)
+# Only snapshots with a positive count are averaged; nothing is printed when
+# there are none.
+print_substatus "Analyzing R-state process pressure..."
+awk '{
+    if ($2 != "" && $2 > 0) {
+        sum += $2
+        count++
+        if ($2 > max) max = $2
+        if ($2 > 4) high++  # More than 2x typical 2-core system
+    }
+}
+END {
+    if (count > 0) {
+        avg = sum / count
+        # Adding 0 keeps high_count numeric when no snapshot exceeded 4
+        print "RSTATE_ANALYSIS", "avg=" avg, "max=" max, "high_count=" (high + 0)
+    }
+}' "$TEMP_DIR/rstate_count.txt" > "$TEMP_DIR/rstate_analysis.txt"
+
+# NEW: Network state analysis
+# Output columns: state, average count, total count, peak count. The peak is
+# the largest value on any single input line, so it does not grow with the
+# number of snapshots analysed the way the total does.
+print_substatus "Analyzing network connection states..."
+awk '{
+    state = $2
+    count = $3
+    state_totals[state] += count
+    state_occurrences[state]++
+    if (count + 0 > state_peaks[state] + 0) state_peaks[state] = count + 0
+}
+END {
+    for (state in state_totals) {
+        avg = state_totals[state] / state_occurrences[state]
+        print state, avg, state_totals[state], state_peaks[state] + 0
+    }
+}' "$TEMP_DIR/network_states.txt" | sort -k3,3nr > "$TEMP_DIR/network_state_summary.txt"
+
 print_success "Metric analysis complete"
 
 #############################################################################
@@ -559,6 +720,18 @@ print_status "Phase 4/4: Generating report..."
             [ "$STEAL_ALERTS" -gt 5 ] && echo " ... and $((STEAL_ALERTS - 5)) more"
             echo ""
         fi
+
+        # MySQL thread anomaly (connection storm)
+        THREAD_ALERTS=$(grep "MYSQL_THREAD_ANOMALY" "$TEMP_DIR/alerts.txt" | wc -l)
+        if [ "$THREAD_ALERTS" -gt 0 ]; then
+            echo "⚠️ MYSQL THREAD ANOMALY (Connection Storm): $THREAD_ALERTS occurrences"
+            # IFS= and -r keep each alert line exactly as it was logged
+            grep "MYSQL_THREAD_ANOMALY" "$TEMP_DIR/alerts.txt" | head -5 | while IFS= read -r line; do
+                echo " - $line"
+            done
+            [ "$THREAD_ALERTS" -gt 5 ] && echo " ... and $((THREAD_ALERTS - 5)) more"
+            echo ""
+        fi
     fi
 
     echo "================================================================================"
@@ -598,6 +771,41 @@ print_status "Phase 4/4: Generating report..."
         echo ""
     fi
 
+    # Memory growth velocity
+    # -s instead of -f: the analysis step always creates this file but leaves it
+    # empty when it had fewer than two usable samples.
+    if [ -s "$TEMP_DIR/memory_velocity.txt" ]; then
+        read -r _ first_avail last_avail delta rate hours_to_oom < "$TEMP_DIR/memory_velocity.txt"
+        FIRST_AVAIL=${first_avail#*=}
+        LAST_AVAIL=${last_avail#*=}
+        DELTA=${delta#*=}
+        RATE=${rate#*=}
+        HOURS_TO_OOM=${hours_to_oom#*=}
+
+        echo "Memory Growth Velocity:"
+        echo " First Available: ${FIRST_AVAIL} MiB"
+        echo " Last Available: ${LAST_AVAIL} MiB"
+        echo " Delta: ${DELTA} MiB"
+        echo " Rate: ${RATE} MiB/hour"
+
+        # A leading minus sign on the hourly rate means available memory fell
+        case "$RATE" in
+            -*)
+                HOURS_INT=${HOURS_TO_OOM%%.*}
+                if [ "$HOURS_INT" -gt 0 ] 2>/dev/null; then
+                    echo " ⚠️ WARNING: Memory declining - OOM predicted in ${HOURS_INT} hours"
+                elif [ "$HOURS_INT" = "0" ]; then
+                    # Less than one whole hour left: the most urgent case
+                    echo " ⚠️ WARNING: Memory declining - OOM predicted in under 1 hour"
+                fi
+                ;;
+            *)
+                echo " ✓ Memory stable or increasing"
+                ;;
+        esac
+        echo ""
+    fi
+
     echo "================================================================================"
     echo "CPU & LOAD ANALYSIS"
     echo "================================================================================"
@@ -616,6 +824,33 @@ print_status "Phase 4/4: Generating report..."
         echo ""
     fi
 
+    # Load trend direction
+    if [ -s "$TEMP_DIR/load_trend.txt" ]; then
+        # trend_rest takes the trailing last_* fields, which are not shown here
+        read -r _ trend_dir rising_count falling_count stable_count trend_rest < "$TEMP_DIR/load_trend.txt"
+        TREND_DIR=${trend_dir#*=}
+        RISING_COUNT=${rising_count#*=}
+        FALLING_COUNT=${falling_count#*=}
+        STABLE_COUNT=${stable_count#*=}
+
+        echo "Load Trend Direction:"
+        case "$TREND_DIR" in
+            RISING)
+                echo " ⚠️ RISING - Problem getting worse (1min > 5min > 15min)"
+                ;;
+            FALLING)
+                echo " ✓ FALLING - Problem resolving (1min < 5min < 15min)"
+                ;;
+            STABLE)
+                echo " → STABLE - Load averages consistent"
+                ;;
+        esac
+        echo " Rising snapshots: $RISING_COUNT"
+        echo " Falling snapshots: $FALLING_COUNT"
+        echo " Stable snapshots: $STABLE_COUNT"
+        echo ""
+    fi
+
     if [ -f "$TEMP_DIR/cpu_analysis.txt" ]; then
         read -r _ avg_idle min_idle critical warning avg_iowait high_iowait < "$TEMP_DIR/cpu_analysis.txt"
         AVG_IDLE=$(echo "$avg_idle" | cut -d= -f2)
@@ -671,6 +906,90 @@ print_status "Phase 4/4: Generating report..."
     fi
     echo ""
 
+    echo "================================================================================"
+    echo "CPU PRESSURE ANALYSIS (R-state Processes)"
+    echo "================================================================================"
+    echo ""
+
+    # R-state analysis
+    # -s instead of -f: the file is created even when no snapshot had a positive
+    # R-state count, and is empty in that case.
+    if [ -s "$TEMP_DIR/rstate_analysis.txt" ]; then
+        read -r _ avg_rstate max_rstate high_count < "$TEMP_DIR/rstate_analysis.txt"
+        AVG_RSTATE=${avg_rstate#*=}
+        AVG_RSTATE=${AVG_RSTATE%%.*}
+        MAX_RSTATE=${max_rstate#*=}
+        HIGH_RSTATE=${high_count#*=}
+
+        echo "Runnable (R-state) Processes:"
+        echo " Average: $AVG_RSTATE processes"
+        echo " Maximum: $MAX_RSTATE processes"
+        # The label matches the > 4 test the analysis step uses for high_count
+        echo " High (> 4): $HIGH_RSTATE snapshots"
+        echo ""
+        echo "R-state Count Analysis:"
+        echo " - R-state = processes waiting for CPU (runnable but not running)"
+        echo " - Better CPU pressure metric than load average alone"
+        echo " - R-state count > CPU cores = CPU contention"
+
+        MAX_RSTATE_INT=${MAX_RSTATE%%.*}
+        if [ "$MAX_RSTATE_INT" -gt 10 ] 2>/dev/null; then
+            echo " ⚠️ WARNING: High R-state count indicates severe CPU pressure"
+        elif [ "$MAX_RSTATE_INT" -gt 5 ] 2>/dev/null; then
+            echo " ⚠️ CAUTION: Moderate CPU contention detected"
+        else
+            echo " ✓ CPU pressure within normal range"
+        fi
+        echo ""
+    else
+        echo "No R-state data available"
+        echo ""
+    fi
+
+    echo "================================================================================"
+    echo "NETWORK CONNECTION ANALYSIS"
+    echo "================================================================================"
+    echo ""
+
+    # Network state breakdown
+    if [ -s "$TEMP_DIR/network_state_summary.txt" ]; then
+        echo "Connection States (Aggregated):"
+        echo ""
+        printf "%-20s %15s %15s %15s\n" "STATE" "AVG" "TOTAL" "PEAK"
+        printf "%-20s %15s %15s %15s\n" "--------------------" "---------------" "---------------" "---------------"
+        # Thresholds are checked against the per-snapshot peak; the total
+        # grows with the number of snapshots analysed.
+        while read -r state avg total peak; do
+            AVG_INT=${avg%%.*}
+            TOTAL_INT=${total%%.*}
+            PEAK_INT=${peak%%.*}
+            printf "%-20s %15s %15s %15s\n" "$state" "$AVG_INT" "$TOTAL_INT" "$PEAK_INT"
+
+            # Add warnings for specific states
+            case "$state" in
+                SYN_RECV)
+                    if [ "$PEAK_INT" -gt 100 ] 2>/dev/null; then
+                        echo " ⚠️ Possible SYN flood attack"
+                    fi
+                    ;;
+                CLOSE_WAIT)
+                    if [ "$PEAK_INT" -gt 100 ] 2>/dev/null; then
+                        echo " ⚠️ Connection leak - application not closing connections"
+                    fi
+                    ;;
+                TIME_WAIT)
+                    if [ "$PEAK_INT" -gt 1000 ] 2>/dev/null; then
+                        echo " ⚠️ Excessive TIME_WAIT - may need tuning"
+                    fi
+                    ;;
+            esac
+        done < "$TEMP_DIR/network_state_summary.txt"
+        echo ""
+    else
+        echo "No network connection data available"
+        echo ""
+    fi
+
     echo "================================================================================"
     echo "RECOMMENDATIONS"
     echo "================================================================================"