diff --git a/modules/diagnostics/loadwatch-analyzer.sh b/modules/diagnostics/loadwatch-analyzer.sh
index f842e2d..b9cc6a4 100755
--- a/modules/diagnostics/loadwatch-analyzer.sh
+++ b/modules/diagnostics/loadwatch-analyzer.sh
@@ -264,6 +264,14 @@ while IFS= read -r logfile; do
         if [ "$IOWAIT_INT" -gt 20 ] 2>/dev/null; then
             echo "$TIMESTAMP HIGH_IOWAIT iowait=${CPU_IOWAIT}%" >> "$TEMP_DIR/alerts.txt"
         fi
+
+        # CRITICAL: Check CPU steal time (VM resource contention)
+        # Keep only the integer part: [ -gt ] cannot compare decimal values.
+        STEAL_INT=${CPU_STEAL%%.*}
+        # stderr is discarded so an empty or non-numeric value simply skips the alert.
+        if [ "$STEAL_INT" -gt 10 ] 2>/dev/null; then
+            echo "$TIMESTAMP HIGH_CPU_STEAL steal=${CPU_STEAL}%" >> "$TEMP_DIR/alerts.txt"
+        fi
     fi
 
     # Parse MySQL stats
@@ -284,6 +292,38 @@ while IFS= read -r logfile; do
         }
     }' "$logfile" | head -3 >> "$TEMP_DIR/top_processes.txt"
 
+    # CRITICAL: Check for kswapd0 in top processes (memory thrashing indicator)
+    # Keeps the CPU value ($9) of the first kswapd row that is above 1.0.
+    KSWAPD_CHECK=$(awk '/PID USER.*COMMAND/,/^USER.*TTY/ {
+        if ($12 ~ /kswapd/ && $9 > 1.0) {
+            print $9
+        }
+    }' "$logfile" | head -1)
+
+    if [ -n "$KSWAPD_CHECK" ]; then
+        echo "$TIMESTAMP MEMORY_THRASHING kswapd0_cpu=${KSWAPD_CHECK}%" >> "$TEMP_DIR/alerts.txt"
+    fi
+
+    # CRITICAL: Count D-state processes (I/O blocking)
+    # Only rows with a numeric PID ($2) whose STAT ($8) starts with D are
+    # counted, so other text inside the range cannot inflate the total.
+    DSTATE_COUNT=$(awk '/^USER.*STAT.*COMMAND/,/^## / {
+        if ($2 ~ /^[0-9]+$/ && $8 ~ /^D/) count++
+    } END {print count+0}' "$logfile")
+
+    if [ "$DSTATE_COUNT" -gt 0 ] 2>/dev/null; then
+        echo "$TIMESTAMP IO_BLOCKED_PROCESSES count=$DSTATE_COUNT" >> "$TEMP_DIR/alerts.txt"
+    fi
+
+    # Extract top 3 memory consumers
+    # NOTE(review): the process table is presumably in CPU order - confirm.
+    # Rows are therefore ranked by %MEM ($10, output field 5) before head -3.
+    awk -v ts="$TIMESTAMP" '/PID USER.*COMMAND/,/^USER.*TTY/ {
+        if ($1 ~ /^[0-9]+$/ && NF >= 12) {
+            print ts, $1, $12, $9, $10
+        }
+    }' "$logfile" | sort -k5,5nr | head -3 >> "$TEMP_DIR/top_mem_processes.txt"
+
     # Write metrics line
     echo "$TIMESTAMP|$MEM_AVAILABLE|$MEM_USED|$MEM_TOTAL|$SWAP_USED|$SWAP_TOTAL|$LOAD_1MIN|$LOAD_5MIN|$LOAD_15MIN|$CPU_IDLE|$CPU_IOWAIT|$CPU_STEAL|$TASK_TOTAL|$TASK_RUNNING|$TASK_ZOMBIE|$MYSQL_QPS|$HTTPD_COUNT" >> "$TEMP_DIR/metrics.txt"
 
@@ -405,6 +445,24 @@ END {
     }
 }' "$TEMP_DIR/top_processes.txt" | sort -rn | head -10 > "$TEMP_DIR/top_cpu_consumers.txt"
 
+# Top memory consumers (aggregate across all snapshots)
+# Reads command from field 3 and memory from field 5; writes "average command samples".
+print_substatus "Analyzing top memory consumers..."
+awk '{
+    cmd = $3
+    mem = $5
+    if (mem > 0) {
+        mem_sum[cmd] += mem
+        count[cmd]++
+    }
+}
+END {
+    for (cmd in mem_sum) {
+        avg = mem_sum[cmd] / count[cmd]
+        print avg, cmd, count[cmd]
+    }
+}' "$TEMP_DIR/top_mem_processes.txt" | sort -rn | head -10 > "$TEMP_DIR/top_mem_consumers.txt"
+
 print_success "Resource analysis complete"
 
 #############################################################################
@@ -476,6 +534,39 @@ print_status "Phase 4/4: Generating report..."
             [ "$ZOMBIE_ALERTS" -gt 5 ] && echo " ... and $((ZOMBIE_ALERTS - 5)) more"
             echo ""
         fi
+
+        # Memory thrashing (kswapd0)
+        THRASH_ALERTS=$(grep "MEMORY_THRASHING" "$TEMP_DIR/alerts.txt" | wc -l)
+        if [ "$THRASH_ALERTS" -gt 0 ]; then
+            echo "⚠️ MEMORY THRASHING (kswapd0 active): $THRASH_ALERTS occurrences"
+            grep "MEMORY_THRASHING" "$TEMP_DIR/alerts.txt" | head -5 | while IFS= read -r line; do
+                echo " - $line"
+            done
+            [ "$THRASH_ALERTS" -gt 5 ] && echo " ... and $((THRASH_ALERTS - 5)) more"
+            echo ""
+        fi
+
+        # I/O blocked processes
+        BLOCKED_ALERTS=$(grep "IO_BLOCKED_PROCESSES" "$TEMP_DIR/alerts.txt" | wc -l)
+        if [ "$BLOCKED_ALERTS" -gt 0 ]; then
+            echo "I/O BLOCKED PROCESSES (D-state): $BLOCKED_ALERTS occurrences"
+            grep "IO_BLOCKED_PROCESSES" "$TEMP_DIR/alerts.txt" | head -5 | while IFS= read -r line; do
+                echo " - $line"
+            done
+            [ "$BLOCKED_ALERTS" -gt 5 ] && echo " ... and $((BLOCKED_ALERTS - 5)) more"
+            echo ""
+        fi
+
+        # CPU steal time (VM contention)
+        STEAL_ALERTS=$(grep "HIGH_CPU_STEAL" "$TEMP_DIR/alerts.txt" | wc -l)
+        if [ "$STEAL_ALERTS" -gt 0 ]; then
+            echo "HIGH CPU STEAL TIME (VM resource contention): $STEAL_ALERTS occurrences"
+            grep "HIGH_CPU_STEAL" "$TEMP_DIR/alerts.txt" | head -5 | while IFS= read -r line; do
+                echo " - $line"
+            done
+            [ "$STEAL_ALERTS" -gt 5 ] && echo " ... and $((STEAL_ALERTS - 5)) more"
+            echo ""
+        fi
     fi
 
     echo "================================================================================"
@@ -572,6 +663,23 @@ print_status "Phase 4/4: Generating report..."
     fi
     echo ""
 
+    echo "================================================================================"
+    echo "TOP MEMORY CONSUMERS (Averaged Across Period)"
+    echo "================================================================================"
+    echo ""
+
+    # Each input line holds three fields: average memory, command, occurrence count.
+    if [ -f "$TEMP_DIR/top_mem_consumers.txt" ] && [ -s "$TEMP_DIR/top_mem_consumers.txt" ]; then
+        printf "%-10s %-50s %s\n" "AVG MEM%" "PROCESS" "OCCURRENCES"
+        printf "%-10s %-50s %s\n" "--------" "------------------------------------------------" "-----------"
+        while read -r avg_mem cmd occurrences; do
+            printf "%-10.1f %-50s %s\n" "$avg_mem" "$cmd" "$occurrences"
+        done < "$TEMP_DIR/top_mem_consumers.txt"
+    else
+        echo "No significant memory consumers found"
+    fi
+    echo ""
+
     echo "================================================================================"
     echo "RECOMMENDATIONS"
     echo "================================================================================"
@@ -587,6 +695,56 @@ print_status "Phase 4/4: Generating report..."
     CRIT_CPU=${CRIT_CPU:-0}
     HIGH_IOWAIT=${HIGH_IOWAIT:-0}
     ZOMBIE_ALERTS=${ZOMBIE_ALERTS:-0}
+    THRASH_ALERTS=${THRASH_ALERTS:-0}
+    BLOCKED_ALERTS=${BLOCKED_ALERTS:-0}
+    STEAL_ALERTS=${STEAL_ALERTS:-0}
+
+    # CRITICAL: Memory thrashing (any snapshot with the alert triggers this)
+    if [ "$THRASH_ALERTS" -gt 0 ]; then
+        echo "🔴 CRITICAL - MEMORY THRASHING DETECTED"
+        echo " - kswapd0 (kernel swap daemon) was consuming CPU in $THRASH_ALERTS snapshots"
+        echo " - This is THE definitive indicator of severe memory pressure"
+        echo " - System is thrashing - constantly swapping pages in/out of memory"
+        echo " - IMMEDIATE ACTION REQUIRED:"
+        echo " 1. Add more RAM to the server (most effective solution)"
+        echo " 2. Kill/restart memory-intensive processes"
+        echo " 3. Review top memory consumers above"
+        echo " 4. Check for memory leaks in applications"
+        echo " - Performance is severely degraded during thrashing"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
+
+    # CRITICAL: I/O blocking (only when more than 5 snapshots raised the alert)
+    if [ "$BLOCKED_ALERTS" -gt 5 ]; then
+        echo "🔴 CRITICAL - I/O BLOCKING DETECTED"
+        echo " - Processes stuck in D-state (uninterruptible sleep) in $BLOCKED_ALERTS snapshots"
+        echo " - Processes are blocked waiting for I/O operations to complete"
+        echo " - Indicates severe disk performance issues or hardware problems"
+        echo " - IMMEDIATE ACTION REQUIRED:"
+        echo " 1. Check disk health: smartctl -a /dev/sda"
+        echo " 2. Check I/O performance: iostat -x 1 5"
+        echo " 3. Look for failing drives in dmesg: dmesg | grep -i error"
+        echo " 4. Consider upgrading to SSD storage"
+        echo " 5. Check for network storage timeouts (NFS/iSCSI)"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
+
+    # CRITICAL: CPU steal time (only when more than 10 snapshots raised the alert)
+    if [ "$STEAL_ALERTS" -gt 10 ]; then
+        echo "🔴 CRITICAL - VM RESOURCE CONTENTION"
+        echo " - High CPU steal time detected in $STEAL_ALERTS snapshots"
+        echo " - Hypervisor is stealing CPU cycles from this VM"
+        echo " - Physical host is overcommitted or experiencing contention"
+        echo " - ACTIONS REQUIRED:"
+        echo " 1. Contact hosting provider about resource contention"
+        echo " 2. Request move to less crowded physical host"
+        echo " 3. Upgrade to dedicated/guaranteed CPU resources"
+        echo " 4. Consider upgrading VM plan for better resource allocation"
+        echo ""
+        RECOMMENDATIONS=$((RECOMMENDATIONS + 1))
+    fi
 
     if [ "$CRIT_COUNT" -gt 0 ] || [ "$AVG_MEM" -lt 300 ]; then
         echo "⚠ MEMORY: Critical memory pressure detected"