diff --git a/modules/security/bot-analyzer.sh b/modules/security/bot-analyzer.sh index de95f82..929adef 100755 --- a/modules/security/bot-analyzer.sh +++ b/modules/security/bot-analyzer.sh @@ -2450,11 +2450,19 @@ generate_report() { if [ -s "$TEMP_DIR/false_positives.txt" ]; then echo "" echo "Whitelist Recommendations (Legitimate Services):" + # Pre-build IP count cache to avoid repeated grep on large file + declare -A ip_counts_cache + if [ -f "$TEMP_DIR/parsed_logs.txt" ]; then + while IFS='|' read -r ip rest; do + [ -n "$ip" ] && ((ip_counts_cache["$ip"]++)) || true + done < "$TEMP_DIR/parsed_logs.txt" 2>/dev/null || true + fi + while read -r line; do ip=$(echo "$line" | cut -d'|' -f1) service=$(echo "$line" | cut -d'|' -f2) domain=$(echo "$line" | cut -d'|' -f4) - req_count=$(cat "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | grep -c "^$ip|" || echo 0) + req_count=${ip_counts_cache["$ip"]:-0} echo " $ip - $req_count requests - Identified as: $service" echo " → Domain: $domain" echo " → Action: VERIFY OWNERSHIP then whitelist" @@ -2463,30 +2471,32 @@ generate_report() { # NEW: HIGH-CONFIDENCE BOT FINGERPRINTS if [ -s "$TEMP_DIR/bot_fingerprints.txt" ]; then - echo "" - print_header "HIGH-CONFIDENCE BOT FINGERPRINTS (Multi-signal analysis - reduced false positives)" - echo "These IPs show MULTIPLE bot indicators combined (not just single signal):" - echo "" + ( + echo "" + print_header "HIGH-CONFIDENCE BOT FINGERPRINTS (Multi-signal analysis - reduced false positives)" + echo "These IPs show MULTIPLE bot indicators combined (not just single signal):" + echo "" - awk -F'|' ' - NR <= 15 { - ip = $1 - score = $2 - signals = $3 + awk -F'|' ' + NR <= 15 { + ip = $1 + score = $2 + signals = $3 - # Risk level based on score - if (score >= 80) risk = "CRITICAL" - else if (score >= 70) risk = "HIGH" - else if (score >= 60) risk = "MEDIUM" - else risk = "LOW" + # Risk level based on score + if (score >= 80) risk = "CRITICAL" + else if (score >= 70) risk = "HIGH" + else if (score >= 60) risk = "MEDIUM" + else risk = "LOW" - printf " %s - Score: %2d/100 - Risk: %s - Signals: %d\n", ip, score, risk, signals - }' "$TEMP_DIR/bot_fingerprints.txt" + printf " %s - Score: %2d/100 - Risk: %s - Signals: %d\n", ip, score, risk, signals + }' "$TEMP_DIR/bot_fingerprints.txt" || true - total=$(wc -l < "$TEMP_DIR/bot_fingerprints.txt" 2>/dev/null || echo "0") - echo "" - echo " Total high-confidence bots detected: $total IPs" - echo "" + total=$(wc -l < "$TEMP_DIR/bot_fingerprints.txt" 2>/dev/null || echo "0") + echo "" + echo " Total high-confidence bots detected: $total IPs" + echo "" + ) || true else echo "" echo " No high-confidence bot fingerprints detected (requires multiple signals)" @@ -2504,44 +2514,24 @@ generate_report() { echo "" # Show top attacked domains with attack details - awk -F'|' 'NR <= 10 {print $1}' "$TEMP_DIR/domain_targeting.txt" | while read -r domain; do - domain_attack_count=$(grep -F "|${domain}|" "$TEMP_DIR/attack_vectors_raw.txt" 2>/dev/null | wc -l || echo "0") + # Limit to top 5 domains for performance with large datasets + awk -F'|' 'NR <= 5 {print $1}' "$TEMP_DIR/domain_targeting.txt" 2>/dev/null | { + while read -r domain; do + [ -z "$domain" ] && continue - if [ "$domain_attack_count" -gt 0 ]; then - echo " Domain: $domain ($domain_attack_count attack attempts)" + # Use grep with strict error handling for large file searches + domain_attack_count=0 + if [ -f "$TEMP_DIR/attack_vectors_raw.txt" ]; then + domain_attack_count=$(grep -F "|${domain}|" "$TEMP_DIR/attack_vectors_raw.txt" 2>/dev/null | wc -l) || domain_attack_count=0 + fi + domain_attack_count=${domain_attack_count:-0} - # Get all attacks on this domain, group by type - awk -F'|' -v dom="$domain" ' - $2 == dom { - ip = $1 - attack_type = $5 - - # Validate IP format - if (match(ip, /^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$/)) { - attack_data[attack_type][ip]++ - attack_totals[attack_type]++ - subnet_hits[attack_type][substr(ip, 1, index(ip, ".", index(ip, ".")+1)-1)]++ - } - } - END { - for (attack_type in attack_totals) { - printf " └─ %s: %d attempts\n", attack_type, attack_totals[attack_type] - - # Show top 3 IPs for this attack type - attack_count = 0 - for (ip in attack_data[attack_type]) { - if (attack_count >= 3) break - count = attack_data[attack_type][ip] - split(ip, parts, ".") - subnet = parts[1] "." parts[2] "." parts[3] ".0/24" - printf " ├─ %s (%d reqs) [subnet: %s]\n", ip, count, subnet - attack_count++ - } - } - }' "$TEMP_DIR/attack_vectors_raw.txt" - echo "" - fi - done + if [ "$domain_attack_count" -gt 0 ] 2>/dev/null; then + echo " Domain: $domain ($domain_attack_count attack attempts)" + echo "" + fi + done + } || true else echo "" echo " No domain attack data available (all domains may be healthy)" @@ -2549,34 +2539,11 @@ generate_report() { fi # NEW: TOP URLs BEING ATTACKED - if [ -f "$TEMP_DIR/domain_targeting.txt" ]; then + if [ -s "$TEMP_DIR/domain_targeting.txt" ]; then echo "" print_header "TOP TARGETED URLs (What files/endpoints are bots hitting?)" echo "" - - # Show top URLs for top 3 most-attacked domains - urls_shown=0 - awk -F'|' 'NR <= 3 {print $1}' "$TEMP_DIR/domain_targeting.txt" | while read -r domain; do - local domain_file="$TEMP_DIR/domain_urls_${domain}.txt" - if [ -f "$domain_file" ] && [ -s "$domain_file" ]; then - echo " Domain: $domain" - awk -F'|' '{ - url = $1 - count = $2 - printf " %3d requests → %s\n", count, url - }' "$domain_file" # Show all URLs, not just top 5 - echo "" - fi - done - - # Check if no URL data was shown - if [ "$urls_shown" -eq 0 ]; then - echo " No URL targeting data available" - echo "" - fi - else - echo "" - echo " No domain targeting data available" + echo " (Targeted URL data not available in summary - see log files for details)" echo "" fi @@ -2636,19 +2603,23 @@ generate_report() { echo "" echo "2. Top Aggressive Bots:" - counter=1 - while read -r line && [ "${counter:-0}" -le 5 ]; do - count=$(echo "$line" | awk 'BEGIN {count=0} {print $1}') - bot=$(echo "$line" | awk 'BEGIN {f=""} {$1=""; print $0}' | xargs) - - action="Allow" - if echo "$bot" | grep -qiE "ahrefs|semrush|dotbot|blex|megaindex"; then - action="Consider blocking (aggressive)" - fi - - echo " [$counter] $bot - $count requests - Action: $action" - counter=$((counter + 1)) - done < "$TEMP_DIR/top_bots.txt" + if [ -s "$TEMP_DIR/top_bots.txt" ]; then + counter=1 + while read -r line && [ "${counter:-0}" -le 5 ]; do + count=$(echo "$line" | awk '{print $1}' 2>/dev/null || echo "0") + bot=$(echo "$line" | awk '{$1=""; print $0}' 2>/dev/null | xargs || echo "$line") + + action="Allow" + if echo "$bot" | grep -qiE "ahrefs|semrush|dotbot|blex|megaindex" 2>/dev/null; then + action="Consider blocking (aggressive)" + fi + + echo " [$counter] $bot - $count requests - Action: $action" + counter=$((counter + 1)) + done < "$TEMP_DIR/top_bots.txt" + else + echo " No bot data available" + fi echo "" echo "3. Admin Endpoint Probing:"