Review notes (text before the "diff --git" header is skipped by git apply and patch):
- Line breaks were restored from a whitespace-collapsed copy. Indentation and the
  position of blank context lines are inferred from the hunk counts, so applying
  may need: git apply --ignore-whitespace
- analyze_success_rates: awk now reads parsed_logs.txt as a file operand. The
  earlier form piped cat into awk AND redirected stdin from a second cat, which
  left the first cat writing to an unread pipe (SIGPIPE, failure under pipefail).
- classify_bots: apostrophes removed from comments; they would end the awk program
  early if it is single-quoted like the one in detect_threats (TODO confirm).
- The index line still carries the blob ids of the original patch.

diff --git a/modules/security/bot-analyzer.sh b/modules/security/bot-analyzer.sh
index 5ad8708..4063138 100755
--- a/modules/security/bot-analyzer.sh
+++ b/modules/security/bot-analyzer.sh
@@ -542,14 +542,37 @@ classify_bots() {
                     break
                 }
             }
-        } else if (match(ua_lower, /bot|crawler|spider|scraper|curl|wget|python-|java\/|scan/)) {
-            # FILTER OUT legitimate browsers that might contain "bot" in version strings
-            # Common browsers: Chrome, Firefox, Safari, Edge, Opera, Samsung Browser, etc.
-            if (match(ua_lower, /chrome\/|firefox\/|safari\/|edg\/|edge\/|opr\/|opera\//) ||
-                match(ua_lower, /mozilla\/5\.0/) && match(ua_lower, /applewebkit|gecko/) && !match(ua_lower, /bot|crawler|spider/) ||
-                match(ua_lower, /samsungbrowser|ucbrowser|yabrowser|vivaldi/) ||
-                match(ua_lower, /android.*mobile|iphone|ipad|windows nt|macintosh|linux x86/) && !match(ua_lower, /bot|crawler|spider/)) {
-                # This is a legitimate browser, skip it
+        } else if (match(ua_lower, /bot|crawler|spider|scraper|curl|wget|python-requests|python-urllib|java\/|scan|check|monitor/)) {
+            # FIXED: Check for bot keywords FIRST, then verify it is not a legitimate browser
+            # NOTE(review): a UA that also carries a full browser signature is still skipped below
+
+            # FIRST: Check if it is actually a legitimate browser with complete UA signature
+            # Real browsers have: Mozilla/5.0 + platform + rendering engine + browser version
+            is_real_browser = 0
+
+            # Chrome/Chromium-based: Must have Chrome/ AND (AppleWebKit OR Mobile)
+            if (match(ua_lower, /chrome\/[0-9]/) && (match(ua_lower, /applewebkit/) || match(ua_lower, /mobile/))) {
+                is_real_browser = 1
+            }
+            # Firefox: Must have Firefox/ AND Gecko/
+            else if (match(ua_lower, /firefox\/[0-9]/) && match(ua_lower, /gecko\//)) {
+                is_real_browser = 1
+            }
+            # Safari: Must have Safari/ AND Version/ AND AppleWebKit (not Chrome)
+            else if (match(ua_lower, /safari\/[0-9]/) && match(ua_lower, /version\//) && match(ua_lower, /applewebkit/) && !match(ua_lower, /chrome/)) {
+                is_real_browser = 1
+            }
+            # Edge: Must have Edg/ or Edge/
+            else if (match(ua_lower, /edg\/[0-9]|edge\/[0-9]/)) {
+                is_real_browser = 1
+            }
+            # Mobile browsers: Samsung, UC, Opera Mobile
+            else if (match(ua_lower, /samsungbrowser\/[0-9]|ucbrowser\/[0-9]|opr\/[0-9]/)) {
+                is_real_browser = 1
+            }
+
+            # If it is a real browser, skip bot classification
+            if (is_real_browser == 1) {
                 next
             }
 
@@ -616,23 +639,40 @@ detect_threats() {
         }
 
         # Path Traversal / LFI
-        if (match(url_lower, /\.\.\/|\.\.\\|etc\/passwd|etc\/shadow|boot\.ini|win\.ini/) ||
-            match(url_lower, /proc\/self|\/etc\/|c:\\|windows\/system32/)) {
+        # FIXED: Added URL-encoded variants (%2e%2e, %5c for backslash)
+        if (match(url_lower, /\.\.\/|\.\.\\|%2e%2e|%5c|etc\/passwd|etc\/shadow|boot\.ini|win\.ini/) ||
+            match(url_lower, /proc\/self|proc\/environ|\/etc\/|c:\\|c:%5c|windows[\/\\]system32|windows%5csystem32/)) {
             print ip "|" domain "|" url "|" status "|path_traversal" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
         }
 
         # Shell upload / RCE attempts
-        if (match(url_lower, /cmd\.exe|\/bin\/bash|\/bin\/sh|phpinfo\(|system\(|exec\(|passthru\(/) ||
-            match(url_lower, /shell\.php|c99\.php|r57\.php|backdoor/) ||
-            (match(url_lower, /\.(php|jsp|asp|aspx)/) && method == "POST")) {
+        # FIXED: Removed overly broad "any POST to .php" condition that caused massive false positives
+        # Now only detects actual shell commands, known malicious files, and suspicious upload patterns
+        if (match(url_lower, /cmd\.exe|\/bin\/bash|\/bin\/sh|phpinfo\(|system\(|exec\(|passthru\(|eval\(/) ||
+            match(url_lower, /shell\.php|c99\.php|r57\.php|r00t\.php|backdoor|webshell|cmd\.php|exploit\.php/) ||
+            match(url_lower, /base64_decode.*eval|gzinflate.*eval|assert.*\$_/) ||
+            (match(url_lower, /\.(php|phtml|php3|php4|php5|phar)\.suspected$/) && method == "POST")) {
             print ip "|" domain "|" url "|" status "|rce_upload" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
         }
 
         # Info Disclosure attempts
-        if (match(url_lower, /\.git\/|\.env|\.sql$|\.bak$|\.old$|config\.php|phpinfo|readme/) ||
-            match(url_lower, /web\.config|composer\.json|package\.json|\.htaccess|\.htpasswd/) ||
-            match(url_lower, /database\.sql|backup\.zip|dump\.sql/)) {
-            print ip "|" domain "|" url "|" status "|info_disclosure" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
+        # FIXED: Added status code validation - only flag successful access (200/301/302)
+        # FIXED: readme pattern now only matches actual files (.txt, .html, .md)
+        # FIXED: Added more backup file extensions and URL-encoded variants
+        if (match(url_lower, /\.git\/|\.env|\.sql$|\.bak$|\.old$|\.backup$|\.orig$|\.swp$|\.sav$|~$|config\.php|phpinfo/) ||
+            match(url_lower, /readme\.(txt|html|md)$/) ||
+            match(url_lower, /web\.config|\.htaccess|\.htpasswd/) ||
+            match(url_lower, /database\.sql|backup\.zip|backup\.tar|dump\.sql|sitemap\.xml\.gz/)) {
+            # Only flag if successful access (200) or redirect (301/302)
+            # Failed attempts (404/403) are just scanning, tracked separately
+            if (status ~ /^(200|301|302)/) {
+                print ip "|" domain "|" url "|" status "|info_disclosure" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
+            }
+        }
+
+        # composer.json / package.json - lower severity, only if successful
+        if (match(url_lower, /composer\.json|package\.json|package-lock\.json/) && status == "200") {
+            print ip "|" domain "|" url "|" status "|config_exposure" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
         }
 
         # Login bruteforce
@@ -641,9 +681,14 @@ detect_threats() {
         }
 
         # Admin/sensitive endpoint probing
+        # FIXED: Only count FAILED attempts (403/401/404) - successful logins are legitimate
         if (match(url_lower, /wp-admin|phpmyadmin|admin|administrator|login|wp-login|xmlrpc/) ||
             match(url_lower, /\.env|\.git|\.sql|backup|config\./)) {
-            print ip "|" domain "|" url > "'"$TEMP_DIR"'/admin_probes_raw.txt"
+            # Only flag failed access attempts (403 Forbidden, 401 Unauthorized, 404 Not Found)
+            # Successful access (200/302) means legitimate user or already compromised
+            if (status ~ /^(403|401|404)/) {
+                print ip "|" domain "|" url > "'"$TEMP_DIR"'/admin_probes_raw.txt"
+            }
         }
 
         # 404 scanning (reconnaissance)
@@ -722,6 +767,58 @@ detect_threats() {
     print_success "Threat detection complete"
 }
 
+#############################################################################
+# NEW: Success Rate & Behavior Analysis (Added for accuracy improvement)
+#############################################################################
+
+analyze_success_rates() {
+    print_info "Analyzing request success rates and behavior patterns..."
+
+    # Calculate success rate (200/301/302 vs 404/403) for each IP
+    awk -F'|' '
+    {
+        ip = $1
+        status = $4
+
+        # Count total requests
+        total[ip]++
+
+        # Count successful responses
+        if (status ~ /^(200|301|302)/) {
+            success[ip]++
+        }
+        # Count failed/blocked responses
+        else if (status ~ /^(404|403|401)/) {
+            failed[ip]++
+        }
+    }
+    END {
+        for (ip in total) {
+            success_count = (success[ip] ? success[ip] : 0)
+            failed_count = (failed[ip] ? failed[ip] : 0)
+            success_rate = (total[ip] > 0) ? int((success_count / total[ip]) * 100) : 0
+            fail_rate = (total[ip] > 0) ? int((failed_count / total[ip]) * 100) : 0
+
+            # High failure rate indicates scanning/probing
+            if (fail_rate >= 80 && total[ip] >= 20) {
+                print ip "|" total[ip] "|" fail_rate "|scanner" > "'"$TEMP_DIR"'/high_failure_ips.txt"
+            }
+            # Very high success rate + high volume could be scraping
+            else if (success_rate >= 90 && total[ip] >= 100) {
+                print ip "|" total[ip] "|" success_rate "|scraper" > "'"$TEMP_DIR"'/high_success_ips.txt"
+            }
+
+            # Output all rates for later analysis
+            print ip "|" total[ip] "|" success_rate "|" fail_rate > "'"$TEMP_DIR"'/ip_success_rates.txt"
+        }
+    }' "$TEMP_DIR/parsed_logs.txt"
+
+    # Touch files if they don't exist
+    touch "$TEMP_DIR/high_failure_ips.txt" "$TEMP_DIR/high_success_ips.txt" "$TEMP_DIR/ip_success_rates.txt"
+
+    print_success "Success rate analysis complete"
+}
+
 #############################################################################
 # Botnet Detection
 #############################################################################
@@ -959,6 +1056,31 @@ calculate_threat_scores() {
         [ -n "$ip" ] && threat_404_count["$ip"]=$count
     done < <(awk '{print $1, $2}' "$TEMP_DIR/404_scans.txt" | sed 's/|.*//')
 
+    # NEW: Load bot classifications to skip volume scoring for legitimate bots
+    declare -A legit_bot_ips
+    if [ -f "$TEMP_DIR/classified_bots.txt" ]; then
+        while IFS='|' read -r ip domain url status size ua method timestamp bot_type bot_name; do
+            if [ "$bot_type" = "legit" ]; then
+                legit_bot_ips["$ip"]=1
+            fi
+        done < "$TEMP_DIR/classified_bots.txt"
+    fi
+
+    # NEW: Load success rate data for scanning/scraping detection
+    declare -A scanner_ips scraper_ips ip_fail_rates
+    [ -f "$TEMP_DIR/high_failure_ips.txt" ] && while IFS='|' read -r ip total fail_rate category; do
+        scanner_ips["$ip"]=$fail_rate
+    done < "$TEMP_DIR/high_failure_ips.txt"
+
+    [ -f "$TEMP_DIR/high_success_ips.txt" ] && while IFS='|' read -r ip total success_rate category; do
+        scraper_ips["$ip"]=$success_rate
+    done < "$TEMP_DIR/high_success_ips.txt"
+
+    # Load all fail rates for threshold checks
+    [ -f "$TEMP_DIR/ip_success_rates.txt" ] && while IFS='|' read -r ip total success_rate fail_rate; do
+        ip_fail_rates["$ip"]=$fail_rate
+    done < "$TEMP_DIR/ip_success_rates.txt"
+
     # Now calculate scores for each IP (using pre-counted requests)
     for ip in "${!ip_request_counts[@]}"; do
         # Skip excluded IPs
@@ -969,11 +1091,31 @@ calculate_threat_scores() {
         score=0
         req_count=${ip_request_counts[$ip]}
 
-        # Base request volume (max 10 points)
-        if [ "$req_count" -gt 10000 ]; then score=$((score + 10))
-        elif [ "$req_count" -gt 5000 ]; then score=$((score + 8))
-        elif [ "$req_count" -gt 1000 ]; then score=$((score + 5))
-        elif [ "$req_count" -gt 500 ]; then score=$((score + 3))
+        # IMPROVED: Base request volume scoring
+        # Skip volume scoring for legitimate bots (Google, Bing, etc.)
+        if [ -z "${legit_bot_ips[$ip]}" ]; then
+            # Not a legitimate bot - apply volume scoring
+            if [ "$req_count" -gt 10000 ]; then score=$((score + 10))
+            elif [ "$req_count" -gt 5000 ]; then score=$((score + 8))
+            elif [ "$req_count" -gt 1000 ]; then score=$((score + 5))
+            elif [ "$req_count" -gt 500 ]; then score=$((score + 3))
+            fi
+        fi
+
+        # NEW: Success rate analysis bonuses
+        # High failure rate (80%+ 404/403) = scanning behavior
+        if [ -n "${scanner_ips[$ip]}" ]; then
+            fail_rate=${scanner_ips[$ip]}
+            if [ "$fail_rate" -ge 90 ]; then
+                score=$((score + 8))  # Very high failure rate
+            elif [ "$fail_rate" -ge 80 ]; then
+                score=$((score + 5))  # High failure rate
+            fi
+        fi
+
+        # High success rate (90%+ 200/301/302) + high volume = potential scraping
+        if [ -n "${scraper_ips[$ip]}" ] && [ "$req_count" -gt 500 ]; then
+            score=$((score + 7))  # Scraping behavior
         fi
 
         # Attack patterns
@@ -985,9 +1127,13 @@ calculate_threat_scores() {
         [ -n "${threat_ips_suspicious[$ip]}" ] && score=$((score + 10))
         [ -n "${threat_ips_ddos[$ip]}" ] && score=$((score + 10))
 
-        # Admin probing
+        # Admin probing - IMPROVED: Raised threshold to 50 (only failed attempts counted)
         admin_count=${threat_admin_count[$ip]:-0}
-        [ "$admin_count" -gt 20 ] 2>/dev/null && score=$((score + 5))
+        if [ "$admin_count" -gt 100 ] 2>/dev/null; then
+            score=$((score + 10))  # Excessive probing
+        elif [ "$admin_count" -gt 50 ] 2>/dev/null; then
+            score=$((score + 5))   # Moderate probing
+        fi
 
         # 404 scanning
         scan_404=${threat_404_count[$ip]:-0}
@@ -1979,6 +2125,7 @@ main() {
 
     detect_server_ips
     detect_threats
+    analyze_success_rates  # NEW: Analyze success/failure rates for better accuracy
     detect_botnets
     analyze_time_series
     calculate_threat_scores