diff --git a/modules/security/bot-analyzer.sh b/modules/security/bot-analyzer.sh index ee43209..6735186 100755 --- a/modules/security/bot-analyzer.sh +++ b/modules/security/bot-analyzer.sh @@ -45,6 +45,10 @@ LOG_DIR="${SYS_LOG_DIR:-/var/log/apache2/domlogs}" TOOLKIT_TMP_DIR="$SCRIPT_DIR/tmp" mkdir -p "$TOOLKIT_TMP_DIR" 2>/dev/null +# NEW: Baseline history directory (stores 30 days of historical data per domain) +BASELINE_DIR="$TOOLKIT_TMP_DIR/baseline_history" +mkdir -p "$BASELINE_DIR" 2>/dev/null + TEMP_DIR="$TOOLKIT_TMP_DIR/bot_analysis_$$" OUTPUT_FILE="$TOOLKIT_TMP_DIR/bot_analysis_report_$(date +%Y%m%d_%H%M%S).txt" DAYS_BACK="" # Empty means all logs, otherwise filter by days @@ -647,7 +651,119 @@ classify_bots() { } ############################################################################# -# NEW: Header Analysis for Bot Detection +# NEW: Baseline Management (historical tracking for anomaly detection) +############################################################################# + +save_baseline() { + print_info "Storing baseline metrics for anomaly comparison..." 
#############################################################################
# Baseline Management (historical tracking for anomaly detection)
#############################################################################

# Print the number of lines in file $1.
# Prints 0 when the file is missing or unreadable, so callers always get a
# bare integer that is safe to use in arithmetic.
_baseline_count_lines() {
    if [ -r "$1" ]; then
        awk 'END { print NR }' "$1"
    else
        echo 0
    fi
}

# Print how many rows of the '|'-delimited file $1 have a 2nd field that is
# exactly equal to $2. The comparison is a literal string match, so dots in a
# domain name are not treated as regex wildcards.
_baseline_count_field2() {
    if [ -r "$1" ]; then
        awk -F'|' -v want="$2" '($2 "") == want { n++ } END { print n + 0 }' "$1"
    else
        echo 0
    fi
}

# Record row $2 in history file $1.
# Any existing row carrying the same leading date stamp is replaced, so running
# the analysis several times in one day still yields one row per day. Only the
# newest 30 rows are kept. The file is rewritten through a temp file + mv.
_baseline_append_row() {
    local file="$1"
    local row="$2"
    local stamp="${row%%|*}"
    local tmp="$file.tmp"

    {
        if [ -f "$file" ]; then
            awk -F'|' -v stamp="$stamp" '($1 "") != stamp' "$file"
        fi
        printf '%s\n' "$row"
    } | tail -n 30 > "$tmp" && mv -f "$tmp" "$file"
}

# Store today's metrics so later runs can compare against history.
# Globals:  TEMP_DIR (read), BASELINE_DIR (written)
# Writes:
#   $BASELINE_DIR/<domain>_baseline.txt  date|requests|attacks|bots|high_risk_ips
#   $BASELINE_DIR/global_baseline.txt    date|requests|unique_ips|bot_pct|
#                                        total_attacks|sqli|xss|path|rce|login|
#                                        high_risk_ips
# NOTE(review): per-domain counts assume the domain is the 2nd '|' field of
# parsed_logs.txt, attack_vectors_raw.txt and classified_bots.txt, as the
# original grep patterns did - confirm against the producers of those files.
save_baseline() {
    print_info "Storing baseline metrics for anomaly comparison..."

    local today
    today=$(date +%Y%m%d)

    mkdir -p "$BASELINE_DIR" 2>/dev/null

    # Server-wide metrics
    local total_requests bot_requests
    local unique_ips=0
    local bot_pct=0
    total_requests=$(_baseline_count_lines "$TEMP_DIR/parsed_logs.txt")
    bot_requests=$(_baseline_count_lines "$TEMP_DIR/classified_bots.txt")
    if [ -r "$TEMP_DIR/parsed_logs.txt" ]; then
        unique_ips=$(awk -F'|' '!seen[$1]++ { n++ } END { print n + 0 }' "$TEMP_DIR/parsed_logs.txt")
    fi
    if [ "$total_requests" -gt 0 ]; then
        bot_pct=$((bot_requests * 100 / total_requests))
    fi

    local sqli_attempts xss_attempts path_attempts rce_attempts login_attempts
    sqli_attempts=$(_baseline_count_lines "$TEMP_DIR/sqli_attempts.txt")
    xss_attempts=$(_baseline_count_lines "$TEMP_DIR/xss_attempts.txt")
    path_attempts=$(_baseline_count_lines "$TEMP_DIR/path_traversal_attempts.txt")
    rce_attempts=$(_baseline_count_lines "$TEMP_DIR/rce_upload_attempts.txt")
    login_attempts=$(_baseline_count_lines "$TEMP_DIR/login_bruteforce_attempts.txt")
    local total_attacks=$((sqli_attempts + xss_attempts + path_attempts + rce_attempts + login_attempts))

    # Rows of threat_scores.txt whose first field (the score) is 70 or more
    local high_risk_ips=0
    if [ -r "$TEMP_DIR/threat_scores.txt" ]; then
        high_risk_ips=$(awk -F'|' '$1 >= 70 { n++ } END { print n + 0 }' "$TEMP_DIR/threat_scores.txt")
    fi

    # Per-domain history
    if [ -f "$TEMP_DIR/all_domains.txt" ]; then
        local domain domain_requests domain_attacks domain_bots
        while IFS= read -r domain || [ -n "$domain" ]; do
            # Skip blank entries and anything that would escape BASELINE_DIR
            case "$domain" in
                ''|*/*) continue ;;
            esac

            domain_requests=$(_baseline_count_field2 "$TEMP_DIR/parsed_logs.txt" "$domain")
            domain_attacks=$(_baseline_count_field2 "$TEMP_DIR/attack_vectors_raw.txt" "$domain")
            domain_bots=$(_baseline_count_field2 "$TEMP_DIR/classified_bots.txt" "$domain")

            _baseline_append_row "$BASELINE_DIR/${domain}_baseline.txt" \
                "$today|$domain_requests|$domain_attacks|$domain_bots|$high_risk_ips"
        done < "$TEMP_DIR/all_domains.txt"
    fi

    # Server-wide history
    _baseline_append_row "$BASELINE_DIR/global_baseline.txt" \
        "$today|$total_requests|$unique_ips|$bot_pct|$total_attacks|$sqli_attempts|$xss_attempts|$path_attempts|$rce_attempts|$login_attempts|$high_risk_ips"

    print_success "Baseline stored"
}

# Print the stored history rows for domain $1 (nothing if none exist).
get_domain_baseline() {
    local domain="$1"
    local baseline_file="$BASELINE_DIR/${domain}_baseline.txt"

    if [ -f "$baseline_file" ]; then
        cat "$baseline_file"
    fi
}

# Print the integer average of one metric over the newest N history rows.
# Arguments:
#   $1 - domain name; "server" or "global" selects the server-wide history
#        when no per-domain file of that name exists
#   $2 - metric: requests (default), attacks, bots, high_risk
#   $3 - number of rows to average (default 7; non-numeric values fall back to 7)
# Outputs: the average, or 0 when there is no history.
# Note: in the server-wide history the "bots" column is a percentage of
# requests (bot_pct), not a request count.
calculate_baseline_average() {
    local domain="$1"
    local metric="$2"   # requests, attacks, bots, high_risk
    local days="${3:-7}"

    case "$days" in
        ''|*[!0-9]*) days=7 ;;
    esac

    local baseline_file="$BASELINE_DIR/${domain}_baseline.txt"
    local col=2   # requests by default
    case "$metric" in
        attacks) col=3 ;;
        bots) col=4 ;;
        high_risk) col=5 ;;
    esac

    # Nothing writes a "server" per-domain file, so route that name to the
    # global history, which uses a different column layout.
    if [ ! -f "$baseline_file" ]; then
        case "$domain" in
            server|global)
                baseline_file="$BASELINE_DIR/global_baseline.txt"
                col=2
                case "$metric" in
                    attacks) col=5 ;;
                    bots) col=4 ;;
                    high_risk) col=11 ;;
                esac
                ;;
        esac
    fi

    if [ ! -f "$baseline_file" ]; then
        echo "0"
        return
    fi

    tail -n "$days" "$baseline_file" 2>/dev/null |
        awk -F'|' -v col="$col" '
            { sum += $col; count++ }
            END { if (count > 0) print int(sum / count); else print 0 }
        '
}
#############################################################################
# Attack Progression/Timeline Analysis
#############################################################################

# For up to 20 IPs whose threat score is 70 or more, write that IP's requests
# in sorted order and record the first entry as the start of its attack phase.
# Globals: TEMP_DIR (read)
# Reads:   $TEMP_DIR/threat_scores.txt  (field 1 = score, field 2 = IP)
#          $TEMP_DIR/parsed_logs.txt    (field 1 = IP; fields 8, 3, 4, 6 copied)
# Writes:  $TEMP_DIR/progression_<ip>.txt  field8|field3|field4|field6, sorted
#          $TEMP_DIR/attack_phases.txt     ip|phase|first field-8 value
# NOTE(review): field 8 is presumably the request timestamp, which is why the
# rows are sorted on it - confirm against the log parser.
analyze_attack_progression() {
    print_info "Analyzing attack progression and sequences..."

    local phases_file="$TEMP_DIR/attack_phases.txt"
    local ip progression_file phase phase_start

    # Start from an empty file so calling this twice does not duplicate rows
    : > "$phases_file"

    while IFS= read -r ip; do
        [ -n "$ip" ] || continue
        progression_file="$TEMP_DIR/progression_${ip}.txt"

        # Exact string match on the IP: a regex would treat its dots as wildcards
        awk -F'|' -v ip="$ip" '($1 "") == ip {
            print $8 "|" $3 "|" $4 "|" $6
        }' "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | sort > "$progression_file"

        # Phase detection is currently a placeholder: every IP is labelled
        # "reconnaissance", starting at its first sorted entry.
        phase="reconnaissance"
        phase_start=$(head -n 1 "$progression_file" 2>/dev/null | cut -d'|' -f1)

        printf '%s|%s|%s\n' "$ip" "$phase" "$phase_start" >> "$phases_file"
    done < <(awk -F'|' '$1 >= 70 { print $2 }' "$TEMP_DIR/threat_scores.txt" 2>/dev/null | head -n 20)

    print_success "Attack progression analysis complete"
}
#############################################################################
# Fingerprinting - Combine multiple signals for accuracy
#############################################################################

# Score every client IP in parsed_logs.txt on several bot indicators and keep
# those scoring 60 or more.
# Globals: TEMP_DIR (read); fingerprint_count (written, left global as before)
# Reads:   $TEMP_DIR/parsed_logs.txt - fields used: 1 = IP, 3 = URL,
#          4 = status, 6 = user agent, 9 = referer, 10 = accept-language
# Writes:  $TEMP_DIR/bot_fingerprints.txt as ip|score|signal_count, highest
#          score first, so the report's "first 15 rows" are the top 15.
# Scoring: +20 bot-like UA, +15 missing/odd accept-language, +15 when more than
#          70% of requests lack a referer, +20 admin/config URL access,
#          +25 when 3 or more of those signals fire, -10 when more than 70% of
#          requests got 200/301/302. The result is clamped to 0..100.
calculate_bot_fingerprint() {
    print_info "Calculating bot fingerprint confidence scores (combining multiple signals)..."

    local logs="$TEMP_DIR/parsed_logs.txt"
    local out="$TEMP_DIR/bot_fingerprints.txt"

    # Always leave a (possibly empty) result file behind for the report
    : > "$out"

    if [ -r "$logs" ]; then
        awk -F'|' '
        {
            ip = $1
            url = $3
            status = $4
            referer = $9
            accept_lang = $10

            # awk has no negative lookahead, so blank out "javascript" first;
            # a plain "java" match then cannot fire on it, and a UA that ends
            # in "java" is still caught.
            ua = tolower($6)
            gsub(/javascript/, " ", ua)

            requests[ip]++

            # Signal 1: bot-like User-Agent
            if (ua ~ /bot|crawler|spider|scraper|curl|wget|python|java|perl|ruby|node\.js|headless|mechanize/) {
                ua_bot[ip]++
            }

            # Signal 2: missing/unusual Accept-Language
            if (accept_lang == "-" || accept_lang == "" || accept_lang == "*/*") {
                lang_anomaly[ip]++
            }

            # Signal 3: missing Referer
            if (referer == "-" || referer == "") {
                no_referer[ip]++
            }

            # Successful responses (used to lower the score)
            if (status ~ /^(200|301|302)/) {
                ok_hits[ip]++
            }

            # Signal 4: direct admin/config access
            if (url ~ /\/(wp-admin|phpmyadmin|admin|config\.php|\.env|\.git|\.htaccess|web\.config)/) {
                admin_hits[ip]++
            }
        }
        END {
            for (ip in requests) {
                score = 0
                signals = 0

                if (ua_bot[ip] > 0)                       { score += 20; signals++ }
                if (lang_anomaly[ip] > 0)                 { score += 15; signals++ }
                if (no_referer[ip] > requests[ip] * 0.7)  { score += 15; signals++ }
                if (admin_hits[ip] > 0)                   { score += 20; signals++ }

                # Mostly-successful traffic is more likely legitimate
                if (ok_hits[ip] > requests[ip] * 0.7) score -= 10

                # Confidence rises when several independent signals agree
                if (signals >= 3) score += 25

                if (score > 100) score = 100
                if (score < 0) score = 0

                if (score >= 60) printf "%s|%d|%d\n", ip, score, signals
            }
        }
        ' "$logs" | LC_ALL=C sort -t'|' -k2,2nr -k1,1 > "$out"
    fi

    # Not declared local: other parts of the script may read this global.
    fingerprint_count=$(awk 'END { print NR }' "$out")
    print_success "Fingerprint analysis complete ($fingerprint_count high-confidence bot IPs)"
}
#############################################################################
# Domain Targeting Analysis - Which domains are being attacked?
#############################################################################

# Summarise traffic and attacks per domain.
# Globals: TEMP_DIR (read)
# Reads:   $TEMP_DIR/parsed_logs.txt        (field 1 = IP, field 2 = domain)
#          $TEMP_DIR/attack_vectors_raw.txt (field 1 = IP, field 2 = domain,
#                                            field 5 = attack type)
# Writes:  $TEMP_DIR/domain_targeting.txt as domain|unique_ips|request_count,
#          busiest domain first
#          $TEMP_DIR/domain_attacks_<domain>.txt as
#          attack_type|ip|count|total_for_type
# The awk programs use SUBSEP-joined keys instead of gawk-only arrays of
# arrays, so they also run under mawk and BSD awk.
# NOTE(review): the original computed the domain_attacks_<domain>.txt path but
# printed the rows to stdout; writing the file is the presumed intent - confirm
# nothing consumed that stdout.
analyze_domain_targeting_percentage() {
    print_info "Analyzing per-domain attack patterns (what's attacking each domain)..."

    local logs="$TEMP_DIR/parsed_logs.txt"
    local attacks="$TEMP_DIR/attack_vectors_raw.txt"
    local summary="$TEMP_DIR/domain_targeting.txt"

    # Per-domain request totals and distinct client IPs. Only parsed_logs.txt
    # is read here: the old "NR == FNR { next }" guard also skipped every log
    # line whenever the attack file was empty.
    : > "$summary"
    if [ -r "$logs" ]; then
        awk -F'|' '
        {
            requests[$2]++
            if (!(($2 SUBSEP $1) in seen)) {
                seen[$2 SUBSEP $1] = 1
                unique_ips[$2]++
            }
        }
        END {
            for (domain in requests) {
                printf "%s|%d|%d\n", domain, unique_ips[domain], requests[domain]
            }
        }
        ' "$logs" | LC_ALL=C sort -t'|' -k3,3nr -k1,1 > "$summary"
    fi

    # Per-domain attack breakdown: count per (domain, type, ip) plus the total
    # per (domain, type), then split the sorted rows into one file per domain.
    if [ -r "$attacks" ]; then
        awk -F'|' '
        {
            per_ip[$2 SUBSEP $5 SUBSEP $1]++
            per_type[$2 SUBSEP $5]++
        }
        END {
            for (key in per_ip) {
                split(key, part, SUBSEP)
                printf "%s|%s|%s|%d|%d\n", part[1], part[2], part[3], per_ip[key], per_type[part[1] SUBSEP part[2]]
            }
        }
        ' "$attacks" |
            LC_ALL=C sort -t'|' -k1,1 -k2,2 -k4,4nr -k3,3 |
            ATTACK_OUT_DIR="$TEMP_DIR" awk -F'|' '
            {
                domain = $1 ""
                if (domain != current) {
                    # Rows arrive grouped by domain, so one file is open at a time
                    if (out != "") close(out)
                    current = domain
                    out = ""
                    # Skip blank domains and names that would leave the temp dir
                    if (domain != "" && index(domain, "/") == 0) {
                        out = ENVIRON["ATTACK_OUT_DIR"] "/domain_attacks_" domain ".txt"
                    }
                }
                if (out != "") print $2 "|" $3 "|" $4 "|" $5 > out
            }
            '
    fi

    print_success "Domain attack pattern analysis complete"
}
#############################################################################
# Top URLs Analysis - What files/endpoints are bots hitting?
#############################################################################

# Build a per-domain list of requested URLs, most requested first.
# Globals: TEMP_DIR (read)
# Reads:   $TEMP_DIR/domain_targeting.txt (field 1 = domain; other fields unused)
#          $TEMP_DIR/parsed_logs.txt      (field 2 = domain, field 3 = URL)
# Writes:  $TEMP_DIR/domain_urls_<domain>.txt as url|count for each domain
#          listed in domain_targeting.txt (no limit on the number of URLs)
# The log is scanned once in total rather than once per domain.
analyze_top_urls_per_domain() {
    print_info "Analyzing top targeted URLs per domain..."

    local targets="$TEMP_DIR/domain_targeting.txt"
    local logs="$TEMP_DIR/parsed_logs.txt"

    # -s guarantees the first awk input is non-empty, which the FNR == NR
    # "first file" test below depends on.
    if [ -s "$targets" ] && [ -r "$logs" ]; then
        awk -F'|' '
        FNR == NR { wanted[$1] = 1; next }
        ($2 in wanted) { hits[$2 SUBSEP $3]++ }
        END {
            for (key in hits) {
                split(key, part, SUBSEP)
                printf "%s|%s|%d\n", part[1], part[2], hits[key]
            }
        }
        ' "$targets" "$logs" |
            LC_ALL=C sort -t'|' -k1,1 -k3,3nr -k2,2 |
            URL_OUT_DIR="$TEMP_DIR" awk -F'|' '
            {
                domain = $1 ""
                if (domain != current) {
                    # Rows arrive grouped by domain, so one file is open at a time
                    if (out != "") close(out)
                    current = domain
                    out = ""
                    # Skip blank domains and names that would leave the temp dir
                    if (domain != "" && index(domain, "/") == 0) {
                        out = ENVIRON["URL_OUT_DIR"] "/domain_urls_" domain ".txt"
                    }
                }
                if (out != "") print $2 "|" $3 > out
            }
            '
    fi

    print_success "Top URLs analysis complete"
}
# Store current results for comparison with previous analysis local history_dir="$TOOLKIT_TMP_DIR/analysis_history" @@ -1715,13 +2034,51 @@ generate_comparison_report() { echo "Fuzzing_IPs: $(wc -l < "$TEMP_DIR/fuzzing_ips.txt" 2>/dev/null || echo 0)" } > "$latest_report" + # NEW: Generate baseline comparison + echo "" + print_header "BASELINE COMPARISON (Is this activity normal?)" + + local total_requests=$(grep "^Total_Requests:" "$latest_report" | cut -d: -f2 | tr -d ' ') + local baseline_requests=$(calculate_baseline_average "server" "requests" 7) + + if [ "$baseline_requests" -gt 0 ]; then + local request_pct=$((total_requests * 100 / baseline_requests)) + if [ "$request_pct" -gt 200 ]; then + echo -e "${RED}🔴 ABNORMAL: Requests are $(($request_pct - 100))% above 7-day average${NC}" + echo " Baseline (7-day avg): $baseline_requests requests" + echo " Today: $total_requests requests" + elif [ "$request_pct" -lt 50 ]; then + echo "🟢 LOW: Requests are $(($((100 - $request_pct))))% below baseline" + else + echo "🟡 NORMAL: Requests within expected range" + fi + else + echo "📊 (No historical baseline yet - first analysis)" + fi + + local high_risk=$(grep "^High_Risk_IPs:" "$latest_report" | cut -d: -f2 | tr -d ' ') + local baseline_attacks=$(calculate_baseline_average "server" "high_risk" 7) + + if [ "$baseline_attacks" -gt 0 ]; then + local attack_ratio=$((high_risk / baseline_attacks)) + if [ "$attack_ratio" -gt 3 ]; then + echo -e "${RED}🔴 ABNORMAL: High-risk IPs are ${attack_ratio}x above baseline${NC}" + echo " Baseline (7-day avg): $baseline_attacks high-risk IPs" + echo " Today: $high_risk high-risk IPs" + elif [ "$high_risk" -gt "$baseline_attacks" ]; then + echo -e "${YELLOW}🟡 ELEVATED: $high_risk high-risk IPs (baseline: $baseline_attacks)${NC}" + else + echo "🟢 NORMAL: High-risk IPs within expected range" + fi + fi + # Compare with previous day's analysis local yesterday=$(date -d "1 day ago" +%Y%m%d 2>/dev/null || date -v-1d +%Y%m%d 2>/dev/null) local 
previous_report="$history_dir/latest_analysis_${yesterday}.txt" if [ -f "$previous_report" ]; then echo "" - print_header "THREAT TREND ANALYSIS (Compared to previous day)" + print_header "DAY-OVER-DAY TRENDS" # Extract metrics and calculate differences local curr_high_risk=$(grep "^High_Risk_IPs:" "$latest_report" | cut -d: -f2 | tr -d ' ') @@ -1735,9 +2092,9 @@ generate_comparison_report() { # Display trend if [ "$risk_diff" -gt 0 ]; then - echo "⚠️ High-Risk IPs: $curr_high_risk (↑ $risk_diff, $risk_pct% increase)" + echo "⚠️ High-Risk IPs: $curr_high_risk (↑ $risk_diff IPs, +${risk_pct}%)" elif [ "$risk_diff" -lt 0 ]; then - echo "✓ High-Risk IPs: $curr_high_risk (↓ $((risk_diff * -1)), ${risk_pct}% decrease)" + echo "✓ High-Risk IPs: $curr_high_risk (↓ $((risk_diff * -1)) IPs, ${risk_pct}%)" else echo "→ High-Risk IPs: $curr_high_risk (no change)" fi @@ -1748,9 +2105,11 @@ generate_comparison_report() { local sql_diff=$((curr_sql - prev_sql)) if [ "$sql_diff" -gt 0 ]; then - echo "⚠️ SQL Injection Attempts: $curr_sql (↑ $sql_diff new attempts)" + echo "⚠️ SQL Injection: $curr_sql (↑ $sql_diff new attempts)" elif [ "$sql_diff" -lt 0 ]; then - echo "✓ SQL Injection Attempts: $curr_sql (↓ $((sql_diff * -1)) fewer)" + echo "✓ SQL Injection: $curr_sql (↓ $((sql_diff * -1)) fewer)" + else + echo "→ SQL Injection: $curr_sql (stable)" fi # Track repeat attackers @@ -1758,7 +2117,7 @@ generate_comparison_report() { if [ -f "$history_dir/known_attackers_${yesterday}.txt" ]; then repeat_attackers=$(grep -Fx -f <(awk -F'|' '$1 >= 70 {print $2}' "$TEMP_DIR/threat_scores.txt" 2>/dev/null) "$history_dir/known_attackers_${yesterday}.txt" 2>/dev/null | wc -l || echo 0) if [ "$repeat_attackers" -gt 0 ]; then - echo "🔄 Repeat Attackers: $repeat_attackers IPs from previous day" + echo -e "${RED}🔄 REPEAT ATTACKERS: $repeat_attackers IPs from yesterday${NC}" fi fi fi @@ -2028,6 +2387,125 @@ generate_report() { done < "$TEMP_DIR/false_positives.txt" | head -6 fi + # NEW: 
HIGH-CONFIDENCE BOT FINGERPRINTS + if [ -s "$TEMP_DIR/bot_fingerprints.txt" ]; then + echo "" + print_header "HIGH-CONFIDENCE BOT FINGERPRINTS (Multi-signal analysis - reduced false positives)" + echo "These IPs show MULTIPLE bot indicators combined (not just single signal):" + echo "" + + awk -F'|' ' + NR <= 15 { + ip = $1 + score = $2 + signals = $3 + + # Risk level based on score + if (score >= 80) risk = "CRITICAL" + else if (score >= 70) risk = "HIGH" + else if (score >= 60) risk = "MEDIUM" + else risk = "LOW" + + printf " %s - Score: %2d/100 - Risk: %s - Signals: %d\n", ip, score, risk, signals + }' "$TEMP_DIR/bot_fingerprints.txt" + + total=$(wc -l < "$TEMP_DIR/bot_fingerprints.txt" 2>/dev/null || echo "0") + echo "" + echo " Total high-confidence bots detected: $total IPs" + echo "" + else + echo "" + echo " No high-confidence bot fingerprints detected (requires multiple signals)" + echo "" + fi + + # NEW: DOMAIN ATTACK TARGETING ANALYSIS (what's attacking each domain) + if [ -s "$TEMP_DIR/domain_targeting.txt" ]; then + echo "" + print_header "DOMAIN ATTACK TARGETING (Which domains are under attack & from where?)" + echo "" + + total_domains=$(wc -l < "$TEMP_DIR/domain_targeting.txt" 2>/dev/null || echo "0") + echo "Total domains with attacks detected: $total_domains" + echo "" + + # Show top attacked domains with attack details + awk -F'|' 'NR <= 10 {print $1}' "$TEMP_DIR/domain_targeting.txt" | while read -r domain; do + domain_attack_count=$(grep "^[^|]*|${domain}|" "$TEMP_DIR/attack_vectors_raw.txt" 2>/dev/null | wc -l || echo "0") + + if [ "$domain_attack_count" -gt 0 ]; then + echo " Domain: $domain ($domain_attack_count attack attempts)" + + # Get all attacks on this domain, group by type + awk -F'|' -v dom="$domain" ' + $2 == dom { + ip = $1 + attack_type = $5 + + # Validate IP format + if (match(ip, /^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$/)) { + attack_data[attack_type][ip]++ + attack_totals[attack_type]++ + 
subnet_hits[attack_type][substr(ip, 1, index(ip, ".", index(ip, ".")+1)-1)]++ + } + } + END { + for (attack_type in attack_totals) { + printf " └─ %s: %d attempts\n", attack_type, attack_totals[attack_type] + + # Show top 3 IPs for this attack type + attack_count = 0 + for (ip in attack_data[attack_type]) { + if (attack_count >= 3) break + count = attack_data[attack_type][ip] + split(ip, parts, ".") + subnet = parts[1] "." parts[2] "." parts[3] ".0/24" + printf " ├─ %s (%d reqs) [subnet: %s]\n", ip, count, subnet + attack_count++ + } + } + }' "$TEMP_DIR/attack_vectors_raw.txt" + echo "" + fi + done + else + echo "" + echo " No domain attack data available (all domains may be healthy)" + echo "" + fi + + # NEW: TOP URLs BEING ATTACKED + if [ -f "$TEMP_DIR/domain_targeting.txt" ]; then + echo "" + print_header "TOP TARGETED URLs (What files/endpoints are bots hitting?)" + echo "" + + # Show top URLs for top 3 most-attacked domains + urls_shown=0 + awk -F'|' 'NR <= 3 {print $1}' "$TEMP_DIR/domain_targeting.txt" | while read -r domain; do + local domain_file="$TEMP_DIR/domain_urls_${domain}.txt" + if [ -f "$domain_file" ] && [ -s "$domain_file" ]; then + echo " Domain: $domain" + awk -F'|' '{ + url = $1 + count = $2 + printf " %3d requests → %s\n", count, url + }' "$domain_file" # Show all URLs, not just top 5 + echo "" + fi + done + + # Check if no URL data was shown + if [ "$urls_shown" -eq 0 ]; then + echo " No URL targeting data available" + echo "" + fi + else + echo "" + echo " No domain targeting data available" + echo "" + fi + # TOP 5 THREATS print_header "TOP 5 THREATS (with recommended actions)" @@ -2652,21 +3130,32 @@ main() { exit 1 } - # NEW: Enhanced analysis functions + # NEW: Enhanced analysis functions (before threats detected) analyze_headers # Detect header-based bot patterns analyze_entry_points # Detect suspicious entry points analyze_url_entropy # Detect fuzzing/parameter scanning analyze_request_timing # Detect DDoS patterns via timing 
detect_server_ips - detect_threats + detect_threats # Must be before fingerprinting/domain targeting (creates attack_vectors_raw.txt) analyze_success_rates # Analyze success/failure rates for better accuracy detect_botnets analyze_time_series calculate_threat_scores detect_false_positives generate_statistics - generate_comparison_report # NEW: Show trends vs previous day + + # NEW: Fingerprinting and domain targeting analysis (after threats detected) + calculate_bot_fingerprint # Combine signals for accuracy (reduce false positives) + analyze_domain_targeting_percentage # Show which domains are being targeted + analyze_top_urls_per_domain # Show what files/endpoints are being hit + + generate_comparison_report # Show trends vs previous day + + # NEW: Baseline and progression analysis + save_baseline # Store current metrics for historical comparison + analyze_attack_progression # Show attack sequences and phases + generate_report print_success "Analysis complete!"