Filter out legitimate browsers from bot analyzer

- Added a browser-detection filter to the unidentified-bot branch of classify_bots()
- Excludes Chrome, Firefox, Safari, Edge, Opera, Vivaldi, Samsung Browser, UC Browser, and Yandex Browser
- Treats Mozilla/5.0 user agents containing AppleWebKit or Gecko as legitimate browsers, unless they also contain "bot", "crawler", or "spider"
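- Filters mobile and desktop platform user agents (Android mobile, iPhone, iPad, Windows NT, Macintosh, Linux x86), unless they also contain "bot", "crawler", or "spider"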
- Only flags actual bots, not regular user traffic: records whose bot_type is still "unknown" are no longer written to the classified output
- Prevents false positives from browser user agents
This commit is contained in:
cschantz
2025-11-03 19:05:39 -05:00
parent e3525a3a96
commit e396df5b1a
+14 -1
View File
@@ -496,13 +496,26 @@ classify_bots() {
} }
} }
} else if (match(ua_lower, /bot|crawler|spider|scraper|curl|wget|python-|java\/|scan/)) { } else if (match(ua_lower, /bot|crawler|spider|scraper|curl|wget|python-|java\/|scan/)) {
# FILTER OUT legitimate browsers that might contain "bot" in version strings
# Common browsers: Chrome, Firefox, Safari, Edge, Opera, Samsung Browser, etc.
if (match(ua_lower, /chrome\/|firefox\/|safari\/|edg\/|edge\/|opr\/|opera\//) ||
match(ua_lower, /mozilla\/5\.0/) && match(ua_lower, /applewebkit|gecko/) && !match(ua_lower, /bot|crawler|spider/) ||
match(ua_lower, /samsungbrowser|ucbrowser|yabrowser|vivaldi/) ||
match(ua_lower, /android.*mobile|iphone|ipad|windows nt|macintosh|linux x86/) && !match(ua_lower, /bot|crawler|spider/)) {
# This is a legitimate browser, skip it
next
}
bot_type = "unidentified_bot" bot_type = "unidentified_bot"
# Extract first word of UA as bot name # Extract first word of UA as bot name
match(ua, /^[^ ]+/, name) match(ua, /^[^ ]+/, name)
bot_name = substr(name[0], 1, 30) bot_name = substr(name[0], 1, 30)
} }
print ip "|" domain "|" url "|" status "|" size "|" ua "|" method "|" timestamp "|" bot_type "|" bot_name # Only print if bot_type is not "unknown" (i.e., we identified it as something)
if (bot_type != "unknown") {
print ip "|" domain "|" url "|" status "|" size "|" ua "|" method "|" timestamp "|" bot_type "|" bot_name
}
}' "$TEMP_DIR/parsed_logs.txt" > "$TEMP_DIR/classified_bots.txt" }' "$TEMP_DIR/parsed_logs.txt" > "$TEMP_DIR/classified_bots.txt"
if [ ! -s "$TEMP_DIR/classified_bots.txt" ]; then if [ ! -s "$TEMP_DIR/classified_bots.txt" ]; then