Fix critical bugs in bot-analyzer: gzipped file access, performance, and scoping issues

CRITICAL FIXES:
- Fix gzipped file access bug causing script to hang at "Calculating threat scores"
  - Changed all parsed_logs.txt references to use zcat on .gz files
  - Fixed lines 1203, 1315, 1324, 1800, 1807, 1810, 1823-1824, 2781

- Fix user_domains scoping bug preventing user filtering (-u flag)
  - Export user_domains from main() before parse_logs() call

- Fix TOOLKIT_BASE_DIR undefined variable
  - Changed to SCRIPT_DIR in lines 1551, 2732

CODE QUALITY:
- Add missing BOLD color code definition
- Add is_valid_ip() function for IPv4/IPv6 validation
- Integrate IP validation into is_excluded_ip() so malformed addresses are excluded from analysis

PERFORMANCE OPTIMIZATION:
- Major optimization in analyze_domain_threats()
  - Create indexed lookup files (one-time decompression)
  - Eliminates nested zcat calls (previously 4 per IP per domain)
  - Expected 10-100x speedup for servers with 200+ domains

SYSTEM DETECTION:
- Add firewall detection exports to system-detect.sh

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
cschantz
2025-11-18 19:35:55 -05:00
parent ae1794cf3d
commit fbfee2061e
2 changed files with 57 additions and 18 deletions
+54 -18
View File
@@ -190,6 +190,7 @@ YELLOW='\033[1;33m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color
# Check for required commands
@@ -724,10 +725,39 @@ detect_server_ips() {
fi
}
# Helper function to validate IP address format
#
# Arguments:
#   $1 - candidate address string
# Returns:
#   0 if $1 is a dotted-quad IPv4 address whose four octets are all <= 255,
#     or a hex-only IPv6 address written either as exactly eight groups or
#     in compressed form with a single "::".
#   1 otherwise (including the empty string).
# Not accepted: IPv4-mapped IPv6 text (::ffff:1.2.3.4), zone ids (%eth0),
# CIDR suffixes (/24).
is_valid_ip() {
  local ip="$1"
  local octet # keep the loop variable out of the caller's scope

  # IPv4 validation: capture the four octets directly instead of splitting on IFS
  if [[ "$ip" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]]; then
    for octet in "${BASH_REMATCH[@]:1}"; do
      # 10# forces base 10 so octets with leading zeros ("08") are not
      # parsed as octal by arithmetic evaluation
      if (( 10#$octet > 255 )); then
        return 1 # Invalid
      fi
    done
    return 0 # Valid IPv4
  fi

  # IPv6 validation
  local hextet='[0-9a-fA-F]{1,4}'
  local full_re="^(${hextet}:){7}${hextet}\$"
  # One side of a "::" - empty, or colon-separated non-empty groups
  local side_re="^(${hextet}(:${hextet})*)?\$"

  # Uncompressed form: exactly eight groups
  if [[ "$ip" =~ $full_re ]]; then
    return 0 # Valid IPv6
  fi

  # Compressed form: split around the first "::". A second "::" or a stray
  # colon (":::") ends up inside $tail and fails side_re, so strings such as
  # "1::2::3" are rejected. Strings without "::" that are not eight groups
  # (e.g. a timestamp like "19:35:55") never reach this branch.
  if [[ "$ip" == *::* ]]; then
    local head="${ip%%::*}"
    local tail="${ip#*::}"

    if [[ ! "$head" =~ $side_re ]] || [[ ! "$tail" =~ $side_re ]]; then
      return 1 # Invalid
    fi

    # Count explicit groups: (number of colons + 1) per non-empty side
    local head_colons="${head//[!:]/}"
    local tail_colons="${tail//[!:]/}"
    local groups=0
    if [[ -n "$head" ]]; then
      groups=$(( groups + ${#head_colons} + 1 ))
    fi
    if [[ -n "$tail" ]]; then
      groups=$(( groups + ${#tail_colons} + 1 ))
    fi

    # "::" stands for at least one zero group, so at most seven may be explicit
    if (( groups <= 7 )); then
      return 0 # Valid IPv6
    fi
  fi

  return 1 # Invalid
}
# Helper function to check if an IP should be excluded
is_excluded_ip() {
local ip="$1"
# First validate IP format
if ! is_valid_ip "$ip"; then
return 0 # Exclude invalid IPs
fi
# Check if private/internal IP
if [[ "$ip" =~ ^127\. ]] || \
[[ "$ip" =~ ^10\. ]] || \
@@ -1199,7 +1229,7 @@ generate_report() {
ip=$(echo "$line" | cut -d'|' -f1)
service=$(echo "$line" | cut -d'|' -f2)
domain=$(echo "$line" | cut -d'|' -f4)
req_count=$(grep -c "^$ip|" "$TEMP_DIR/parsed_logs.txt" 2>/dev/null || echo 0)
req_count=$(zcat "$TEMP_DIR/parsed_logs.txt.gz" 2>/dev/null | grep -c "^$ip|" || echo 0)
echo " $ip - $req_count requests - Identified as: $service"
echo " → Domain: $domain"
echo " → Action: VERIFY OWNERSHIP then whitelist"
@@ -1310,8 +1340,8 @@ generate_report() {
if [ -s "$TEMP_DIR/large_transfers.txt" ]; then
# Calculate total bot bandwidth
total_bot_bandwidth=0
if [ -f "$TEMP_DIR/classified_bots.txt" ]; then
total_bot_bandwidth=$(awk -F'|' '$9 != "unknown" && $5 ~ /^[0-9]+$/ {sum += $5} END {print sum}' "$TEMP_DIR/classified_bots.txt")
if [ -f "$TEMP_DIR/classified_bots.txt.gz" ]; then
total_bot_bandwidth=$(zcat "$TEMP_DIR/classified_bots.txt.gz" | awk -F'|' '$9 != "unknown" && $5 ~ /^[0-9]+$/ {sum += $5} END {print sum}')
fi
if [ -n "$total_bot_bandwidth" ] && [ "$total_bot_bandwidth" -gt 0 ]; then
@@ -1320,7 +1350,7 @@ generate_report() {
# Estimate cost at $0.09/GB (typical CDN pricing)
estimated_cost=$(awk "BEGIN {printf \"%.2f\", ($total_bot_bandwidth/1073741824) * 0.09}")
total_bandwidth=$(awk -F'|' '$5 ~ /^[0-9]+$/ {sum += $5} END {print sum}' "$TEMP_DIR/parsed_logs.txt")
total_bandwidth=$(zcat "$TEMP_DIR/parsed_logs.txt.gz" | awk -F'|' '$5 ~ /^[0-9]+$/ {sum += $5} END {print sum}')
bot_pct=$(awk "BEGIN {printf \"%.1f\", ($total_bot_bandwidth/$total_bandwidth)*100}")
echo ""
@@ -1547,8 +1577,8 @@ baseline_health_check() {
# If no domains found from log files, try reference database
if [ ! -s "$TEMP_DIR/domain_list.txt" ]; then
if [ -s "$TOOLKIT_BASE_DIR/.sysref" ]; then
grep "^DOMAIN|" "$TOOLKIT_BASE_DIR/.sysref" 2>/dev/null | \
if [ -s "$SCRIPT_DIR/.sysref" ]; then
grep "^DOMAIN|" "$SCRIPT_DIR/.sysref" 2>/dev/null | \
cut -d'|' -f2 | sort -u > "$TEMP_DIR/domain_list.txt"
fi
fi
@@ -1722,12 +1752,14 @@ main() {
# User filtering
if [ -n "$FILTER_USER" ]; then
print_info "Filtering logs for user: $FILTER_USER"
user_domains=$(get_user_domains "$FILTER_USER")
export user_domains=$(get_user_domains "$FILTER_USER")
if [ -z "$user_domains" ]; then
print_error "No domains found for user: $FILTER_USER"
exit 1
fi
print_info "User has $(echo "$user_domains" | wc -l) domain(s)"
else
export user_domains=""
fi
log_count=$(find "$LOG_DIR" -type f ! -name "*-bytes_log" ! -name "*.offset" ! -name "*error_log" "${find_opts[@]}" 2>/dev/null | wc -l)
@@ -1796,17 +1828,21 @@ analyze_domain_threats() {
> "$TEMP_DIR/domain_high_risk_ips.txt"
# Get all unique domains from parsed logs
awk -F'|' '{print $2}' "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | sort -u > "$TEMP_DIR/all_domains.txt"
zcat "$TEMP_DIR/parsed_logs.txt.gz" 2>/dev/null | awk -F'|' '{print $2}' | sort -u > "$TEMP_DIR/all_domains.txt"
# Pre-process: Create indexed lookup files for performance (one-time decompression)
zcat "$TEMP_DIR/parsed_logs.txt.gz" 2>/dev/null | awk -F'|' '{print $2"|"$1}' | sort > "$TEMP_DIR/domain_ip_lookup.txt"
zcat "$TEMP_DIR/classified_bots.txt.gz" 2>/dev/null | awk -F'|' '{print $2}' | sort > "$TEMP_DIR/bot_domains_lookup.txt"
# For each domain, calculate threat metrics
while read -r domain; do
[ -z "$domain" ] && continue
# Total requests to this domain
local total_requests=$(grep -c "^[^|]*|$domain|" "$TEMP_DIR/parsed_logs.txt" 2>/dev/null || echo "0")
# Total requests to this domain (from indexed file)
local total_requests=$(grep -c "^$domain|" "$TEMP_DIR/domain_ip_lookup.txt" 2>/dev/null || echo "0")
# Bot requests to this domain
local bot_requests=$(grep "|$domain|" "$TEMP_DIR/classified_bots.txt" 2>/dev/null | wc -l || echo "0")
# Bot requests to this domain (from indexed file)
local bot_requests=$(grep -c "^$domain$" "$TEMP_DIR/bot_domains_lookup.txt" 2>/dev/null || echo "0")
# High-risk IPs hitting this domain (score >= 70)
local high_risk_count=0
@@ -1818,9 +1854,9 @@ analyze_domain_threats() {
local ip=$(echo "$score_line" | cut -d'|' -f2)
if [ "$score" -ge 70 ]; then
# Check if this IP hit this domain
if grep -q "^$ip|$domain|" "$TEMP_DIR/parsed_logs.txt" 2>/dev/null; then
local ip_requests=$(grep -c "^$ip|$domain|" "$TEMP_DIR/parsed_logs.txt" 2>/dev/null || echo "0")
# Check if this IP hit this domain (from indexed file)
local ip_requests=$(grep -c "^$domain|$ip$" "$TEMP_DIR/domain_ip_lookup.txt" 2>/dev/null || echo "0")
if [ "$ip_requests" -gt 0 ]; then
high_risk_count=$((high_risk_count + 1))
high_risk_ips="${high_risk_ips}${ip}:${score}:${ip_requests} "
fi
@@ -2728,8 +2764,8 @@ execute_htaccess_domain_blocking() {
# Find document root for this domain using reference database
local doc_root=""
if [ -s "$TOOLKIT_BASE_DIR/.sysref" ]; then
doc_root=$(grep "^DOMAIN|$target_domain|" "$TOOLKIT_BASE_DIR/.sysref" 2>/dev/null | head -1 | cut -d'|' -f4)
if [ -s "$SCRIPT_DIR/.sysref" ]; then
doc_root=$(grep "^DOMAIN|$target_domain|" "$SCRIPT_DIR/.sysref" 2>/dev/null | head -1 | cut -d'|' -f4)
fi
if [ -z "$doc_root" ]; then
@@ -2773,7 +2809,7 @@ execute_htaccess_domain_blocking() {
print_info "Adding bot blocking rules..."
# Get high-risk IPs for this domain
local block_ips=$(grep "^[^|]*|$target_domain|" "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | cut -d'|' -f1 | sort -u | while read ip; do
local block_ips=$(zcat "$TEMP_DIR/parsed_logs.txt.gz" 2>/dev/null | grep "^[^|]*|$target_domain|" | cut -d'|' -f1 | sort -u | while read ip; do
# Check if this IP has high threat score
if grep -q "|$ip$" "$TEMP_DIR/threat_scores.txt" 2>/dev/null; then
local score=$(grep "|$ip$" "$TEMP_DIR/threat_scores.txt" | cut -d'|' -f1)