Compare commits
12 Commits
12973423ef...6dfc47d831
| Author | SHA1 | Date | |
|---|---|---|---|
| | 6dfc47d831 | | |
| | 172ef41fc7 | | |
| | 429ee62510 | | |
| | 9b6652f512 | | |
| | 5902ea990d | | |
| | e1a3b1cf90 | | |
| | adbe5c14d5 | | |
| | 8477c8d7e1 | | |
| | ae1503b928 | | |
| | 50a996bce3 | | |
| | 907e90f78a | | |
| | 5a539e4d31 | | |
@@ -507,7 +507,7 @@ parse_logs() {
|
||||
local line_count
|
||||
line_count=$(wc -l < "$TEMP_DIR/parsed_logs.txt")
|
||||
local file_size_kb
|
||||
file_size_kb=$(du -k "$TEMP_DIR/parsed_logs.txt" | cut -f1)
|
||||
file_size_kb=$(du -k "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | cut -f1 || echo "0")
|
||||
|
||||
# Compress for storage (gzip saves ~90% space on text)
|
||||
# But we keep uncompressed version for fast analysis
|
||||
@@ -641,7 +641,7 @@ classify_bots() {
|
||||
local classified_count
|
||||
classified_count=$(wc -l < "$TEMP_DIR/classified_bots.txt")
|
||||
local file_size_kb
|
||||
file_size_kb=$(du -k "$TEMP_DIR/classified_bots.txt" | cut -f1)
|
||||
file_size_kb=$(du -k "$TEMP_DIR/classified_bots.txt" 2>/dev/null | cut -f1 || echo "0")
|
||||
|
||||
# Compress for storage in background
|
||||
gzip -c "$TEMP_DIR/classified_bots.txt" > "$TEMP_DIR/classified_bots.txt.gz" &
|
||||
@@ -770,7 +770,7 @@ analyze_headers() {
|
||||
print_info "Analyzing request headers for bot patterns..."
|
||||
|
||||
# Analyze header patterns to improve bot detection accuracy
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
{
|
||||
ip = $1
|
||||
domain = $2
|
||||
@@ -846,9 +846,10 @@ analyze_headers() {
|
||||
|
||||
# Only flag if high header suspicion score
|
||||
if (score >= 8) {
|
||||
print ip "|header_anomaly|" score > "'"$TEMP_DIR"'/header_anomalies.txt"
|
||||
print ip "|header_anomaly|" score > tmpdir "/header_anomalies.txt"
|
||||
}
|
||||
}
|
||||
close(tmpdir "/header_anomalies.txt")
|
||||
}' < "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
# Create file if it doesn't exist
|
||||
@@ -864,7 +865,7 @@ analyze_entry_points() {
|
||||
print_info "Analyzing first request patterns (bot vs. user entry points)..."
|
||||
|
||||
# Get first request from each IP
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
BEGIN {
|
||||
ip_first_request[ip] = url
|
||||
ip_first_status[ip] = status
|
||||
@@ -889,17 +890,20 @@ analyze_entry_points() {
|
||||
|
||||
# Suspicious entry points indicate bot/scanner
|
||||
if (match(url_lower, /wp-admin|phpmyadmin|admin|xmlrpc|shell\.php|\.env|\.git|backdoor|config\.php/)) {
|
||||
print ip "|admin_entry|" url "|" status > "'"$TEMP_DIR"'/suspicious_entry_points.txt"
|
||||
print ip "|admin_entry|" url "|" status > tmpdir "/suspicious_entry_points.txt"
|
||||
}
|
||||
# Legitimate entry: homepage or search
|
||||
else if (match(url_lower, /^\/index|^\/$|^\/search|^\/page|^\/category/)) {
|
||||
print ip "|normal_entry|" url > "'"$TEMP_DIR"'/normal_entry_points.txt"
|
||||
print ip "|normal_entry|" url > tmpdir "/normal_entry_points.txt"
|
||||
}
|
||||
# Unusual but possible: static files
|
||||
else if (match(url_lower, /\.(css|js|jpg|png|gif|woff|svg)$/)) {
|
||||
print ip "|static_entry|" url > "'"$TEMP_DIR"'/static_entry_points.txt"
|
||||
print ip "|static_entry|" url > tmpdir "/static_entry_points.txt"
|
||||
}
|
||||
}
|
||||
close(tmpdir "/suspicious_entry_points.txt")
|
||||
close(tmpdir "/normal_entry_points.txt")
|
||||
close(tmpdir "/static_entry_points.txt")
|
||||
}' < "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
# Count suspicious entry points
|
||||
@@ -919,7 +923,7 @@ detect_threats() {
|
||||
print_info "Detecting security threats..."
|
||||
|
||||
# Use a single AWK pass for multiple threat detections (more efficient)
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
{
|
||||
ip = $1
|
||||
domain = $2
|
||||
@@ -937,7 +941,7 @@ detect_threats() {
|
||||
match(url_lower, /information_schema|drop table|insert into|update.*set|delete from/) ||
|
||||
match(url_lower, /%27.*(union|select|or |and )|hex\(|unhex\(|load_file\(/) ||
|
||||
match(url_lower, /0x[0-9a-f]+.*(union|select|into|from|where|order)/)) {
|
||||
print ip "|" domain "|" url "|" status "|sqli" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
|
||||
print ip "|" domain "|" url "|" status "|sqli" > tmpdir "/attack_vectors_raw.txt"
|
||||
}
|
||||
|
||||
# XSS patterns
|
||||
@@ -945,7 +949,7 @@ detect_threats() {
|
||||
# This prevents false positives on documentation URLs like /docs/innerhtml-api-guide
|
||||
if (match(url_lower, /<script|javascript:|onerror=|onload=|<iframe|eval\(|alert\(/) ||
|
||||
match(url_lower, /\?.*(document\.cookie|document\.write|\.innerhtml)/)) {
|
||||
print ip "|" domain "|" url "|" status "|xss" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
|
||||
print ip "|" domain "|" url "|" status "|xss" > tmpdir "/attack_vectors_raw.txt"
|
||||
}
|
||||
|
||||
# Path Traversal / LFI
|
||||
@@ -953,7 +957,7 @@ detect_threats() {
|
||||
# FIXED: Case-insensitive hex encoding support (%5C and %5c)
|
||||
if (match(url_lower, /\.\.\/|\.\.\\|%2e%2e|%5c|etc\/passwd|etc\/shadow|boot\.ini|win\.ini/) ||
|
||||
match(url_lower, /proc\/self|proc\/environ|\/etc\/|c:\\|c:%5c|windows(%5c|[\/\\])system32/)) {
|
||||
print ip "|" domain "|" url "|" status "|path_traversal" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
|
||||
print ip "|" domain "|" url "|" status "|path_traversal" > tmpdir "/attack_vectors_raw.txt"
|
||||
}
|
||||
|
||||
# Shell upload / RCE attempts
|
||||
@@ -963,7 +967,7 @@ detect_threats() {
|
||||
match(url_lower, /shell\.php|c99\.php|r57\.php|r00t\.php|backdoor|webshell|cmd\.php|exploit\.php/) ||
|
||||
match(url_lower, /base64_decode.*eval|gzinflate.*eval|assert.*\$_/) ||
|
||||
(match(url_lower, /\.(php|phtml|php3|php4|php5|phar)\.suspected$/) && method == "POST")) {
|
||||
print ip "|" domain "|" url "|" status "|rce_upload" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
|
||||
print ip "|" domain "|" url "|" status "|rce_upload" > tmpdir "/attack_vectors_raw.txt"
|
||||
}
|
||||
|
||||
# Info Disclosure attempts
|
||||
@@ -979,18 +983,18 @@ detect_threats() {
|
||||
# Only flag if successful access (200) or redirect (301/302)
|
||||
# Failed attempts (404/403) are just scanning, tracked separately
|
||||
if (status ~ /^(200|301|302)/) {
|
||||
print ip "|" domain "|" url "|" status "|info_disclosure" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
|
||||
print ip "|" domain "|" url "|" status "|info_disclosure" > tmpdir "/attack_vectors_raw.txt"
|
||||
}
|
||||
}
|
||||
|
||||
# composer.json / package.json - lower severity, only if successful
|
||||
if (match(url_lower, /composer\.json|package\.json|package-lock\.json/) && status == "200") {
|
||||
print ip "|" domain "|" url "|" status "|config_exposure" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
|
||||
print ip "|" domain "|" url "|" status "|config_exposure" > tmpdir "/attack_vectors_raw.txt"
|
||||
}
|
||||
|
||||
# Login bruteforce
|
||||
if (match(url_lower, /wp-login\.php|xmlrpc\.php/) && method == "POST") {
|
||||
print ip "|" domain "|" url "|" status "|login_bruteforce" > "'"$TEMP_DIR"'/attack_vectors_raw.txt"
|
||||
print ip "|" domain "|" url "|" status "|login_bruteforce" > tmpdir "/attack_vectors_raw.txt"
|
||||
}
|
||||
|
||||
# Admin/sensitive endpoint probing
|
||||
@@ -1000,30 +1004,30 @@ detect_threats() {
|
||||
# Only flag failed access attempts (403 Forbidden, 401 Unauthorized, 404 Not Found)
|
||||
# Successful access (200/302) means legitimate user or already compromised
|
||||
if (status ~ /^(403|401|404)/) {
|
||||
print ip "|" domain "|" url > "'"$TEMP_DIR"'/admin_probes_raw.txt"
|
||||
print ip "|" domain "|" url > tmpdir "/admin_probes_raw.txt"
|
||||
}
|
||||
}
|
||||
|
||||
# 404 scanning (reconnaissance)
|
||||
if (status == "404" || status == "403") {
|
||||
print ip "|" domain "|" url "|" status > "'"$TEMP_DIR"'/404_scans_raw.txt"
|
||||
print ip "|" domain "|" url "|" status > tmpdir "/404_scans_raw.txt"
|
||||
}
|
||||
|
||||
# Large data transfers (potential scraping)
|
||||
if (size > 1000000) {
|
||||
print ip "|" domain "|" url "|" size > "'"$TEMP_DIR"'/large_transfers_raw.txt"
|
||||
print ip "|" domain "|" url "|" size > tmpdir "/large_transfers_raw.txt"
|
||||
}
|
||||
|
||||
# Suspicious user agents
|
||||
if (match(ua_lower, /nikto|nmap|masscan|sqlmap|havij|acunetix|nessus|burp/) ||
|
||||
match(ua_lower, /metasploit|<script|null|python-requests|go-http-client/)) {
|
||||
print ip "|" ua > "'"$TEMP_DIR"'/suspicious_ua_raw.txt"
|
||||
print ip "|" ua > tmpdir "/suspicious_ua_raw.txt"
|
||||
}
|
||||
|
||||
# Track response codes for intelligence
|
||||
print status > "'"$TEMP_DIR"'/response_codes_raw.txt"
|
||||
print status > tmpdir "/response_codes_raw.txt"
|
||||
}
|
||||
' < <(cat "$TEMP_DIR/parsed_logs.txt")
|
||||
' < "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
# Process attack vectors by type
|
||||
if [ -f "$TEMP_DIR/attack_vectors_raw.txt" ]; then
|
||||
@@ -1088,7 +1092,7 @@ analyze_url_entropy() {
|
||||
print_info "Analyzing URL parameter entropy (fuzzing detection)..."
|
||||
|
||||
# Detect IPs that generate random parameters (scanning/fuzzing behavior)
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
{
|
||||
ip = $1
|
||||
url = $3
|
||||
@@ -1123,9 +1127,10 @@ analyze_url_entropy() {
|
||||
# If IP hits >20 URLs with lots of numeric params = scanning
|
||||
if (urls_per_ip[ip] > 20 && unique_path_count > 5) {
|
||||
# Likely fuzzing/parameter scanning
|
||||
print ip "|parameter_fuzzing|" urls_per_ip[ip] "|" unique_path_count > "'"$TEMP_DIR"'/fuzzing_ips.txt"
|
||||
print ip "|parameter_fuzzing|" urls_per_ip[ip] "|" unique_path_count > tmpdir "/fuzzing_ips.txt"
|
||||
}
|
||||
}
|
||||
close(tmpdir "/fuzzing_ips.txt")
|
||||
}' < "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
# Create file if it doesn't exist
|
||||
@@ -1141,7 +1146,7 @@ analyze_request_timing() {
|
||||
print_info "Analyzing request timing patterns (DDoS detection)..."
|
||||
|
||||
# Analyze timing consistency to detect bots/DDoS
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
{
|
||||
ip = $1
|
||||
timestamp = $8
|
||||
@@ -1189,11 +1194,12 @@ analyze_request_timing() {
|
||||
# Very consistent timing = bot (typically 0.5-2 seconds apart)
|
||||
# Real users: highly variable (5-60+ seconds)
|
||||
if (avg_interval < 3 && count > 100) {
|
||||
print ip "|consistent_bot_timing|" avg_interval "|" count > "'"$TEMP_DIR"'/timing_anomalies.txt"
|
||||
print ip "|consistent_bot_timing|" avg_interval "|" count > tmpdir "/timing_anomalies.txt"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(tmpdir "/timing_anomalies.txt")
|
||||
}' < "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
# Create file if it doesn't exist
|
||||
@@ -1210,7 +1216,7 @@ calculate_bot_fingerprint() {
|
||||
|
||||
# Each signal contributes to confidence that an IP is a bot
|
||||
# Real traffic rarely has ALL signals, bots typically have multiple
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
BEGIN {
|
||||
# Initialize tracking arrays
|
||||
}
|
||||
@@ -1300,9 +1306,10 @@ calculate_bot_fingerprint() {
|
||||
|
||||
# Output fingerprint for high-confidence bots (score >= 60)
|
||||
if (score >= 60) {
|
||||
printf "%s|%d|%d\n", ip, score, signal_count > "'"$TEMP_DIR"'/bot_fingerprints.txt"
|
||||
printf "%s|%d|%d\n", ip, score, signal_count > tmpdir "/bot_fingerprints.txt"
|
||||
}
|
||||
}
|
||||
close(tmpdir "/bot_fingerprints.txt")
|
||||
}
|
||||
' < "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
@@ -1321,7 +1328,7 @@ analyze_domain_targeting_percentage() {
|
||||
|
||||
# Build per-domain attack data
|
||||
# Format: domain|attack_type|ip|count
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
NR == FNR {
|
||||
# Skip attack vectors file - using parsed_logs for all data
|
||||
next
|
||||
@@ -1360,7 +1367,7 @@ analyze_domain_targeting_percentage() {
|
||||
}
|
||||
END {
|
||||
for (domain in attack_data) {
|
||||
domain_file = "'"$TEMP_DIR"'/domain_attacks_" domain ".txt"
|
||||
domain_file = tmpdir "/domain_attacks_" domain ".txt"
|
||||
for (attack_type in attack_data[domain]) {
|
||||
total = attack_totals[domain][attack_type]
|
||||
for (ip in attack_data[domain][attack_type]) {
|
||||
@@ -1412,7 +1419,7 @@ analyze_success_rates() {
|
||||
print_info "Analyzing request success rates and behavior patterns..."
|
||||
|
||||
# Calculate success rate (200/301/302 vs 404/403) for each IP
|
||||
cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
{
|
||||
ip = $1
|
||||
status = $4
|
||||
@@ -1438,17 +1445,20 @@ analyze_success_rates() {
|
||||
|
||||
# High failure rate indicates scanning/probing
|
||||
if (fail_rate >= 80 && total[ip] >= 20) {
|
||||
print ip "|" total[ip] "|" fail_rate "|scanner" > "'"$TEMP_DIR"'/high_failure_ips.txt"
|
||||
print ip "|" total[ip] "|" fail_rate "|scanner" >> tmpdir "/high_failure_ips.txt"
|
||||
}
|
||||
# Very high success rate + high volume could be scraping
|
||||
else if (success_rate >= 90 && total[ip] >= 100) {
|
||||
print ip "|" total[ip] "|" success_rate "|scraper" > "'"$TEMP_DIR"'/high_success_ips.txt"
|
||||
print ip "|" total[ip] "|" success_rate "|scraper" >> tmpdir "/high_success_ips.txt"
|
||||
}
|
||||
|
||||
# Output all rates for later analysis
|
||||
print ip "|" total[ip] "|" success_rate "|" fail_rate > "'"$TEMP_DIR"'/ip_success_rates.txt"
|
||||
print ip "|" total[ip] "|" success_rate "|" fail_rate >> tmpdir "/ip_success_rates.txt"
|
||||
}
|
||||
}' < <(cat "$TEMP_DIR/parsed_logs.txt")
|
||||
close(tmpdir "/high_failure_ips.txt")
|
||||
close(tmpdir "/high_success_ips.txt")
|
||||
close(tmpdir "/ip_success_rates.txt")
|
||||
}' < "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
# Touch files if they don't exist
|
||||
touch "$TEMP_DIR/high_failure_ips.txt" "$TEMP_DIR/high_success_ips.txt" "$TEMP_DIR/ip_success_rates.txt"
|
||||
@@ -1465,23 +1475,23 @@ detect_botnets() {
|
||||
|
||||
# Group IPs by similar behavior patterns
|
||||
# Pattern 1: Multiple IPs hitting same URLs in coordinated manner
|
||||
cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '{print $1"|"$3}' | \
|
||||
awk -F'|' '{print $1"|"$3}' < "$TEMP_DIR/parsed_logs.txt" | \
|
||||
sort | uniq -c | awk '$1 > 10 {print $2}' | \
|
||||
cut -d'|' -f2 | sort | uniq -c | sort -rn | \
|
||||
awk '$1 > 5 {print $2}' > "$TEMP_DIR/coordinated_urls.txt"
|
||||
|
||||
# Pattern 2: IPs with similar User-Agents hitting multiple domains
|
||||
cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '{print $1"|"$6}' | \
|
||||
awk -F'|' '{print $1"|"$6}' < "$TEMP_DIR/parsed_logs.txt" | \
|
||||
sort | uniq > "$TEMP_DIR/ip_ua_pairs.txt"
|
||||
|
||||
# Pattern 3: Detect IP ranges (Class C networks) with suspicious activity
|
||||
cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '{print $1}' | \
|
||||
awk -F'|' '{print $1}' < "$TEMP_DIR/parsed_logs.txt" | \
|
||||
awk -F'.' '{print $1"."$2"."$3".0/24"}' | \
|
||||
sort | uniq -c | sort -rn | awk '$1 > 20' > "$TEMP_DIR/suspicious_networks.txt"
|
||||
|
||||
# Pattern 4: Rapid fire requests (DDoS indicators)
|
||||
# Extract timestamp and count requests per IP per minute
|
||||
cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '{
|
||||
awk -F'|' '{
|
||||
ip = $1
|
||||
timestamp = $8
|
||||
# Extract date/time components (handles format: DD/MMM/YYYY:HH:MM:SS)
|
||||
@@ -1490,7 +1500,7 @@ detect_botnets() {
|
||||
time_key = ts[3] ts[2] ts[1] "_" ts[4] ts[5]
|
||||
print ip "|" time_key
|
||||
}
|
||||
}' | \
|
||||
}' < "$TEMP_DIR/parsed_logs.txt" | \
|
||||
sort | uniq -c | \
|
||||
awk '$1 > 50 {print $1 " " $2}' | \
|
||||
awk -F'|' '{print $1}' | \
|
||||
@@ -1511,23 +1521,23 @@ detect_server_ips() {
|
||||
|
||||
# Method 1: Get all IPs from network interfaces
|
||||
if command -v hostname >/dev/null 2>&1; then
|
||||
hostname -I 2>/dev/null | tr ' ' '\n' | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' >> "$TEMP_DIR/server_ips.txt"
|
||||
hostname -I 2>/dev/null | tr ' ' '\n' | grep -E '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' >> "$TEMP_DIR/server_ips.txt" || true
|
||||
fi
|
||||
|
||||
# Method 2: Parse ip addr output
|
||||
if command -v ip >/dev/null 2>&1; then
|
||||
ip addr show 2>/dev/null | grep -oP 'inet \K[\d.]+' >> "$TEMP_DIR/server_ips.txt"
|
||||
ip addr show 2>/dev/null | grep -oP 'inet \K[\d.]+' >> "$TEMP_DIR/server_ips.txt" || true
|
||||
fi
|
||||
|
||||
# Method 3: Try ifconfig as fallback
|
||||
if command -v ifconfig >/dev/null 2>&1; then
|
||||
ifconfig 2>/dev/null | grep -oP 'inet (addr:)?\K[\d.]+' >> "$TEMP_DIR/server_ips.txt"
|
||||
ifconfig 2>/dev/null | grep -oP 'inet (addr:)?\K[\d.]+' >> "$TEMP_DIR/server_ips.txt" || true
|
||||
fi
|
||||
|
||||
# Method 4: Get public IP from external services (with timeout)
|
||||
# Try multiple services for reliability
|
||||
for service in "ifconfig.me/ip" "icanhazip.com" "ipecho.net/plain" "api.ipify.org"; do
|
||||
public_ip=$(curl -s --max-time 3 "$service" 2>/dev/null | grep -oE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$')
|
||||
public_ip=$(curl -s --max-time 3 "$service" 2>/dev/null | grep -oE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' || true)
|
||||
if [ -n "$public_ip" ]; then
|
||||
echo "$public_ip" >> "$TEMP_DIR/server_ips.txt"
|
||||
break
|
||||
@@ -1540,7 +1550,7 @@ detect_server_ips() {
|
||||
fi
|
||||
|
||||
# Remove duplicates and empty lines
|
||||
sort -u "$TEMP_DIR/server_ips.txt" | grep -v '^$' > "$TEMP_DIR/server_ips_final.txt"
|
||||
sort -u "$TEMP_DIR/server_ips.txt" | grep -v '^$' > "$TEMP_DIR/server_ips_final.txt" || true
|
||||
mv "$TEMP_DIR/server_ips_final.txt" "$TEMP_DIR/server_ips.txt"
|
||||
|
||||
server_ip_count=$(wc -l < "$TEMP_DIR/server_ips.txt" 2>/dev/null || echo 0)
|
||||
@@ -1631,7 +1641,7 @@ analyze_time_series() {
|
||||
hour = ts[4]
|
||||
print hour
|
||||
}
|
||||
}' "$TEMP_DIR/attack_vectors_raw.txt" <(cat "$TEMP_DIR/parsed_logs.txt") | sort | uniq -c > "$TEMP_DIR/hourly_attack_traffic.txt"
|
||||
}' "$TEMP_DIR/attack_vectors_raw.txt" "$TEMP_DIR/parsed_logs.txt" | sort | uniq -c > "$TEMP_DIR/hourly_attack_traffic.txt"
|
||||
fi
|
||||
|
||||
print_success "Time-series analysis complete"
|
||||
@@ -1750,33 +1760,40 @@ calculate_threat_scores() {
|
||||
fi
|
||||
|
||||
score=0
|
||||
req_count=${ip_request_counts[$ip]}
|
||||
req_count=0
|
||||
if [ -n "${ip_request_counts[$ip]}" ]; then
|
||||
req_count=${ip_request_counts[$ip]}
|
||||
fi
|
||||
|
||||
# IMPROVED: Base request volume scoring
|
||||
# Skip volume scoring for legitimate bots (Google, Bing, etc.)
|
||||
if [ -z "${legit_bot_ips[$ip]}" ]; then
|
||||
# Not a legitimate bot - apply volume scoring
|
||||
if [ "$req_count" -gt 10000 ]; then score=$((score + 10))
|
||||
elif [ "$req_count" -gt 5000 ]; then score=$((score + 8))
|
||||
elif [ "$req_count" -gt 1000 ]; then score=$((score + 5))
|
||||
elif [ "$req_count" -gt 500 ]; then score=$((score + 3))
|
||||
if [ "${req_count:-0}" -gt 10000 ]; then score=$((score + 10))
|
||||
elif [ "${req_count:-0}" -gt 5000 ]; then score=$((score + 8))
|
||||
elif [ "${req_count:-0}" -gt 1000 ]; then score=$((score + 5))
|
||||
elif [ "${req_count:-0}" -gt 500 ]; then score=$((score + 3))
|
||||
fi
|
||||
fi
|
||||
|
||||
# NEW: Success rate analysis bonuses
|
||||
# High failure rate (80%+ 404/403) = scanning behavior
|
||||
if [ -n "${scanner_ips[$ip]}" ]; then
|
||||
fail_rate=${scanner_ips[$ip]}
|
||||
if [ "$fail_rate" -ge 90 ]; then
|
||||
fail_rate=0
|
||||
if [ -n "${scanner_ips[$ip]}" ]; then
|
||||
fail_rate=${scanner_ips[$ip]}
|
||||
fi
|
||||
if [ "${fail_rate:-0}" -ge 90 ]; then
|
||||
score=$((score + 8)) # Very high failure rate
|
||||
elif [ "$fail_rate" -ge 80 ]; then
|
||||
elif [ "${fail_rate:-0}" -ge 80 ]; then
|
||||
score=$((score + 5)) # High failure rate
|
||||
fi
|
||||
fi
|
||||
|
||||
# High success rate (90%+ 200/301/302) + high volume = potential scraping
|
||||
if [ -n "${scraper_ips[$ip]}" ] && [ "$req_count" -gt 500 ]; then
|
||||
score=$((score + 7)) # Scraping behavior
|
||||
if [ -n "${scraper_ips[$ip]}" ]; then
|
||||
local safe_req_count=$((req_count + 0))
|
||||
[ "$safe_req_count" -gt 500 ] && score=$((score + 7)) # Scraping behavior
|
||||
fi
|
||||
|
||||
# Attack patterns
|
||||
@@ -1947,7 +1964,7 @@ generate_statistics() {
|
||||
|
||||
# OPTIMIZATION: Use single-pass AWK to generate multiple stats from parsed logs
|
||||
# This reads the uncompressed file ONCE instead of 4+ separate reads
|
||||
cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '
|
||||
cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
{
|
||||
# Count by domain (for top sites)
|
||||
domains[$2]++
|
||||
@@ -1961,18 +1978,21 @@ generate_statistics() {
|
||||
END {
|
||||
# Output top sites
|
||||
for (domain in domains) {
|
||||
print domains[domain], domain > "'"$TEMP_DIR"'/top_sites_raw.txt"
|
||||
print domains[domain], domain > tmpdir "/top_sites_raw.txt"
|
||||
}
|
||||
|
||||
# Output top IPs
|
||||
for (ip in ips) {
|
||||
print ips[ip], ip > "'"$TEMP_DIR"'/top_ips_raw.txt"
|
||||
print ips[ip], ip > tmpdir "/top_ips_raw.txt"
|
||||
}
|
||||
|
||||
# Output top URLs
|
||||
for (url in urls) {
|
||||
print urls[url], url > "'"$TEMP_DIR"'/top_urls_raw.txt"
|
||||
print urls[url], url > tmpdir "/top_urls_raw.txt"
|
||||
}
|
||||
close(tmpdir "/top_sites_raw.txt")
|
||||
close(tmpdir "/top_ips_raw.txt")
|
||||
close(tmpdir "/top_urls_raw.txt")
|
||||
}'
|
||||
|
||||
# Sort and limit results
|
||||
@@ -2115,7 +2135,7 @@ generate_comparison_report() {
|
||||
# Track repeat attackers
|
||||
local repeat_attackers=0
|
||||
if [ -f "$history_dir/known_attackers_${yesterday}.txt" ]; then
|
||||
repeat_attackers=$(grep -Fx -f <(awk -F'|' '$1 >= 70 {print $2}' "$TEMP_DIR/threat_scores.txt" 2>/dev/null) "$history_dir/known_attackers_${yesterday}.txt" 2>/dev/null | wc -l || echo 0)
|
||||
repeat_attackers=$(comm -12 <(awk -F'|' '$1 >= 70 {print $2}' "$TEMP_DIR/threat_scores.txt" 2>/dev/null | sort -u) <(sort -u "$history_dir/known_attackers_${yesterday}.txt") 2>/dev/null | wc -l || echo 0)
|
||||
if [ "$repeat_attackers" -gt 0 ]; then
|
||||
echo -e "${RED}🔄 REPEAT ATTACKERS: $repeat_attackers IPs from yesterday${NC}"
|
||||
fi
|
||||
@@ -2265,13 +2285,13 @@ generate_report() {
|
||||
# QUICK STATS DASHBOARD
|
||||
print_header "QUICK STATS DASHBOARD"
|
||||
|
||||
total_requests=$(cat "$TEMP_DIR/parsed_logs.txt" | wc -l)
|
||||
unique_ips=$(cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '{print $1}' | sort -u | wc -l)
|
||||
unique_domains=$(cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '{print $2}' | sort -u | wc -l)
|
||||
bot_requests=$(cat "$TEMP_DIR/classified_bots.txt" | awk -F'|' '$9 != "unknown"' | wc -l)
|
||||
total_requests=$(wc -l < "$TEMP_DIR/parsed_logs.txt")
|
||||
unique_ips=$(awk -F'|' '{print $1}' < "$TEMP_DIR/parsed_logs.txt" | sort -u | wc -l)
|
||||
unique_domains=$(awk -F'|' '{print $2}' < "$TEMP_DIR/parsed_logs.txt" | sort -u | wc -l)
|
||||
bot_requests=$(awk -F'|' '$9 != "unknown"' < "$TEMP_DIR/classified_bots.txt" | wc -l)
|
||||
|
||||
# Count private/internal IPs (excluded from threat analysis)
|
||||
private_ips=$(cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '{print $1}' | sort -u | grep -E '^(127\.|10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.|169\.254\.)' | wc -l)
|
||||
private_ips=$(awk -F'|' '{print $1}' < "$TEMP_DIR/parsed_logs.txt" | sort -u | grep -E '^(127\.|10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.|169\.254\.)' || true | wc -l)
|
||||
|
||||
# Count server's own IPs in the logs
|
||||
server_ip_hits=0
|
||||
@@ -3186,24 +3206,24 @@ analyze_domain_threats() {
|
||||
# Old approach: O(domains × high_risk_IPs × file_size) = 83 minutes for 500 domains
|
||||
# New approach: O(file_size) = seconds
|
||||
|
||||
awk -F'|' '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
BEGIN {
|
||||
# Load high-risk IPs into memory
|
||||
while ((getline < "'"$TEMP_DIR"'/threat_scores.txt") > 0) {
|
||||
while ((getline < tmpdir "/threat_scores.txt") > 0) {
|
||||
score = $1
|
||||
ip = $2
|
||||
if (score >= 70) {
|
||||
high_risk[ip] = score
|
||||
}
|
||||
}
|
||||
close("'"$TEMP_DIR"'/threat_scores.txt")
|
||||
close(tmpdir "/threat_scores.txt")
|
||||
|
||||
# Load attack vectors
|
||||
while ((getline < "'"$TEMP_DIR"'/attack_vectors_raw.txt") > 0) {
|
||||
while ((getline < tmpdir "/attack_vectors_raw.txt") > 0) {
|
||||
domain = $2
|
||||
attack_counts[domain]++
|
||||
}
|
||||
close("'"$TEMP_DIR"'/attack_vectors_raw.txt")
|
||||
close(tmpdir "/attack_vectors_raw.txt")
|
||||
}
|
||||
|
||||
# Process parsed logs (single pass)
|
||||
@@ -3222,11 +3242,11 @@ analyze_domain_threats() {
|
||||
}
|
||||
END {
|
||||
# Now process classified bots
|
||||
while ((getline < "'"$TEMP_DIR"'/classified_bots.txt") > 0) {
|
||||
while ((getline < tmpdir "/classified_bots.txt") > 0) {
|
||||
domain = $2
|
||||
bot_counts[domain]++
|
||||
}
|
||||
close("'"$TEMP_DIR"'/classified_bots.txt")
|
||||
close(tmpdir "/classified_bots.txt")
|
||||
|
||||
# Output results for each domain
|
||||
for (domain in domain_requests) {
|
||||
@@ -3238,13 +3258,15 @@ analyze_domain_threats() {
|
||||
high_risk_detail = domain_high_risk_ips[domain]
|
||||
|
||||
# domain|total_requests|bot_requests|bot_percentage|high_risk_ip_count|attack_attempts|high_risk_ips_detail
|
||||
printf "%s|%d|%d|%.1f|%d|%d|%s\n", domain, total_req, bot_req, bot_pct, high_risk_count, attacks, high_risk_detail > "'"$TEMP_DIR"'/domain_threats.txt"
|
||||
printf "%s|%d|%d|%.1f|%d|%d|%s\n", domain, total_req, bot_req, bot_pct, high_risk_count, attacks, high_risk_detail > tmpdir "/domain_threats.txt"
|
||||
|
||||
# Track high-risk IPs per domain
|
||||
if (high_risk_count > 0) {
|
||||
printf "%s|%d|%s\n", domain, high_risk_count, high_risk_detail > "'"$TEMP_DIR"'/domain_high_risk_ips.txt"
|
||||
printf "%s|%d|%s\n", domain, high_risk_count, high_risk_detail > tmpdir "/domain_high_risk_ips.txt"
|
||||
}
|
||||
}
|
||||
close(tmpdir "/domain_threats.txt")
|
||||
close(tmpdir "/domain_high_risk_ips.txt")
|
||||
}' "$TEMP_DIR/parsed_logs.txt"
|
||||
|
||||
# Sort by high-risk IP count (descending)
|
||||
@@ -3648,7 +3670,7 @@ show_detailed_recommendations() {
|
||||
awk -F'|' '$1 >= 70 {printf " • %s (score: %s)\n", $2, $1}' "$TEMP_DIR/threat_scores.txt" 2>/dev/null | head -10
|
||||
;;
|
||||
htaccess_domain)
|
||||
local target_domain=$(echo "$action_title" | grep -oP 'to \K[^ ]+' 2>/dev/null)
|
||||
local target_domain=$(echo "$action_title" | grep -oP 'to \K[^ ]+' 2>/dev/null || echo "")
|
||||
echo "Target Domain: $target_domain"
|
||||
if [ -s "$TEMP_DIR/domain_threats_sorted.txt" ]; then
|
||||
grep "^$target_domain|" "$TEMP_DIR/domain_threats_sorted.txt" 2>/dev/null | while IFS='|' read -r domain total_req bot_req bot_pct high_risk attacks ips; do
|
||||
@@ -4173,7 +4195,7 @@ execute_htaccess_domain_blocking() {
|
||||
print_info "Adding bot blocking rules..."
|
||||
|
||||
# Get high-risk IPs for this domain
|
||||
local block_ips=$(cat "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | grep "^[^|]*|$target_domain|" 2>/dev/null | cut -d'|' -f1 | sort -u | while read ip; do
|
||||
local block_ips=$(cat "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | grep "^[^|]*|$target_domain|" 2>/dev/null || true | cut -d'|' -f1 | sort -u | while read ip; do
|
||||
# Check if this IP has high threat score
|
||||
if grep -q "|$ip$" "$TEMP_DIR/threat_scores.txt" 2>/dev/null; then
|
||||
local score=$(grep "|$ip$" "$TEMP_DIR/threat_scores.txt" 2>/dev/null | cut -d'|' -f1 || echo "0")
|
||||
|
||||
Reference in New Issue
Block a user