Fix: Remove UUOC (Useless Use Of Cat) patterns throughout script
Replaced 'cat file | awk' pipelines with 'awk file' invocations for efficiency. This eliminates unnecessary child processes and improves performance.

Changes:
- Lines 1629-1635: hourly bot traffic analysis
- Lines 1915-1955: false positive detection (single awk script)
- Lines 1969-1998: statistics generation (added file argument)
- Lines 2006-2007: top bots calculation
- Lines 2010-2011: traffic breakdown calculation
- Line 2016: domain bot types indexing
- Lines 2636, 2645: bandwidth impact calculation

These are all simple cat-to-awk pipelines that can be rewritten to pass the file directly to awk as an argument instead of piping it from cat.
This commit is contained in:
@@ -1626,13 +1626,13 @@ analyze_time_series() {
|
||||
print_info "Analyzing time-series patterns..."
|
||||
|
||||
# Extract hourly bot traffic
|
||||
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '$9 != "unknown" {
|
||||
awk -F'|' '$9 != "unknown" {
|
||||
timestamp = $8
|
||||
if (match(timestamp, /([0-9]{2})\/([A-Za-z]{3})\/([0-9]{4}):([0-9]{2}):([0-9]{2}):([0-9]{2})/, ts)) {
|
||||
hour = ts[4]
|
||||
print hour
|
||||
}
|
||||
}' | sort | uniq -c > "$TEMP_DIR/hourly_bot_traffic.txt" || true
|
||||
}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null | sort | uniq -c > "$TEMP_DIR/hourly_bot_traffic.txt" || true
|
||||
|
||||
# Extract hourly attack traffic
|
||||
if [ -f "$TEMP_DIR/attack_vectors_raw.txt" ]; then
|
||||
@@ -1912,7 +1912,7 @@ detect_false_positives() {
|
||||
print_info "Detecting legitimate services (false positives)..."
|
||||
|
||||
# Known monitoring service patterns and legitimate CDNs
|
||||
cat "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | awk -F'|' '{
|
||||
awk -F'|' '{
|
||||
ip = $1
|
||||
domain = $2
|
||||
url = $3
|
||||
@@ -1952,7 +1952,7 @@ detect_false_positives() {
|
||||
else if (match(url, /checkout|payment|paypal|stripe|square/) && match(ua, /paypal|stripe|square/)) {
|
||||
print ip "|Payment Processor|" ua "|" domain
|
||||
}
|
||||
}' | sort -u > "$TEMP_DIR/false_positives.txt" || true
|
||||
}' "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | sort -u > "$TEMP_DIR/false_positives.txt" || true
|
||||
|
||||
print_success "False positive detection complete ($(wc -l < "$TEMP_DIR/false_positives.txt" 2>/dev/null || echo 0) legitimate services identified)"
|
||||
}
|
||||
@@ -1966,7 +1966,7 @@ generate_statistics() {
|
||||
|
||||
# OPTIMIZATION: Use single-pass AWK to generate multiple stats from parsed logs
|
||||
# This reads the uncompressed file ONCE instead of 4+ separate reads
|
||||
cat "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
awk -F'|' -v tmpdir="$TEMP_DIR" '
|
||||
{
|
||||
# Count by domain (for top sites)
|
||||
domains[$2]++
|
||||
@@ -1995,7 +1995,7 @@ generate_statistics() {
|
||||
close(tmpdir "/top_sites_raw.txt")
|
||||
close(tmpdir "/top_ips_raw.txt")
|
||||
close(tmpdir "/top_urls_raw.txt")
|
||||
}'
|
||||
}' "$TEMP_DIR/parsed_logs.txt" 2>/dev/null
|
||||
|
||||
# Sort and limit results (files may not exist if no data)
|
||||
[ -f "$TEMP_DIR/top_sites_raw.txt" ] && sort -rn "$TEMP_DIR/top_sites_raw.txt" | head -5 > "$TEMP_DIR/top_sites.txt" || touch "$TEMP_DIR/top_sites.txt"
|
||||
@@ -2003,17 +2003,17 @@ generate_statistics() {
|
||||
[ -f "$TEMP_DIR/top_urls_raw.txt" ] && sort -rn "$TEMP_DIR/top_urls_raw.txt" | head -5 > "$TEMP_DIR/top_urls.txt" || touch "$TEMP_DIR/top_urls.txt"
|
||||
|
||||
# Top 5 bots by request count (single decompression)
|
||||
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '$9 != "unknown" {print $10}' | \
|
||||
awk -F'|' '$9 != "unknown" {print $10}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null | \
|
||||
sort | uniq -c | sort -rn | head -5 > "$TEMP_DIR/top_bots.txt" || true
|
||||
|
||||
# Traffic breakdown by bot type (single decompression)
|
||||
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '{print $9}' | \
|
||||
awk -F'|' '{print $9}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null | \
|
||||
sort | uniq -c | sort -rn > "$TEMP_DIR/traffic_breakdown.txt" || true
|
||||
|
||||
# Per-domain traffic sources (OPTIMIZED: read uncompressed file once, use grep)
|
||||
if [ -f "$TEMP_DIR/all_domains.txt" ]; then
|
||||
# Create indexed bot traffic file (decompress once)
|
||||
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '{print $2"|"$9}' > "$TEMP_DIR/domain_bot_types.txt" || true
|
||||
awk -F'|' '{print $2"|"$9}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null > "$TEMP_DIR/domain_bot_types.txt" || true
|
||||
|
||||
while read -r domain; do
|
||||
echo "$domain" > "$TEMP_DIR/domain_${domain}_stats.txt"
|
||||
@@ -2633,7 +2633,7 @@ generate_report() {
|
||||
# Calculate total bot bandwidth
|
||||
total_bot_bandwidth=0
|
||||
if [ -f "$TEMP_DIR/classified_bots.txt.gz" ]; then
|
||||
total_bot_bandwidth=$(cat "$TEMP_DIR/classified_bots.txt" | awk -F'|' '$9 != "unknown" && $5 ~ /^[0-9]+$/ {sum += $5} END {print sum}')
|
||||
total_bot_bandwidth=$(awk -F'|' '$9 != "unknown" && $5 ~ /^[0-9]+$/ {sum += $5} END {print sum}' "$TEMP_DIR/classified_bots.txt")
|
||||
fi
|
||||
|
||||
if [ -n "$total_bot_bandwidth" ] && [ "$total_bot_bandwidth" -gt 0 ]; then
|
||||
@@ -2642,7 +2642,7 @@ generate_report() {
|
||||
# Estimate cost at $0.09/GB (typical CDN pricing)
|
||||
estimated_cost=$(awk "BEGIN {printf \"%.2f\", ($total_bot_bandwidth/1073741824) * 0.09}")
|
||||
|
||||
total_bandwidth=$(cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '$5 ~ /^[0-9]+$/ {sum += $5} END {print sum}')
|
||||
total_bandwidth=$(awk -F'|' '$5 ~ /^[0-9]+$/ {sum += $5} END {print sum}' "$TEMP_DIR/parsed_logs.txt")
|
||||
bot_pct=$(awk "BEGIN {printf \"%.1f\", ($total_bot_bandwidth/$total_bandwidth)*100}")
|
||||
|
||||
echo ""
|
||||
|
||||
Reference in New Issue
Block a user