Fix: Remove UUOC (Useless Use Of Cat) patterns throughout script

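Replaced 'cat file | awk ...' pipelines with 'awk ... file' so that awk reads the file directly.
This eliminates one unnecessary child process and pipe per call site and improves performance.
Note: where the old code used 'cat file 2>/dev/null', the redirection now applies to awk, so awk's own error output is suppressed at those call sites as well.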

Changes:
- Lines 1629-1635: hourly bot traffic analysis
- Lines 1915-1955: false positive detection (single awk script)
- Lines 1969-1998: statistics generation (added file argument)
- Lines 2006-2007: top bots calculation
- Lines 2010-2011: traffic breakdown calculation
- Line 2016: domain bot types indexing
- Lines 2636, 2645: bandwidth impact calculation

These are all simple 'cat file | awk' pipelines; each one is rewritten
to pass the file to awk as an argument instead of piping it from cat.
This commit is contained in:
Developer
2026-04-23 21:26:37 -04:00
parent d159dd28d8
commit 9471355e77
+11 -11
View File
@@ -1626,13 +1626,13 @@ analyze_time_series() {
print_info "Analyzing time-series patterns..."
# Extract hourly bot traffic
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '$9 != "unknown" {
awk -F'|' '$9 != "unknown" {
timestamp = $8
if (match(timestamp, /([0-9]{2})\/([A-Za-z]{3})\/([0-9]{4}):([0-9]{2}):([0-9]{2}):([0-9]{2})/, ts)) {
hour = ts[4]
print hour
}
}' | sort | uniq -c > "$TEMP_DIR/hourly_bot_traffic.txt" || true
}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null | sort | uniq -c > "$TEMP_DIR/hourly_bot_traffic.txt" || true
# Extract hourly attack traffic
if [ -f "$TEMP_DIR/attack_vectors_raw.txt" ]; then
@@ -1912,7 +1912,7 @@ detect_false_positives() {
print_info "Detecting legitimate services (false positives)..."
# Known monitoring service patterns and legitimate CDNs
cat "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | awk -F'|' '{
awk -F'|' '{
ip = $1
domain = $2
url = $3
@@ -1952,7 +1952,7 @@ detect_false_positives() {
else if (match(url, /checkout|payment|paypal|stripe|square/) && match(ua, /paypal|stripe|square/)) {
print ip "|Payment Processor|" ua "|" domain
}
}' | sort -u > "$TEMP_DIR/false_positives.txt" || true
}' "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | sort -u > "$TEMP_DIR/false_positives.txt" || true
print_success "False positive detection complete ($(wc -l < "$TEMP_DIR/false_positives.txt" 2>/dev/null || echo 0) legitimate services identified)"
}
@@ -1966,7 +1966,7 @@ generate_statistics() {
# OPTIMIZATION: Use single-pass AWK to generate multiple stats from parsed logs
# This reads the uncompressed file ONCE instead of 4+ separate reads
cat "$TEMP_DIR/parsed_logs.txt" 2>/dev/null | awk -F'|' -v tmpdir="$TEMP_DIR" '
awk -F'|' -v tmpdir="$TEMP_DIR" '
{
# Count by domain (for top sites)
domains[$2]++
@@ -1995,7 +1995,7 @@ generate_statistics() {
close(tmpdir "/top_sites_raw.txt")
close(tmpdir "/top_ips_raw.txt")
close(tmpdir "/top_urls_raw.txt")
}'
}' "$TEMP_DIR/parsed_logs.txt" 2>/dev/null
# Sort and limit results (files may not exist if no data)
[ -f "$TEMP_DIR/top_sites_raw.txt" ] && sort -rn "$TEMP_DIR/top_sites_raw.txt" | head -5 > "$TEMP_DIR/top_sites.txt" || touch "$TEMP_DIR/top_sites.txt"
@@ -2003,17 +2003,17 @@ generate_statistics() {
[ -f "$TEMP_DIR/top_urls_raw.txt" ] && sort -rn "$TEMP_DIR/top_urls_raw.txt" | head -5 > "$TEMP_DIR/top_urls.txt" || touch "$TEMP_DIR/top_urls.txt"
# Top 5 bots by request count (single decompression)
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '$9 != "unknown" {print $10}' | \
awk -F'|' '$9 != "unknown" {print $10}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null | \
sort | uniq -c | sort -rn | head -5 > "$TEMP_DIR/top_bots.txt" || true
# Traffic breakdown by bot type (single decompression)
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '{print $9}' | \
awk -F'|' '{print $9}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null | \
sort | uniq -c | sort -rn > "$TEMP_DIR/traffic_breakdown.txt" || true
# Per-domain traffic sources (OPTIMIZED: read uncompressed file once, use grep)
if [ -f "$TEMP_DIR/all_domains.txt" ]; then
# Create indexed bot traffic file (decompress once)
cat "$TEMP_DIR/classified_bots.txt" 2>/dev/null | awk -F'|' '{print $2"|"$9}' > "$TEMP_DIR/domain_bot_types.txt" || true
awk -F'|' '{print $2"|"$9}' "$TEMP_DIR/classified_bots.txt" 2>/dev/null > "$TEMP_DIR/domain_bot_types.txt" || true
while read -r domain; do
echo "$domain" > "$TEMP_DIR/domain_${domain}_stats.txt"
@@ -2633,7 +2633,7 @@ generate_report() {
# Calculate total bot bandwidth
total_bot_bandwidth=0
if [ -f "$TEMP_DIR/classified_bots.txt.gz" ]; then
total_bot_bandwidth=$(cat "$TEMP_DIR/classified_bots.txt" | awk -F'|' '$9 != "unknown" && $5 ~ /^[0-9]+$/ {sum += $5} END {print sum}')
total_bot_bandwidth=$(awk -F'|' '$9 != "unknown" && $5 ~ /^[0-9]+$/ {sum += $5} END {print sum}' "$TEMP_DIR/classified_bots.txt")
fi
if [ -n "$total_bot_bandwidth" ] && [ "$total_bot_bandwidth" -gt 0 ]; then
@@ -2642,7 +2642,7 @@ generate_report() {
# Estimate cost at $0.09/GB (typical CDN pricing)
estimated_cost=$(awk "BEGIN {printf \"%.2f\", ($total_bot_bandwidth/1073741824) * 0.09}")
total_bandwidth=$(cat "$TEMP_DIR/parsed_logs.txt" | awk -F'|' '$5 ~ /^[0-9]+$/ {sum += $5} END {print sum}')
total_bandwidth=$(awk -F'|' '$5 ~ /^[0-9]+$/ {sum += $5} END {print sum}' "$TEMP_DIR/parsed_logs.txt")
bot_pct=$(awk "BEGIN {printf \"%.1f\", ($total_bot_bandwidth/$total_bandwidth)*100}")
echo ""