Compare commits
3 Commits
b45735981e
...
29f069be52
| Author | SHA1 | Date | |
|---|---|---|---|
| 29f069be52 | |||
| db10d4a8d9 | |||
| b6cfd3be3b |
@@ -140,12 +140,175 @@ After installing, run: systemctl enable smartd && systemctl start smartd"
|
|||||||
local healthy_count=0
|
local healthy_count=0
|
||||||
local warning_count=0
|
local warning_count=0
|
||||||
local failed_count=0
|
local failed_count=0
|
||||||
|
local skipped_count=0
|
||||||
|
local skipped_raid=0
|
||||||
|
local skipped_virtual=0
|
||||||
|
local skipped_lvm=0
|
||||||
|
local skipped_other=0
|
||||||
|
|
||||||
for disk in $disks; do
|
for disk in $disks; do
|
||||||
disk_count=$((disk_count + 1))
|
disk_count=$((disk_count + 1))
|
||||||
|
|
||||||
# Check if SMART is available
|
# Get device info to determine if SMART is applicable
|
||||||
if ! smartctl -i "$disk" &>/dev/null; then
|
local device_info=$(smartctl -i "$disk" 2>&1)
|
||||||
|
|
||||||
|
# COMPREHENSIVE DEVICE DETECTION - Works for ALL storage types
|
||||||
|
|
||||||
|
# 1. CHECK: Device exists and smartctl can communicate
|
||||||
|
if echo "$device_info" | grep -qiE "No such device|open device.*failed|Permission denied"; then
|
||||||
|
echo -e "${CYAN}[INFO]${NC} Skipping $disk (device not accessible)"
|
||||||
|
skipped_count=$((skipped_count + 1))
|
||||||
|
skipped_other=$((skipped_other + 1))
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 2. CHECK: SMART support availability
|
||||||
|
if echo "$device_info" | grep -qiE "SMART support is:.*Unavailable|SMART support is:.*Disabled|Unable to detect device type"; then
|
||||||
|
echo -e "${CYAN}[INFO]${NC} Skipping $disk (SMART not supported or disabled)"
|
||||||
|
skipped_count=$((skipped_count + 1))
|
||||||
|
skipped_other=$((skipped_other + 1))
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 3. EXTRACT: Device type, model, vendor for intelligent detection
|
||||||
|
local model=$(echo "$device_info" | grep -iE "Device Model:|Product:|Model Number:|Model Family:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs)
|
||||||
|
local vendor=$(echo "$device_info" | grep -iE "Vendor:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs)
|
||||||
|
local device_type=$(echo "$device_info" | grep -iE "Device type:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs)
|
||||||
|
local serial=$(echo "$device_info" | grep -iE "Serial Number:" | head -1 | sed -E 's/^[^:]*:[ \t]*//' | xargs)
|
||||||
|
|
||||||
|
# Combine model and vendor for comprehensive matching
|
||||||
|
local full_id="${vendor} ${model} ${device_type}"
|
||||||
|
|
||||||
|
# 4. DETECT: Hardware RAID Controllers (all major brands)
|
||||||
|
# These devices are RAID controller logical volumes, not physical disks
|
||||||
|
if echo "$full_id" | grep -qiE "MR[0-9]{4}|LSI|MegaRAID|PERC|Avago|Broadcom.*RAID|HPE.*Smart Array|HP.*Smart Array|Dell.*RAID|IBM.*ServeRAID|Adaptec|3ware|Areca|HighPoint|Promise.*RAID"; then
|
||||||
|
local raid_type="Hardware RAID Controller"
|
||||||
|
local tools="Unknown RAID tools"
|
||||||
|
|
||||||
|
# Identify specific RAID type and provide exact tools
|
||||||
|
if echo "$full_id" | grep -qiE "MR[0-9]{4}|MegaRAID"; then
|
||||||
|
raid_type="MegaRAID Controller"
|
||||||
|
tools="megacli -LDInfo -Lall -aALL or storcli /c0 /vall show all"
|
||||||
|
elif echo "$full_id" | grep -qiE "LSI|Avago|Broadcom"; then
|
||||||
|
raid_type="LSI/Broadcom RAID Controller"
|
||||||
|
tools="sas2ircu LIST or storcli show"
|
||||||
|
elif echo "$full_id" | grep -qiE "PERC|Dell"; then
|
||||||
|
raid_type="Dell PERC RAID Controller"
|
||||||
|
tools="perccli /c0 /vall show all or omreport storage vdisk"
|
||||||
|
elif echo "$full_id" | grep -qiE "HP|HPE.*Smart"; then
|
||||||
|
raid_type="HP Smart Array Controller"
|
||||||
|
tools="hpacucli ctrl all show config or ssacli ctrl all show config"
|
||||||
|
elif echo "$full_id" | grep -qiE "Adaptec"; then
|
||||||
|
raid_type="Adaptec RAID Controller"
|
||||||
|
tools="arcconf getconfig 1"
|
||||||
|
elif echo "$full_id" | grep -qiE "3ware"; then
|
||||||
|
raid_type="3ware RAID Controller"
|
||||||
|
tools="tw_cli info c0"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo -e "${CYAN}[INFO]${NC} Skipping $disk ($raid_type: $model)"
|
||||||
|
skipped_count=$((skipped_count + 1))
|
||||||
|
skipped_raid=$((skipped_raid + 1))
|
||||||
|
add_finding "INFO" "ℹ️ $raid_type Detected: $disk" \
|
||||||
|
"Device: $disk
|
||||||
|
Controller: $model
|
||||||
|
Type: $raid_type
|
||||||
|
SMART Status: Not applicable (logical volume from RAID controller)
|
||||||
|
|
||||||
|
This is a logical volume presented by a hardware RAID controller.
|
||||||
|
SMART data is not available for these devices - the controller manages
|
||||||
|
the physical disks and presents them as a single logical volume.
|
||||||
|
|
||||||
|
To monitor RAID health, use controller-specific tools:
|
||||||
|
Command: $tools
|
||||||
|
|
||||||
|
Physical disk health is monitored by the RAID controller itself.
|
||||||
|
Check controller logs and status for drive failures." \
|
||||||
|
"Monitor RAID array health using controller tools, not SMART"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 5. DETECT: Virtual/Emulated Devices (VMs and containers)
|
||||||
|
if echo "$full_id" | grep -qiE "QEMU|VirtIO|Virtual|VMware|Xen.*Virtual|Msft.*Virtual|Google.*Persistent|Amazon.*Elastic"; then
|
||||||
|
local virt_type="Virtual Disk"
|
||||||
|
|
||||||
|
if echo "$full_id" | grep -qiE "QEMU"; then
|
||||||
|
virt_type="QEMU Virtual Disk (KVM)"
|
||||||
|
elif echo "$full_id" | grep -qiE "VMware"; then
|
||||||
|
virt_type="VMware Virtual Disk"
|
||||||
|
elif echo "$full_id" | grep -qiE "VirtIO"; then
|
||||||
|
virt_type="VirtIO Virtual Disk"
|
||||||
|
elif echo "$full_id" | grep -qiE "Msft.*Virtual"; then
|
||||||
|
virt_type="Hyper-V Virtual Disk"
|
||||||
|
elif echo "$full_id" | grep -qiE "Xen"; then
|
||||||
|
virt_type="Xen Virtual Disk"
|
||||||
|
elif echo "$full_id" | grep -qiE "Google"; then
|
||||||
|
virt_type="Google Persistent Disk"
|
||||||
|
elif echo "$full_id" | grep -qiE "Amazon"; then
|
||||||
|
virt_type="AWS EBS Volume"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo -e "${CYAN}[INFO]${NC} Skipping $disk ($virt_type)"
|
||||||
|
skipped_count=$((skipped_count + 1))
|
||||||
|
skipped_virtual=$((skipped_virtual + 1))
|
||||||
|
# Already handled by VM detection at start of function
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 6. DETECT: Software RAID / LVM / Device Mapper
|
||||||
|
if echo "$disk" | grep -qE "md[0-9]|dm-[0-9]"; then
|
||||||
|
echo -e "${CYAN}[INFO]${NC} Skipping $disk (software RAID/LVM device)"
|
||||||
|
skipped_count=$((skipped_count + 1))
|
||||||
|
skipped_lvm=$((skipped_lvm + 1))
|
||||||
|
add_finding "INFO" "ℹ️ Software RAID/LVM Detected: $disk" \
|
||||||
|
"Device: $disk
|
||||||
|
Type: Software RAID or LVM logical volume
|
||||||
|
|
||||||
|
This is a logical device managed by the kernel (mdadm or LVM).
|
||||||
|
SMART monitoring should be performed on the underlying physical disks.
|
||||||
|
|
||||||
|
For software RAID (md devices):
|
||||||
|
• Check RAID status: cat /proc/mdstat
|
||||||
|
• Monitor physical disks: smartctl -a /dev/sd[X]
|
||||||
|
|
||||||
|
For LVM (dm- devices):
|
||||||
|
• Check LV status: lvdisplay
|
||||||
|
• Monitor physical volumes: pvdisplay
|
||||||
|
• Check underlying disks: smartctl -a /dev/sd[X]" \
|
||||||
|
"Monitor underlying physical disks, not the logical volume"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 7. DETECT: Loop devices, RAM disks, other special devices
|
||||||
|
if echo "$disk" | grep -qE "loop|ram|zram|nbd"; then
|
||||||
|
echo -e "${CYAN}[INFO]${NC} Skipping $disk (special device: loop/ram/network)"
|
||||||
|
skipped_count=$((skipped_count + 1))
|
||||||
|
skipped_other=$((skipped_other + 1))
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 8. FINAL CHECK: Is this a real disk with SMART data?
|
||||||
|
# Try to get SMART attributes - if this fails, skip
|
||||||
|
if ! smartctl -A "$disk" &>/dev/null; then
|
||||||
|
echo -e "${CYAN}[INFO]${NC} Skipping $disk (no SMART attributes available)"
|
||||||
|
skipped_count=$((skipped_count + 1))
|
||||||
|
skipped_other=$((skipped_other + 1))
|
||||||
|
add_finding "INFO" "ℹ️ Device Without SMART: $disk" \
|
||||||
|
"Device: $disk
|
||||||
|
Model: ${model:-Unknown}
|
||||||
|
|
||||||
|
This device does not provide SMART attributes.
|
||||||
|
Common reasons:
|
||||||
|
• USB-connected drives (SMART data not passed through)
|
||||||
|
• Some hardware RAID configurations
|
||||||
|
• Older drives without SMART support
|
||||||
|
• Passthrough issues in virtual environments
|
||||||
|
|
||||||
|
If this is a critical disk, verify health through other means:
|
||||||
|
• Check dmesg for errors: dmesg | grep -i '$disk'
|
||||||
|
• Monitor I/O errors: iostat -x $disk
|
||||||
|
• Check filesystem errors: mount | grep $disk" \
|
||||||
|
"Monitor through system logs and I/O statistics"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -275,14 +438,15 @@ After installing, run: systemctl enable smartd && systemctl start smartd"
|
|||||||
[ -n "$nvme_spare" ] && risk_factors+="$nvme_spare"$'\n'
|
[ -n "$nvme_spare" ] && risk_factors+="$nvme_spare"$'\n'
|
||||||
|
|
||||||
# Determine severity and report
|
# Determine severity and report
|
||||||
if [[ ! "$health" =~ PASSED ]]; then
|
# Be SMART about health status - only flag if explicitly FAILED
|
||||||
# SMART health check FAILED
|
if [[ "$health" =~ FAILED ]]; then
|
||||||
|
# SMART health check explicitly FAILED
|
||||||
failed_count=$((failed_count + 1))
|
failed_count=$((failed_count + 1))
|
||||||
add_finding "CRITICAL" "🔴 DISK FAILURE: $disk - REPLACE IMMEDIATELY" \
|
add_finding "CRITICAL" "🔴 DISK FAILURE: $disk - REPLACE IMMEDIATELY" \
|
||||||
"Device: $disk
|
"Device: $disk
|
||||||
Model: $model
|
Model: $model
|
||||||
Serial: $serial
|
Serial: $serial
|
||||||
Health: ${health:-UNKNOWN} ❌
|
Health: FAILED ❌
|
||||||
|
|
||||||
SMART Status: FAILED
|
SMART Status: FAILED
|
||||||
Reallocated Sectors: ${reallocated:-N/A}
|
Reallocated Sectors: ${reallocated:-N/A}
|
||||||
@@ -393,12 +557,33 @@ SMART Attributes:
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# Summary finding
|
# Summary finding with skip breakdown
|
||||||
|
local summary_details="Total devices found: $disk_count
|
||||||
|
Physical disks monitored: $healthy_count healthy, $warning_count warning, $failed_count failed"
|
||||||
|
|
||||||
|
if [ "$skipped_count" -gt 0 ]; then
|
||||||
|
summary_details="${summary_details}
|
||||||
|
Devices skipped (SMART not applicable): $skipped_count"
|
||||||
|
if [ "$skipped_raid" -gt 0 ]; then
|
||||||
|
summary_details="${summary_details}
|
||||||
|
• Hardware RAID controllers: $skipped_raid (use vendor tools)"
|
||||||
|
fi
|
||||||
|
if [ "$skipped_lvm" -gt 0 ]; then
|
||||||
|
summary_details="${summary_details}
|
||||||
|
• Software RAID/LVM: $skipped_lvm (monitor underlying disks)"
|
||||||
|
fi
|
||||||
|
if [ "$skipped_virtual" -gt 0 ]; then
|
||||||
|
summary_details="${summary_details}
|
||||||
|
• Virtual/cloud disks: $skipped_virtual (managed by hypervisor)"
|
||||||
|
fi
|
||||||
|
if [ "$skipped_other" -gt 0 ]; then
|
||||||
|
summary_details="${summary_details}
|
||||||
|
• Other (USB/special): $skipped_other (see findings for details)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
add_finding "INFO" "Disk Health Summary" \
|
add_finding "INFO" "Disk Health Summary" \
|
||||||
"Total disks checked: $disk_count
|
"$summary_details" \
|
||||||
Healthy: $healthy_count
|
|
||||||
Warning: $warning_count
|
|
||||||
Failed: $failed_count" \
|
|
||||||
"Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
|
"Regular SMART monitoring recommended: smartctl -a /dev/[disk]"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1650,15 +1835,29 @@ main() {
|
|||||||
press_enter
|
press_enter
|
||||||
|
|
||||||
# Severity-based exit codes for monitoring system integration
|
# Severity-based exit codes for monitoring system integration
|
||||||
# exit 0 = healthy (INFO only)
|
# Only use exit codes when script is run standalone (not sourced by launcher)
|
||||||
# exit 1 = warnings detected
|
# When sourced, the return value is available via $? but won't exit the parent shell
|
||||||
# exit 2 = critical issues detected
|
if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
|
||||||
|
# Script is being run directly, use exit codes
|
||||||
case "$overall" in
|
case "$overall" in
|
||||||
CRITICAL) exit 2 ;;
|
CRITICAL) exit 2 ;;
|
||||||
WARNING) exit 1 ;;
|
WARNING) exit 1 ;;
|
||||||
*) exit 0 ;;
|
*) exit 0 ;;
|
||||||
esac
|
esac
|
||||||
|
else
|
||||||
|
# Script is being sourced (called from launcher), use return codes
|
||||||
|
case "$overall" in
|
||||||
|
CRITICAL) return 2 ;;
|
||||||
|
WARNING) return 1 ;;
|
||||||
|
*) return 0 ;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Run main function
|
# Run main function in both cases (executed directly or sourced); main itself chooses exit vs. return
|
||||||
main
|
if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
|
||||||
|
main
|
||||||
|
else
|
||||||
|
# When sourced, main still runs here; it reports status via return instead of exit
|
||||||
|
main
|
||||||
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user