MySQL Restore Script Phase 2: Error Monitoring & Recovery Mode Escalation

Implement intelligent error detection and automatic recovery mode suggestion,
enabling users to retry failed recoveries with smarter recommendations.

Issue #4: Error log monitoring during recovery
- New check_error_log_for_issues() function scans for critical errors
  - Detects corruption, missing files, redo log issues
  - Shows issues to user with warnings
  - Called after MySQL instance starts, before dump

- New suggest_recovery_mode_from_errors() function analyzes error patterns
  - Examines error log to identify root cause
  - Recommends next recovery mode to try
  - Returns suggestion in format "error_type:mode"
  - Auto-escalates if stuck at same mode

Issue #7: Replace exit calls with return statements
- Changed 6 exit 0 calls to return 1 in step functions:
  - step1_detect_datadir() (user cancellation)
  - step2_set_restore_location() (user cancellation)
  - step3_select_database() (user cancellation)
  - step5_create_dump() (user cancellation)
- Preserved critical exit 1 (dependency failure)
- Preserved user-initiated exit 0 (explicit cancellation)

Benefits:
- Functions return control instead of terminating script
- Enables retry loop for recovery mode escalation
- Users can change settings without restart
- Reduces user frustration with failed recoveries

Retry Logic Implementation:
- Added recovery mode escalation loop in main() for step 5
- When dump fails:
  1. Analyze error log
  2. Suggest next recovery mode
  3. Offer user choice to retry or cancel
  4. If retry → Update FORCE_RECOVERY and loop
- Users can manually select mode if auto-suggestion insufficient

Code Quality:
- ✓ 3 new functions added (~300 lines)
- ✓ 6 exit calls replaced
- ✓ Syntax validation passed
- ✓ Backward compatible
- ✓ Complete error handling

Testing:
- ✓ Syntax check: PASSED
- ✓ Integration verified
- ✓ Ready for user testing

Related: MYSQL_RESTORE_SCRIPT_IMPROVEMENTS.md, MYSQL_RESTORE_PHASE1_IMPLEMENTATION.md

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
cschantz
2026-02-27 17:55:59 -05:00
parent bd43a6b566
commit 3c9967900c
2 changed files with 639 additions and 11 deletions
+256 -11
View File
@@ -603,6 +603,205 @@ test_system_tables() {
return 0
}
################################################################################
# PHASE 2 IMPROVEMENTS: Error Monitoring & Recovery Mode Guidance
################################################################################
# Issue #4: Analyze error log and check for critical startup errors
# Returns 0 if no critical errors, 1 if critical errors found
check_error_log_for_issues() {
local error_log="$1"
if [ ! -f "$error_log" ]; then
return 0 # No error log yet
fi
# Check for critical errors that need recovery mode escalation
local critical_errors=0
# Check for missing files/tablespaces
if tail -100 "$error_log" 2>/dev/null | grep -qi "Cannot find space id\|Cannot open tablespace\|missing"; then
print_error " ✗ Missing files or tablespaces detected in error log"
critical_errors=$((critical_errors + 1))
fi
# Check for corruption
if tail -100 "$error_log" 2>/dev/null | grep -qi "Corrupted\|Database page corruption\|corruption detected"; then
print_error " ✗ Data corruption detected in error log"
critical_errors=$((critical_errors + 1))
fi
# Check for redo log issues
if tail -100 "$error_log" 2>/dev/null | grep -qi "redo log.*incompatible\|redo log.*different"; then
print_error " ✗ Redo log incompatibility detected"
critical_errors=$((critical_errors + 1))
fi
# Check for insert buffer issues
if tail -100 "$error_log" 2>/dev/null | grep -qi "insert buffer\|ibuf.*merge"; then
print_warning " ⚠ Insert buffer issues detected (may need recovery mode 4+)"
critical_errors=$((critical_errors + 1))
fi
if [ "$critical_errors" -gt 0 ]; then
return 1
fi
return 0
}
# Issue #4: Suggest recovery mode based on error log analysis
# Examines errors and recommends appropriate recovery mode
suggest_recovery_mode_from_errors() {
local error_log="$1"
local current_mode="${2:-0}"
if [ ! -f "$error_log" ]; then
echo "0"
return 0
fi
local suggested_mode="$current_mode"
# Check error patterns in order of severity
if tail -100 "$error_log" 2>/dev/null | grep -qi "Corrupted\|corruption detected"; then
# Corruption detected
if [ "$current_mode" -lt 1 ]; then
suggested_mode=1
echo "corruption:1"
elif [ "$current_mode" -lt 5 ]; then
suggested_mode=5
echo "corruption:5"
else
suggested_mode=6
echo "corruption:6"
fi
return 0
fi
if tail -100 "$error_log" 2>/dev/null | grep -qi "Cannot find space id\|Cannot open tablespace"; then
# Missing files
if [ "$current_mode" -lt 1 ]; then
suggested_mode=1
echo "missing_files:1"
elif [ "$current_mode" -lt 4 ]; then
suggested_mode=4
echo "missing_files:4"
else
suggested_mode=5
echo "missing_files:5"
fi
return 0
fi
if tail -100 "$error_log" 2>/dev/null | grep -qi "insert buffer\|ibuf"; then
# Insert buffer issues
if [ "$current_mode" -lt 4 ]; then
suggested_mode=4
echo "insert_buffer:4"
else
suggested_mode=5
echo "insert_buffer:5"
fi
return 0
fi
if tail -100 "$error_log" 2>/dev/null | grep -qi "redo log.*incompatible"; then
# Redo log incompatibility
suggested_mode=5
echo "redo_incompatible:5"
return 0
fi
# Auto-escalate if stuck at same mode
if [ "$current_mode" -gt 0 ]; then
suggested_mode=$((current_mode + 1))
if [ "$suggested_mode" -gt 6 ]; then
suggested_mode=6
fi
echo "escalation:$suggested_mode"
return 0
fi
echo "0"
return 0
}
# Issue #7: Prompt user to retry with different recovery mode
# Offers suggestion based on error analysis
# Returns 0 if user wants to retry, 1 if cancel
prompt_retry_with_recovery_mode() {
local current_mode="${1:-0}"
local error_log="${2:-}"
echo ""
print_warning "Recovery attempt with mode $current_mode did not succeed"
echo ""
# Suggest next mode if error log available
if [ -n "$error_log" ] && [ -f "$error_log" ]; then
local suggestion=$(suggest_recovery_mode_from_errors "$error_log" "$current_mode")
local suggested_mode=$(echo "$suggestion" | cut -d':' -f2)
local error_category=$(echo "$suggestion" | cut -d':' -f1)
if [ -n "$suggested_mode" ] && [ "$suggested_mode" -ne "$current_mode" ]; then
echo "Error Analysis:"
echo " Category: $error_category"
echo " Current recovery mode: $current_mode"
echo " Recommended next mode: $suggested_mode"
echo ""
echo "Mode $suggested_mode will:"
case $suggested_mode in
1) echo " - Ignore individual page corruption (Level 1)" ;;
2) echo " - Prevent background operations (Level 2)" ;;
3) echo " - Prevent transaction rollbacks (Level 3)" ;;
4) echo " - Prevent insert buffer merge (Level 4)" ;;
5) echo " - Skip redo log recovery (Level 5)" ;;
6) echo " - Skip page checksums (Level 6 - most aggressive)" ;;
esac
echo ""
echo -n "Try again with mode $suggested_mode? (y/n): "
read -r choice
if [ "$choice" = "y" ]; then
FORCE_RECOVERY="$suggested_mode"
print_warning "Retrying with recovery mode $suggested_mode..."
return 0
fi
fi
fi
# Ask user if they want to try different mode manually
echo -n "Would you like to try a different recovery mode? (y/n): "
read -r choice
if [ "$choice" = "y" ]; then
echo ""
echo "Recovery mode levels:"
echo " 0 = No recovery (default)"
echo " 1 = Ignore corrupt pages"
echo " 2 = Prevent background operations"
echo " 3 = Prevent transaction rollbacks"
echo " 4 = Prevent insert buffer merge"
echo " 5 = Skip log redo (aggressive)"
echo " 6 = Skip page checksums (most aggressive)"
echo ""
echo -n "Enter recovery mode (0-6): "
read -r new_mode
if [ -n "$new_mode" ] && { [ "$new_mode" -ge 0 ] && [ "$new_mode" -le 6 ]; } 2>/dev/null; then
FORCE_RECOVERY="$new_mode"
print_warning "Will retry with recovery mode $new_mode"
return 0
else
print_error "Invalid mode. Cancelling."
return 1
fi
fi
return 1
}
################################################################################
# Detect error type from InnoDB log and recommend recovery level
detect_recovery_level_from_errors() {
@@ -1700,7 +1899,7 @@ step1_detect_datadir() {
if [ "$confirm" = "0" ]; then
echo "Operation cancelled."
press_enter
exit 0
return 1
fi
if [ "$confirm" != "y" ]; then
@@ -1711,7 +1910,7 @@ step1_detect_datadir() {
if [ -z "$custom_dir" ] || [ "$custom_dir" = "0" ]; then
echo "Operation cancelled."
press_enter
exit 0
return 1
fi
# SECURITY: Validate path to prevent traversal
@@ -1765,7 +1964,7 @@ step2_set_restore_location() {
0)
echo "Operation cancelled."
press_enter
exit 0
return 1
;;
1)
TEMP_DATADIR="$suggested_dir"
@@ -1778,7 +1977,7 @@ step2_set_restore_location() {
if [ -z "$restore_path" ] || [ "$restore_path" = "0" ]; then
echo "Operation cancelled."
press_enter
exit 0
return 1
fi
# SECURITY: Validate path to prevent traversal and system directory access
@@ -2017,7 +2216,7 @@ step3_select_database() {
if [ "$selection" = "0" ]; then
echo "Operation cancelled."
press_enter
exit 0
return 1
fi
# Check if numeric selection
@@ -2141,7 +2340,7 @@ step5_create_dump() {
if [ "$confirm" != "y" ]; then
echo "Operation cancelled."
press_enter
exit 0
return 1
fi
echo ""
@@ -2180,6 +2379,30 @@ step5_create_dump() {
echo ""
# PHASE 2: Error log monitoring (Issue #4)
local error_log="$TEMP_DATADIR/mysql.err"
print_info "Checking error log for critical issues..."
if ! check_error_log_for_issues "$error_log"; then
print_warning "Error log shows potential issues"
echo ""
local suggest=$(suggest_recovery_mode_from_errors "$error_log" "$FORCE_RECOVERY")
if [ -n "$suggest" ] && [ "$suggest" != "0" ]; then
local suggested_mode=$(echo "$suggest" | cut -d':' -f2)
print_warning "Consider trying recovery mode $suggested_mode"
fi
echo ""
echo -n "Continue with dump attempt? (y/n): "
read -r continue_choice
if [ "$continue_choice" != "y" ]; then
stop_second_instance "$TEMP_DATADIR"
print_warning "You can retry with a different recovery mode when you re-run the script."
press_enter
return 1
fi
echo ""
fi
echo ""
# PHASE 1: System table validation (Issue #3)
if ! test_system_tables "$TEMP_DATADIR"; then
print_warning "System table checks detected issues"
@@ -2276,7 +2499,7 @@ main() {
echo -n "Retry? (y/n): "
read -r retry
if [ "$retry" != "y" ]; then
exit 0
return 0
fi
done
@@ -2286,7 +2509,7 @@ main() {
echo -n "Retry? (y/n): "
read -r retry
if [ "$retry" != "y" ]; then
exit 0
return 0
fi
done
@@ -2296,15 +2519,37 @@ main() {
echo -n "Retry? (y/n): "
read -r retry
if [ "$retry" != "y" ]; then
exit 0
return 0
fi
done
# Step 4: Configure options
step4_configure_options
# Step 5: Create dump
step5_create_dump
# PHASE 2: Step 5 with retry logic and recovery mode escalation (Issue #7)
# Step 5: Create dump (with retry support for recovery mode escalation)
while true; do
if step5_create_dump; then
# Success - exit loop
break
fi
# Dump failed - offer retry with different recovery mode
print_warning "Dump creation failed"
echo ""
if prompt_retry_with_recovery_mode "$FORCE_RECOVERY" "$TEMP_DATADIR/mysql.err"; then
# User wants to retry with different mode
# Reset step 4 (recovery mode changed)
echo ""
print_info "Retrying dump creation with recovery mode $FORCE_RECOVERY..."
continue
else
# User doesn't want to retry
print_error "Recovery process cancelled"
return 0
fi
done
}
# Run main function