# Detect the dominant error type in an InnoDB error log and recommend an
# innodb_force_recovery level to try next.
#
# Arguments:
#   $1 - path to the MySQL/InnoDB error log
#   $2 - recovery level used on the previous attempt (optional, default 0).
#        Anything that is not a plain non-negative integer is treated as 0.
# Outputs:
#   Always prints exactly one line of the form "<level>|<error_type>".
#   <error_type> is empty when nothing was recognised or the log is missing,
#   so callers can split on '|' without special-casing either situation.
# Returns:
#   0 always.
detect_recovery_level_from_errors() {
    local error_log="$1"
    local last_recovery_level="${2:-0}"
    local error_type=""
    local recommended_level=0

    # The numeric tests below fail noisily on non-numeric input; fall back to
    # "no recovery attempted yet" instead.
    case "$last_recovery_level" in
        *[!0-9]*) last_recovery_level=0 ;;
    esac

    if [ ! -f "$error_log" ]; then
        # Keep the "<level>|<type>" shape: a bare "0" would make
        # `cut -d'|' -f2` hand back "0" as the error type.
        echo "0|"
        return 0
    fi

    # The first matching category wins. grep reads the log directly; an
    # unreadable log simply matches nothing and yields "0|".

    # MISSING FILES (missing tablespaces, unopenable files) -> level 1
    if grep -qiE "Cannot open tablespace|Tablespace.*missing|was not found at|Cannot find space id" -- "$error_log" 2>/dev/null; then
        error_type="missing_files"
        recommended_level=1

    # REDO LOG INCOMPATIBILITY (version mismatch, format issues) -> level 5
    elif grep -qiE "redo log.*incompatible|redo log.*different|redo log format.*does not match" -- "$error_log" 2>/dev/null; then
        error_type="redo_incompatible"
        recommended_level=5

    # CORRUPTION (page corruption, corrupted data) -> 1, then 4, then 6
    elif grep -qiE "Corrupted|Database page corruption|Corruption detected" -- "$error_log" 2>/dev/null; then
        error_type="corruption"
        # Climb the ladder one rung at a time. Previous levels 2 and 3 go to
        # 4 rather than jumping straight to the last-resort level 6.
        if [ "$last_recovery_level" -lt 1 ]; then
            recommended_level=1
        elif [ "$last_recovery_level" -lt 4 ]; then
            recommended_level=4
        else
            recommended_level=6
        fi

    # INSERT BUFFER ISSUES (insert buffer merge failures) -> level 4
    elif grep -qiE "insert buffer|ibuf|buffer pool.*error" -- "$error_log" 2>/dev/null; then
        error_type="insert_buffer"
        recommended_level=4

    # MEMORY ISSUES (allocation failures, OOM) -> system fix, not a recovery mode
    elif grep -qiE "Cannot allocate memory|Out of memory|memory error" -- "$error_log" 2>/dev/null; then
        error_type="memory_issue"
        recommended_level=0

    # ROLLBACK ISSUES (transaction rollback problems) -> level 3
    elif grep -qiE "rollback.*error|Cannot rollback|Rollback failed" -- "$error_log" 2>/dev/null; then
        error_type="rollback_issue"
        recommended_level=3
    fi

    # Auto-escalate when the recommendation equals the level that was just
    # tried, capped at 6 (the highest level this script recommends).
    if [ "$last_recovery_level" -gt 0 ] && [ "$recommended_level" -eq "$last_recovery_level" ]; then
        recommended_level=$((recommended_level + 1))
        if [ "$recommended_level" -gt 6 ]; then
            recommended_level=6
        fi
    fi

    echo "$recommended_level|$error_type"
}
merge operations" + local level_6_desc="Skips page checksums (maximum recovery, most data loss risk)" + print_warning "RECOMMENDED ACTIONS (IN ORDER):" echo "" + + # If we haven't tried any recovery yet, start with level 1 if [ "$current_recovery" = "0" ] || [ -z "$current_recovery" ]; then - echo " Option 1: Try Force Recovery Level 1" + echo " Step 1: Try Force Recovery Level 1" echo " ────────────────────────────────────────────────" echo " Re-run script → Step 4 → Select recovery mode 1" - echo " (Ignores corrupt pages)" + echo " $level_1_desc" + echo "" + echo " (If level 1 fails, proceed to level 4)" echo "" fi + + # If level 1 was tried, recommend level 4 if [ "$current_recovery" = "1" ]; then - echo " Option 2: Try Force Recovery Level 4" + echo " Step 1: Try Force Recovery Level 4" echo " ────────────────────────────────────────────────" echo " Re-run script → Step 4 → Select recovery mode 4" - echo " (Prevents insert buffer merge)" + echo " $level_4_desc" + echo "" + echo " (If level 4 fails, proceed to level 6)" echo "" fi + + # If level 4+ was tried, recommend level 6 (last resort) if [ "${current_recovery:-0}" -ge 4 ]; then - echo " Option 2: Try Force Recovery Level 6 (LAST RESORT)" + echo " Step 1: Try Force Recovery Level 6 (LAST RESORT)" echo " ────────────────────────────────────────────────" echo " Re-run script → Step 4 → Select recovery mode 6" - echo " (Skips page checksums - maximum data recovery)" + echo " $level_6_desc" + echo "" + echo " NOTE: This may recover more data but at risk of data consistency" echo "" fi - echo " Option 3: Start Fresh" + + echo " Step 2: If All Recovery Levels Fail" echo " ────────────────────────────────────────────────" echo " 1. Corruption may be in the backup itself" echo " 2. 
# Print the PID stored on the first line of a PID file, but only when it is a
# plain integer greater than 1. Prints nothing otherwise, so values such as
# "0", "-1" or garbage can never reach `kill` (PID 0 addresses the caller's own
# process group).
_second_instance_read_pid() {
    local pid_file="$1"
    local candidate=""

    [ -f "$pid_file" ] || return 0
    { read -r candidate < "$pid_file"; } 2>/dev/null || true

    case "$candidate" in
        '' | *[!0-9]*) return 0 ;;
    esac
    [ "$candidate" -gt 1 ] 2>/dev/null || return 0

    printf '%s\n' "$candidate"
}

# Succeed when $1 is a non-empty PID that still accepts signal 0.
_second_instance_pid_alive() {
    [ -n "$1" ] && kill -0 "$1" 2>/dev/null
}

# Stop the second MySQL instance and verify that it actually went away.
#
# Arguments:
#   $1 - data directory of the second instance; socket.mysql, mysql.pid and
#        mysql.lock are looked up inside it.
# Globals:
#   SECOND_INSTANCE_RUNNING (written) - set to 0 once the instance is confirmed
#     stopped. Set to 1 when the process survived every attempt, because the
#     exit-time cleanup (cleanup_on_exit) is gated on this flag being 1.
# Returns:
#   0 always (best-effort, same as before).
stop_second_instance() {
    local datadir="$1"
    local socket_path="$datadir/socket.mysql"
    local pid=""
    local wait_count=0

    if [ ! -S "$socket_path" ]; then
        # Socket doesn't exist, instance likely already stopped
        SECOND_INSTANCE_RUNNING=0
        return 0
    fi

    print_info "Shutting down second MySQL instance..."

    # Read the PID before asking for shutdown; the pid file may not be
    # available afterwards.
    pid=$(_second_instance_read_pid "$datadir/mysql.pid")

    # Graceful shutdown request; failure is tolerated because it is verified below
    mysqladmin -h localhost -S "$socket_path" shutdown 2>/dev/null || true

    # Wait up to 15 seconds for the socket to disappear AND, when the PID is
    # known, for the process itself to exit.
    while [ "$wait_count" -lt 15 ]; do
        if [ ! -S "$socket_path" ] && ! _second_instance_pid_alive "$pid"; then
            break
        fi
        sleep 1
        wait_count=$((wait_count + 1))
    done

    # Socket still present: escalate, TERM first and KILL only as last resort
    if [ -S "$socket_path" ]; then
        print_warning "Socket still exists after shutdown. Forcing termination..."

        if _second_instance_pid_alive "$pid"; then
            kill -TERM "$pid" 2>/dev/null || true
            wait_count=0
            while _second_instance_pid_alive "$pid" && [ "$wait_count" -lt 5 ]; do
                sleep 1
                wait_count=$((wait_count + 1))
            done

            if _second_instance_pid_alive "$pid"; then
                kill -KILL "$pid" 2>/dev/null || true
                sleep 1
            fi
        fi

        # Remove stale socket and lock files only when no known process is
        # left behind; otherwise keep the socket so a later retry can still
        # reach the instance.
        if ! _second_instance_pid_alive "$pid"; then
            rm -f "$socket_path" "$datadir/mysql.lock" 2>/dev/null || true
        fi
    fi

    # Verify process is actually dead
    if _second_instance_pid_alive "$pid"; then
        print_warning "MySQL process still running after shutdown attempt. Will retry on exit."
        # Keep the instance marked as running so the promised retry can happen.
        SECOND_INSTANCE_RUNNING=1
        return 0
    fi

    print_success "Second instance shut down successfully"

    # Mark as no longer running
    SECOND_INSTANCE_RUNNING=0
}