diff --git a/.gitignore b/.gitignore index f2ca637..0f413db 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ /src/NodeDev.ScriptRunner/obj /src/NodeDev.EndToEndTests/Features/*.feature.cs +TestResults/ diff --git a/DEBUGGING_INVESTIGATION.md b/DEBUGGING_INVESTIGATION.md new file mode 100644 index 0000000..9af0511 --- /dev/null +++ b/DEBUGGING_INVESTIGATION.md @@ -0,0 +1,89 @@ +# Linux Hard Debugging Crash Investigation + +## Status: PARTIALLY FIXED - Still Crashes After ~25 Debug Sessions + +### What Works Now +- Finalizer fix prevents immediate crashes from freeing native library on finalizer thread ✅ +- Reference counting fixes prevent crashes from mixed default/custom path instances ✅ +- Race condition fix prevents concurrent initialization issues ✅ +- Server now runs 25+ debug sessions before crashing (was 3-4 before fixes) + +### What Still Fails +Server crashes after approximately 20-30 debug sessions. This is reproducible with test server at `/tmp/TestBlazorDebug`. + +### Observations +1. **RunWithDebug returns in 0.0 seconds** - This is suspicious. Should take 2+ seconds for Sleep node. +2. **Crash is gradual accumulation** - Not immediate, happens after multiple sessions +3. **No exceptions logged** - Server just stops responding and becomes zombie process +4. **Happens outside E2E tests** - Minimal Blazor server reproduces the issue + +### Possible Remaining Issues + +#### 1. Resource Leak in DebugSessionEngine +Even with proper Dispose(), there might be: +- ICorDebug COM objects not being released properly +- Managed callbacks holding references +- Event handlers not being unsubscribed + +#### 2. Static State Accumulation +The static `_globalDbgShimHandle` and `_instanceCount` might have edge cases: +- What if Dispose() throws and instance count doesn't decrement? +- What if Initialize() is called multiple times on same instance? +- Thread safety of the static variables under high concurrency? + +#### 3. Native Library Reloading Issue +- Maybe we shouldn't be reusing the library handle at all? +- Try: Remove static handle reuse, load/free library for each session + +#### 4. ICorDebug Process Detachment +The `Detach()` method silently catches exceptions. Maybe: +- Detach is failing but we don't know it +- Process references are accumulating +- COM object cleanup is incomplete + +### Next Steps for Investigation + +1. **Add comprehensive logging** to DebugSessionEngine to track: + - Every Initialize() call with instance ID + - Every Dispose() call with instance ID + - Reference count after each operation + - Native library handle value + +2. **Try removing static handle reuse**: + - Load library fresh for each DebugSessionEngine instance + - Free library in every Dispose() + - See if this prevents the accumulation + +3. **Check for COM object leaks**: + - Ensure all ICorDebug interfaces are properly released + - Check ManagedDebuggerCallbacks for reference cycles + - Look for event handler leaks + +4. **Test with actual NodeDev.Blazor.Server**: + - User says it crashes "every single time" when running actual app + - My test server takes 25+ runs to crash + - There might be additional state in real app causing faster crashes + +### Test Command +```bash +cd /tmp/TestBlazorDebug +dotnet run --no-build + +# In another terminal: +for i in {1..50}; do + curl -s http://localhost:5300/test-debug + sleep 1 +done +``` + +### Files Modified +- `src/NodeDev.Core/Debugger/DebugSessionEngine.cs` - Finalizer, reference counting, race condition fixes +- `src/NodeDev.Core/Project.cs` - Process state handling, removed GC.Collect() +- `.gitignore` - Added TestResults/ + +### Commits +- 6f3eef7: Fixed finalizer threading issue +- a86bcf8: Fixed reference counting bugs +- a8e6ad0: Fixed race condition with double-check locking + +The fixes significantly improved stability but there's still a resource leak or accumulation issue that causes crashes after ~25 debug sessions. diff --git a/src/NodeDev.Core/Debugger/DebugSessionEngine.cs b/src/NodeDev.Core/Debugger/DebugSessionEngine.cs index 652d10d..581bfea 100644 --- a/src/NodeDev.Core/Debugger/DebugSessionEngine.cs +++ b/src/NodeDev.Core/Debugger/DebugSessionEngine.cs @@ -9,6 +9,10 @@ namespace NodeDev.Core.Debugger; /// public class DebugSessionEngine : IDisposable { + private static readonly object _initLock = new object(); + private static IntPtr _globalDbgShimHandle = IntPtr.Zero; + private static int _instanceCount = 0; + private readonly string? _dbgShimPath; private DbgShim? _dbgShim; private IntPtr _dbgShimHandle; @@ -50,14 +54,40 @@ public DebugSessionEngine(string? dbgShimPath = null) public void Initialize() { if (_dbgShim != null) - return; + return; // Already initialized try { var shimPath = _dbgShimPath ?? DbgShimResolver.Resolve(); - // NativeLibrary.Load throws on failure, so no need to check for IntPtr.Zero - _dbgShimHandle = NativeLibrary.Load(shimPath); - _dbgShim = new DbgShim(_dbgShimHandle); + + lock (_initLock) + { + // Double-check after acquiring lock + if (_dbgShim != null) + return; // Another thread initialized while we waited for lock + + // If using default path and global handle exists, reuse it + // Only reuse if this instance is using the default path (no custom _dbgShimPath) + if (_dbgShimPath == null && _globalDbgShimHandle != IntPtr.Zero) + { + _dbgShimHandle = _globalDbgShimHandle; + _dbgShim = new DbgShim(_dbgShimHandle); + _instanceCount++; + return; + } + + // Load the library (either first time or custom path) + _dbgShimHandle = NativeLibrary.Load(shimPath); + _dbgShim = new DbgShim(_dbgShimHandle); + + // If using default path, save as global handle + if (_dbgShimPath == null) + { + _globalDbgShimHandle = _dbgShimHandle; + _instanceCount = 1; + } + // If using custom path, don't update global tracking + } } catch (FileNotFoundException ex) { @@ -294,24 +324,47 @@ public CorDebugProcess SetupDebugging(CorDebug corDebug, int processId) /// /// Detaches from the current debug process. /// - public void Detach() + /// True if the debugged process is still running; false if it has already exited. + public void Detach(bool processStillRunning = false) { - if (CurrentProcess != null) + if (CurrentProcess == null) + return; + + var processToDetach = CurrentProcess; + CurrentProcess = null; // Clear reference immediately to prevent re-entrance + + try { - try + // Only stop the process if it's still running + // Calling Stop on an already-exited process can cause crashes + if (processStillRunning) { - CurrentProcess.Stop(0); - CurrentProcess.Detach(); + try + { + processToDetach.Stop(0); + } + catch + { + // Silently ignore - process may have exited or be in invalid state + } } - catch + + // Attempt to detach - this is optional cleanup + // If it fails, we've already cleared CurrentProcess so we're detached logically + try { - // Ignore errors during detach + processToDetach.Detach(); } - finally + catch { - CurrentProcess = null; + // Silently ignore - process may be gone or in invalid state + // This is expected when process has already exited } } + finally + { + // Always null out the reference (already done above) + } } /// @@ -356,12 +409,44 @@ protected virtual void Dispose(bool disposing) if (disposing) { Detach(); - } - - if (_dbgShimHandle != IntPtr.Zero) - { - NativeLibrary.Free(_dbgShimHandle); - _dbgShimHandle = IntPtr.Zero; + + // Only free the native library from managed Dispose, not from finalizer + // Freeing native libraries from the finalizer thread can cause crashes on Linux + lock (_initLock) + { + // If this instance is using the global handle, update reference count + if (_dbgShimHandle != IntPtr.Zero && _dbgShimHandle == _globalDbgShimHandle) + { + _instanceCount--; + if (_instanceCount <= 0) + { + try + { + NativeLibrary.Free(_dbgShimHandle); + _globalDbgShimHandle = IntPtr.Zero; + } + catch (Exception ex) + { + // Log but don't throw - we're disposing + Console.WriteLine($"Warning: Failed to free dbgshim library: {ex.Message}"); + } + } + } + // If using a custom path (not global handle), free it directly + else if (_dbgShimHandle != IntPtr.Zero) + { + try + { + NativeLibrary.Free(_dbgShimHandle); + } + catch (Exception ex) + { + // Log but don't throw - we're disposing + Console.WriteLine($"Warning: Failed to free dbgshim library: {ex.Message}"); + } + } + _dbgShimHandle = IntPtr.Zero; + } } _dbgShim = null; @@ -373,6 +458,11 @@ protected virtual void Dispose(bool disposing) /// ~DebugSessionEngine() { + // Note: We don't free the native library handle in the finalizer + // because NativeLibrary.Free can cause crashes when called from + // the finalizer thread, especially on Linux. + // This means we might leak the handle if Dispose() is not called explicitly, + // but that's better than crashing the application. Dispose(false); } } diff --git a/src/NodeDev.Core/Project.cs b/src/NodeDev.Core/Project.cs index 77a73b9..763d910 100644 --- a/src/NodeDev.Core/Project.cs +++ b/src/NodeDev.Core/Project.cs @@ -532,7 +532,8 @@ public string GetScriptRunnerPath() // Detach debugger before notifying UI - this clears CurrentProcess // so that IsHardDebugging returns false when UI re-evaluates state - _debugEngine?.Detach(); + // Process has already exited, so pass false to avoid calling Stop() + _debugEngine?.Detach(processStillRunning: false); // Notify that debugging has stopped HardDebugStateChangedSubject.OnNext(false); @@ -545,7 +546,9 @@ public string GetScriptRunnerPath() ConsoleOutputSubject.OnNext($"Error during debug execution: {ex.Message}" + Environment.NewLine); // Detach debugger before notifying UI - this clears CurrentProcess // so that IsHardDebugging returns false when UI re-evaluates state - _debugEngine?.Detach(); + // Check if process is still running before detaching + bool processStillRunning = _debuggedProcess != null && !_debuggedProcess.HasExited; + _debugEngine?.Detach(processStillRunning); HardDebugStateChangedSubject.OnNext(false); GraphExecutionChangedSubject.OnNext(false); return null; @@ -556,7 +559,8 @@ public string GetScriptRunnerPath() _debugEngine = null; _debuggedProcess = null; NodeClassTypeCreator = null; - GC.Collect(); + // Removed GC.Collect() - forcing collection immediately after disposing native resources + // can cause crashes, especially on Linux where cleanup might still be in progress } } @@ -571,7 +575,9 @@ public void StopDebugging() try { // Detach debugger first - _debugEngine?.Detach(); + // Check if process is still running to properly stop it + bool processStillRunning = _debuggedProcess != null && !_debuggedProcess.HasExited; + _debugEngine?.Detach(processStillRunning); // Try to kill the process if it's still running if (_debuggedProcess != null && !_debuggedProcess.HasExited)