Skip to content

Commit 230369d

Browse files
authored
Improve Cython Lifetime Management by Adding References in DeviceBuffer (#661)
As discussed with @shwina, @harrism, and @kkraus14, this PR adds 2 properties to `DeviceBuffer` to allow for automatic reference counting of `MemoryResource` and `Stream` objects. This will prevent any `MemoryResource` from being destructed while any `DeviceBuffer` that needs the MR for deallocation is still alive. There are a few outstanding issues I could use input on: 1. The test `test_rmm_device_buffer` is failing due to the line `sys.getsizeof(b) == b.size`. Need input on the best way forward. This test is failing since `DeviceBuffer` is now involved in GC; Python automatically adds the GC memory overhead to `__sizeof__` (see [here](https://github.com/python/cpython/blob/master/Python/sysmodule.c#L1701)), which makes it difficult for the check to keep working the same way it did before. The only options I can think of are: a. remove this check from the test or alter the "correct" value, or b. add `@cython.no_gc`, which is very risky. 2. The current PR implementation includes CUDA stream object reference counting but treats all `Stream` objects the same. @harrism mentioned only streams owned by RMM should be tracked this way, but I am not sure if that's necessary or how to distinguish them at this point. Other than the above items, all tests are passing and I ran this through the cuML test suite without any issues. Thanks for your help. Authors: - Michael Demoret (@mdemoret-nv) Approvers: - Keith Kraus (@kkraus14) URL: #661
1 parent dc5889b commit 230369d

File tree

5 files changed

+58
-11
lines changed

5 files changed

+58
-11
lines changed

python/rmm/_lib/device_buffer.pxd

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ from libc.stdint cimport uintptr_t
1717

1818
from rmm._lib.cuda_stream_view cimport cuda_stream_view
1919
from rmm._cuda.stream cimport Stream
20+
from rmm._lib.memory_resource cimport MemoryResource
2021

2122

2223
cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
@@ -38,6 +39,15 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
3839
cdef class DeviceBuffer:
3940
cdef unique_ptr[device_buffer] c_obj
4041

42+
# Holds a reference to the MemoryResource used for allocation. Ensures the
43+
# MR does not get destroyed before this DeviceBuffer. `mr` is needed for
44+
# deallocation
45+
cdef MemoryResource mr
46+
47+
# Holds a reference to the stream used by the underlying `device_buffer`.
48+
# Ensures the stream does not get destroyed before this DeviceBuffer
49+
cdef Stream stream
50+
4151
@staticmethod
4252
cdef DeviceBuffer c_from_unique_ptr(unique_ptr[device_buffer] ptr)
4353

python/rmm/_lib/device_buffer.pyx

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ from libcpp.utility cimport move
2222

2323
from rmm._cuda.gpu cimport cudaError, cudaError_t
2424
from rmm._cuda.stream cimport Stream
25+
2526
from rmm._cuda.stream import DEFAULT_STREAM
27+
2628
from rmm._lib.lib cimport (
2729
cudaMemcpyAsync,
2830
cudaMemcpyDeviceToDevice,
@@ -32,6 +34,7 @@ from rmm._lib.lib cimport (
3234
cudaStream_t,
3335
cudaStreamSynchronize,
3436
)
37+
from rmm._lib.memory_resource cimport get_current_device_resource
3538

3639

3740
cdef class DeviceBuffer:
@@ -81,6 +84,10 @@ cdef class DeviceBuffer:
8184
if stream.c_is_default():
8285
stream.c_synchronize()
8386

87+
# Save a reference to the MR and stream used for allocation
88+
self.mr = get_current_device_resource()
89+
self.stream = stream
90+
8491
def __len__(self):
8592
return self.size
8693

python/rmm/_lib/memory_resource.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,5 @@ cdef class LoggingResourceAdaptor(MemoryResource):
104104
cpdef MemoryResource get_upstream(self)
105105
cpdef get_file_name(self)
106106
cpdef flush(self)
107+
108+
cpdef MemoryResource get_current_device_resource()

python/rmm/_lib/memory_resource.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,7 @@ cpdef get_per_device_resource_type(int device):
437437
return type(get_per_device_resource(device))
438438

439439

440-
cpdef get_current_device_resource():
440+
cpdef MemoryResource get_current_device_resource():
441441
"""
442442
Get the memory resource used for RMM device allocations on the current
443443
device.

python/rmm/tests/test_rmm.py

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) 2020, NVIDIA CORPORATION.
2+
import gc
23
import os
34
import sys
45
from itertools import product
@@ -8,6 +9,7 @@
89
from numba import cuda
910

1011
import rmm
12+
import rmm._cuda.stream
1113

1214
if sys.version_info < (3, 8):
1315
try:
@@ -20,6 +22,17 @@
2022
cuda.set_memory_manager(rmm.RMMNumbaManager)
2123

2224

25+
@pytest.fixture(scope="function", autouse=True)
26+
def rmm_auto_reinitialize():
27+
28+
# Run the test
29+
yield
30+
31+
# Automatically reinitialize the current memory resource after running each
32+
# test
33+
rmm.reinitialize()
34+
35+
2336
def array_tester(dtype, nelem, alloc):
2437
# data
2538
h_in = np.full(nelem, 3.2, dtype)
@@ -70,7 +83,6 @@ def test_rmm_modes(dtype, nelem, alloc, managed, pool):
7083
assert rmm.is_initialized()
7184

7285
array_tester(dtype, nelem, alloc)
73-
rmm.reinitialize()
7486

7587

7688
@pytest.mark.parametrize("dtype", _dtypes)
@@ -92,7 +104,6 @@ def test_rmm_csv_log(dtype, nelem, alloc, tmpdir):
92104
assert csv.find(b"Time,Action,Pointer,Size,Stream") >= 0
93105
finally:
94106
os.remove(fname)
95-
rmm.reinitialize()
96107

97108

98109
@pytest.mark.parametrize("size", [0, 5])
@@ -109,7 +120,7 @@ def test_rmm_device_buffer(size):
109120
assert len(b) == b.size
110121
assert b.nbytes == b.size
111122
assert b.capacity() >= b.size
112-
assert sys.getsizeof(b) == b.size
123+
assert b.__sizeof__() == b.size
113124

114125
# Test `__cuda_array_interface__`
115126
keyset = {"data", "shape", "strides", "typestr", "version"}
@@ -299,7 +310,6 @@ def test_pool_memory_resource(dtype, nelem, alloc):
299310
rmm.mr.set_current_device_resource(mr)
300311
assert rmm.mr.get_current_device_resource_type() is type(mr)
301312
array_tester(dtype, nelem, alloc)
302-
rmm.reinitialize()
303313

304314

305315
@pytest.mark.parametrize("dtype", _dtypes)
@@ -319,7 +329,6 @@ def test_fixed_size_memory_resource(dtype, nelem, alloc, upstream):
319329
rmm.mr.set_current_device_resource(mr)
320330
assert rmm.mr.get_current_device_resource_type() is type(mr)
321331
array_tester(dtype, nelem, alloc)
322-
rmm.reinitialize()
323332

324333

325334
@pytest.mark.parametrize("dtype", _dtypes)
@@ -350,15 +359,13 @@ def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr):
350359
rmm.mr.set_current_device_resource(mr)
351360
assert rmm.mr.get_current_device_resource_type() is type(mr)
352361
array_tester(dtype, nelem, alloc)
353-
rmm.reinitialize()
354362

355363

356364
def test_reinitialize_max_pool_size():
357365
rmm.reinitialize(
358366
pool_allocator=True, initial_pool_size=0, maximum_pool_size=1 << 23
359367
)
360368
rmm.DeviceBuffer().resize((1 << 23) - 1)
361-
rmm.reinitialize()
362369

363370

364371
def test_reinitialize_max_pool_size_exceeded():
@@ -367,7 +374,6 @@ def test_reinitialize_max_pool_size_exceeded():
367374
)
368375
with pytest.raises(MemoryError):
369376
rmm.DeviceBuffer().resize(1 << 24)
370-
rmm.reinitialize()
371377

372378

373379
def test_reinitialize_initial_pool_size_gt_max():
@@ -378,7 +384,30 @@ def test_reinitialize_initial_pool_size_gt_max():
378384
maximum_pool_size=1 << 10,
379385
)
380386
assert "Initial pool size exceeds the maximum pool size" in str(e.value)
381-
rmm.reinitialize()
387+
388+
389+
def test_mr_devicebuffer_lifetime():
390+
# Test ensures MR/Stream lifetime is longer than DeviceBuffer. Even if all
391+
# references go out of scope
392+
# Create new Pool MR
393+
rmm.mr.set_current_device_resource(
394+
rmm.mr.PoolMemoryResource(rmm.mr.get_current_device_resource())
395+
)
396+
397+
# Creates a new non-default stream
398+
stream = rmm._cuda.stream.Stream()
399+
400+
# Allocate DeviceBuffer with Pool and Stream
401+
a = rmm.DeviceBuffer(size=10, stream=stream)
402+
403+
# Change current MR. Will cause Pool to go out of scope
404+
rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())
405+
406+
# Force collection to ensure objects are cleaned up
407+
gc.collect()
408+
409+
# Delete a. Used to crash before. Pool MR should still be alive
410+
del a
382411

383412

384413
@pytest.mark.parametrize("dtype", _dtypes)
@@ -404,4 +433,3 @@ def test_rmm_enable_disable_logging(dtype, nelem, alloc, tmpdir):
404433
os.remove(fname)
405434

406435
rmm.disable_logging()
407-
rmm.reinitialize()

0 commit comments

Comments
 (0)