Skip to content

Commit ae7b656

Browse files
committed
[cuda.cooperative] Add tests for multidimensional block loads and stores and add
documentation for block loads and stores.
1 parent 1eb8d81 commit ae7b656

File tree

7 files changed

+180
-15
lines changed

7 files changed

+180
-15
lines changed

python/cuda_cooperative/cuda/cooperative/experimental/_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def find_dim3(name, txt):
7070
find_unsigned(f"{name}_z", txt),
7171
)
7272

73+
7374
def normalize_dim_param(dim):
7475
x = dim[0] if type(dim) is not int else dim
7576
y = dim[1] if type(dim) is not int and len(dim) >= 2 else 1

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_load_store.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
import numba
77

8-
from cuda.cooperative.experimental._common import make_binary_tempfile, normalize_dim_param
8+
from cuda.cooperative.experimental._common import (
9+
make_binary_tempfile,
10+
normalize_dim_param,
11+
)
912
from cuda.cooperative.experimental._types import (
1013
Algorithm,
1114
Dependency,
@@ -34,7 +37,50 @@
3437
"warp_transpose_timesliced": "::cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED",
3538
}
3639

40+
3741
def load(dtype, threads_in_block, items_per_thread=1, algorithm="direct"):
42+
"""Creates an operation that performs a block-wide load.
43+
44+
Returns a callable object that can be linked to and invoked from device code. It can be
45+
invoked with the following signatures:
46+
47+
- `(src: numba.types.Array, dest: numba.types.Array) -> dtype`: Each thread loads
48+
`items_per_thread` items from `src` into `dest`. `dest` must contain at least
49+
`items_per_thread` items.
50+
51+
Different data movement strategies can be selected via the `algorithm` parameter:
52+
53+
- `algorithm="direct"` (default): A blocked arrangement of data is read directly from memory.
54+
- `algorithm="striped"`: A striped arrangement of data is read directly from memory.
55+
- `algorithm="vectorize"`: A blocked arrangement of data is read directly from memory using CUDA's built-in vectorized loads as a coalescing optimization.
56+
- `algorithm="transpose"`: A striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
57+
- `algorithm="warp_transpose"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
58+
- `algorithm="warp_transpose_timesliced"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement one warp at a time.
59+
60+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockLoad.html).
61+
62+
Args:
63+
dtype: Data type being loaded
64+
threads_in_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
65+
items_per_thread: The number of items each thread loads
66+
algorithm: The data movement algorithm to use
67+
68+
Example:
69+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
70+
each thread handling 4 integers.
71+
72+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
73+
:language: python
74+
:dedent:
75+
:start-after: example-begin imports
76+
:end-before: example-end imports
77+
78+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
79+
:language: python
80+
:dedent:
81+
:start-after: example-begin load_store
82+
:end-before: example-end load_store
83+
"""
3884
dim = normalize_dim_param(threads_in_block)
3985
template = Algorithm(
4086
"BlockLoad",
@@ -78,6 +124,48 @@ def load(dtype, threads_in_block, items_per_thread=1, algorithm="direct"):
78124

79125

80126
def store(dtype, threads_in_block, items_per_thread=1, algorithm="direct"):
127+
"""Creates an operation that performs a block-wide store.
128+
129+
Returns a callable object that can be linked to and invoked from device code. It can be
130+
invoked with the following signatures:
131+
132+
- `(dest: numba.types.Array, src: numba.types.Array) -> dtype`: Each thread stores
133+
`items_per_thread` items from `src` into `dest`. `src` must contain at least
134+
`items_per_thread` items.
135+
136+
Different data movement strategies can be selected via the `algorithm` parameter:
137+
138+
- `algorithm="direct"` (default): A blocked arrangement of data is written directly to memory.
139+
- `algorithm="striped"`: A striped arrangement of data is written directly to memory.
140+
- `algorithm="vectorize"`: A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization.
141+
- `algorithm="transpose"`: A blocked arrangement is locally transposed into a striped arrangement which is then written to memory.
142+
- `algorithm="warp_transpose"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory.
143+
- `algorithm="warp_transpose_timesliced"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps.
144+
145+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockStore.html).
146+
147+
Args:
148+
dtype: Data type being stored
149+
threads_in_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
150+
items_per_thread: The number of items each thread stores
151+
algorithm: The data movement algorithm to use
152+
153+
Example:
154+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
155+
each thread handling 4 integers.
156+
157+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
158+
:language: python
159+
:dedent:
160+
:start-after: example-begin imports
161+
:end-before: example-end imports
162+
163+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
164+
:language: python
165+
:dedent:
166+
:start-after: example-begin load_store
167+
:end-before: example-end load_store
168+
"""
81169
dim = normalize_dim_param(threads_in_block)
82170
template = Algorithm(
83171
"BlockStore",

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_reduce.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ def reduce(dtype, threads_in_block, binary_op, items_per_thread=1, methods=None)
2828
2929
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
3030
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
31-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
32-
first `items_per_thread` items will be included in the reduction.
31+
reduction. The array must contain at least `items_per_thread` items; only the first
32+
`items_per_thread` items will be included in the reduction.
3333
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
3434
single item to the reduction. The items contributed by all other threads are ignored.
3535
@@ -135,7 +135,7 @@ def sum(dtype, threads_in_block, items_per_thread=1, methods=None):
135135
136136
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
137137
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
138-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
138+
reduction. The array must contain at least `items_per_thread` items; only the
139139
first `items_per_thread` items will be included in the reduction.
140140
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
141141
single item to the reduction. The items contributed by all other threads are ignored.

python/cuda_cooperative/tests/helpers.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

55
import numpy as np
6-
from numba import types
6+
from numba import cuda, types
77

88
NUMBA_TYPES_TO_NP = {
99
types.int8: np.int8,
@@ -21,3 +21,14 @@
2121

2222
def random_int(shape, dtype):
2323
return np.random.randint(0, 128, size=shape).astype(dtype)
24+
25+
26+
@cuda.jit(device=True)
27+
def row_major_tid():
28+
dim = cuda.blockDim
29+
idx = cuda.threadIdx
30+
return (
31+
(0 if dim.z == 1 else idx.z * dim.x * dim.y)
32+
+ (0 if dim.y == 1 else idx.y * dim.x)
33+
+ idx.x
34+
)

python/cuda_cooperative/tests/test_block_load.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_in_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_in_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_load(T, threads_in_block, items_per_thread, algorithm):
3235
block_load = cudax.block.load(T, threads_in_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_load.temp_storage_bytes
3437

38+
num_threads_in_block = (
39+
threads_in_block
40+
if type(threads_in_block) is int
41+
else reduce(mul, threads_in_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def output_index(i):
39-
return cuda.threadIdx.x + threads_in_block * i
48+
return row_major_tid() + num_threads_in_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def output_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_load.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
d_output[output_index(i)] = thread_data[i]
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_in_block * items_per_thread
64+
items_per_tile = num_threads_in_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
5+
# example-begin imports
6+
import numba
7+
import numpy as np
8+
from numba import cuda
9+
from pynvjitlink import patch
10+
11+
import cuda.cooperative.experimental as cudax
12+
13+
patch.patch_numba_linker(lto=True)
14+
# example-end imports
15+
16+
numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
17+
18+
19+
def test_block_load_store():
20+
# example-begin load_store
21+
threads_in_block = 32
22+
items_per_thread = 4
23+
block_load = cudax.block.load(
24+
numba.int32, threads_in_block, items_per_thread, "striped"
25+
)
26+
block_store = cudax.block.store(
27+
numba.int32, threads_in_block, items_per_thread, "striped"
28+
)
29+
30+
@cuda.jit(link=block_load.files + block_store.files)
31+
def kernel(input, output):
32+
tmp = cuda.local.array(items_per_thread, numba.int32)
33+
block_load(input, tmp)
34+
cuda.syncthreads()
35+
block_store(output, tmp)
36+
37+
# example-end load_store
38+
39+
h_input = np.random.randint(
40+
0, 42, threads_in_block * items_per_thread, dtype=np.int32
41+
)
42+
d_input = cuda.to_device(h_input)
43+
d_output = cuda.device_array_like(d_input)
44+
kernel[1, threads_in_block](d_input, d_output)
45+
h_output = d_output.copy_to_host()
46+
47+
np.testing.assert_allclose(h_output, h_input)

python/cuda_cooperative/tests/test_block_store.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_in_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_in_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_store(T, threads_in_block, items_per_thread, algorithm):
3235
block_store = cudax.block.store(T, threads_in_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_store.temp_storage_bytes
3437

38+
num_threads_in_block = (
39+
threads_in_block
40+
if type(threads_in_block) is int
41+
else reduce(mul, threads_in_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def input_index(i):
39-
return cuda.threadIdx.x + threads_in_block * i
48+
return row_major_tid() + num_threads_in_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def input_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_store.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
block_store(temp_storage, d_output, thread_data)
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_in_block * items_per_thread
64+
items_per_tile = num_threads_in_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)

0 commit comments

Comments
 (0)