Skip to content

Commit 9943068

Browse files
brycelelbach and davebayer
authored and committed
[cuda.cooperative] Support multidimensional thread blocks in block load/store and improve load/store docs (NVIDIA#3161)
* [cuda.cooperative] Support multidimensional thread blocks in block load/store * [cuda.cooperative] Add tests for multidimensional block loads and stores and add documentation for block loads and stores. * [cuda.cooperative] Remove an unnecessary synchronization from the block load/store example and fix the return types of block load/store in the docs.
1 parent 00d5e0f commit 9943068

File tree

7 files changed

+196
-17
lines changed

7 files changed

+196
-17
lines changed

python/cuda_cooperative/cuda/cooperative/experimental/_common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,10 @@ def find_dim3(name, txt):
6969
find_unsigned(f"{name}_y", txt),
7070
find_unsigned(f"{name}_z", txt),
7171
)
72+
73+
74+
def normalize_dim_param(dim):
    """Normalize a block-dimension parameter to a 3-tuple ``(x, y, z)``.

    Args:
        dim: Either a single integer (the x dimension) or a sequence of
            1 to 3 integers ``(x[, y[, z]])``.

    Returns:
        A 3-tuple ``(x, y, z)`` where any dimension not supplied
        defaults to 1.
    """
    # A plain integer means a 1D block: (dim, 1, 1).
    if isinstance(dim, int):
        return (dim, 1, 1)
    # Sequence form: pad missing trailing dimensions with 1 so callers
    # always receive a full (x, y, z) triple.
    x = dim[0]
    y = dim[1] if len(dim) >= 2 else 1
    z = dim[2] if len(dim) >= 3 else 1
    return (x, y, z)

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_load_store.py

Lines changed: 100 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
import numba
77

8-
from cuda.cooperative.experimental._common import make_binary_tempfile
8+
from cuda.cooperative.experimental._common import (
9+
make_binary_tempfile,
10+
normalize_dim_param,
11+
)
912
from cuda.cooperative.experimental._types import (
1013
Algorithm,
1114
Dependency,
@@ -36,6 +39,49 @@
3639

3740

3841
def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
42+
"""Creates an operation that performs a block-wide load.
43+
44+
Returns a callable object that can be linked to and invoked from device code. It can be
45+
invoked with the following signatures:
46+
47+
- `(src: numba.types.Array, dest: numba.types.Array) -> None`: Each thread loads
48+
`items_per_thread` items from `src` into `dest`. `dest` must contain at least
49+
`items_per_thread` items.
50+
51+
Different data movement strategies can be selected via the `algorithm` parameter:
52+
53+
- `algorithm="direct"` (default): A blocked arrangement of data is read directly from memory.
54+
- `algorithm="striped"`: A striped arrangement of data is read directly from memory.
55+
- `algorithm="vectorize"`: A blocked arrangement of data is read directly from memory using CUDA's built-in vectorized loads as a coalescing optimization.
56+
- `algorithm="transpose"`: A striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
57+
- `algorithm="warp_transpose"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
58+
- `algorithm="warp_transpose_timesliced"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement one warp at a time.
59+
60+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockLoad.html).
61+
62+
Args:
63+
dtype: Data type being loaded
64+
threads_per_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
65+
items_per_thread: The number of items each thread loads
66+
algorithm: The data movement algorithm to use
67+
68+
Example:
69+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
70+
each thread handling 4 integers.
71+
72+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
73+
:language: python
74+
:dedent:
75+
:start-after: example-begin imports
76+
:end-before: example-end imports
77+
78+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
79+
:language: python
80+
:dedent:
81+
:start-after: example-begin load_store
82+
:end-before: example-end load_store
83+
"""
84+
dim = normalize_dim_param(threads_per_block)
3985
template = Algorithm(
4086
"BlockLoad",
4187
"Load",
@@ -46,6 +92,8 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
4692
TemplateParameter("BLOCK_DIM_X"),
4793
TemplateParameter("ITEMS_PER_THREAD"),
4894
TemplateParameter("ALGORITHM"),
95+
TemplateParameter("BLOCK_DIM_Y"),
96+
TemplateParameter("BLOCK_DIM_Z"),
4997
],
5098
[
5199
[
@@ -58,9 +106,11 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
58106
specialization = template.specialize(
59107
{
60108
"T": dtype,
61-
"BLOCK_DIM_X": threads_per_block,
109+
"BLOCK_DIM_X": dim[0],
62110
"ITEMS_PER_THREAD": items_per_thread,
63111
"ALGORITHM": CUB_BLOCK_LOAD_ALGOS[algorithm],
112+
"BLOCK_DIM_Y": dim[1],
113+
"BLOCK_DIM_Z": dim[2],
64114
}
65115
)
66116
return Invocable(
@@ -74,6 +124,49 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
74124

75125

76126
def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
127+
"""Creates an operation that performs a block-wide store.
128+
129+
Returns a callable object that can be linked to and invoked from device code. It can be
130+
invoked with the following signatures:
131+
132+
- `(dest: numba.types.Array, src: numba.types.Array) -> None`: Each thread stores
133+
`items_per_thread` items from `src` into `dest`. `src` must contain at least
134+
`items_per_thread` items.
135+
136+
Different data movement strategies can be selected via the `algorithm` parameter:
137+
138+
- `algorithm="direct"` (default): A blocked arrangement of data is written directly to memory.
139+
- `algorithm="striped"`: A striped arrangement of data is written directly to memory.
140+
- `algorithm="vectorize"`: A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization.
141+
- `algorithm="transpose"`: A blocked arrangement is locally transposed into a striped arrangement which is then written to memory.
142+
- `algorithm="warp_transpose"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory.
143+
- `algorithm="warp_transpose_timesliced"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps.
144+
145+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockStore.html).
146+
147+
Args:
148+
dtype: Data type being stored
149+
threads_per_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
150+
items_per_thread: The number of items each thread stores
151+
algorithm: The data movement algorithm to use
152+
153+
Example:
154+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
155+
each thread handling 4 integers.
156+
157+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
158+
:language: python
159+
:dedent:
160+
:start-after: example-begin imports
161+
:end-before: example-end imports
162+
163+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
164+
:language: python
165+
:dedent:
166+
:start-after: example-begin load_store
167+
:end-before: example-end load_store
168+
"""
169+
dim = normalize_dim_param(threads_per_block)
77170
template = Algorithm(
78171
"BlockStore",
79172
"Store",
@@ -84,6 +177,8 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
84177
TemplateParameter("BLOCK_DIM_X"),
85178
TemplateParameter("ITEMS_PER_THREAD"),
86179
TemplateParameter("ALGORITHM"),
180+
TemplateParameter("BLOCK_DIM_Y"),
181+
TemplateParameter("BLOCK_DIM_Z"),
87182
],
88183
[
89184
[
@@ -96,9 +191,11 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
96191
specialization = template.specialize(
97192
{
98193
"T": dtype,
99-
"BLOCK_DIM_X": threads_per_block,
194+
"BLOCK_DIM_X": dim[0],
100195
"ITEMS_PER_THREAD": items_per_thread,
101196
"ALGORITHM": CUB_BLOCK_STORE_ALGOS[algorithm],
197+
"BLOCK_DIM_Y": dim[1],
198+
"BLOCK_DIM_Z": dim[2],
102199
}
103200
)
104201
return Invocable(

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_reduce.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ def reduce(dtype, threads_per_block, binary_op, items_per_thread=1, methods=None
2828
2929
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
3030
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
31-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
32-
first `items_per_thread` items will be included in the reduction.
31+
reduction. The array must contain at least `items_per_thread` items; only the first
32+
`items_per_thread` items will be included in the reduction.
3333
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
3434
single item to the reduction. The items contributed by all other threads are ignored.
3535
@@ -135,7 +135,7 @@ def sum(dtype, threads_per_block, items_per_thread=1, methods=None):
135135
136136
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
137137
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
138-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
138+
reduction. The array must contain at least `items_per_thread` items; only the
139139
first `items_per_thread` items will be included in the reduction.
140140
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
141141
single item to the reduction. The items contributed by all other threads are ignored.

python/cuda_cooperative/tests/helpers.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

55
import numpy as np
6-
from numba import types
6+
from numba import cuda, types
77

88
NUMBA_TYPES_TO_NP = {
99
types.int8: np.int8,
@@ -21,3 +21,14 @@
2121

2222
def random_int(shape, dtype):
2323
return np.random.randint(0, 128, size=shape).astype(dtype)
24+
25+
26+
@cuda.jit(device=True)
def row_major_tid():
    """Return the linear, row-major thread ID within the current block.

    Flattens (threadIdx.x, threadIdx.y, threadIdx.z) with x varying
    fastest: ``tid = z * dim.x * dim.y + y * dim.x + x``. Works for 1D,
    2D, and 3D thread blocks.
    """
    dim = cuda.blockDim
    idx = cuda.threadIdx
    # The `dim == 1` guards skip terms that are always zero (threadIdx
    # is 0 in any unit dimension), avoiding the multiplies for 1D/2D
    # blocks.
    return (
        (0 if dim.z == 1 else idx.z * dim.x * dim.y)
        + (0 if dim.y == 1 else idx.y * dim.x)
        + idx.x
    )

python/cuda_cooperative/tests/test_block_load.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_per_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_per_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_load(T, threads_per_block, items_per_thread, algorithm):
3235
block_load = cudax.block.load(T, threads_per_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_load.temp_storage_bytes
3437

38+
num_threads_per_block = (
39+
threads_per_block
40+
if type(threads_per_block) is int
41+
else reduce(mul, threads_per_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def output_index(i):
39-
return cuda.threadIdx.x + threads_per_block * i
48+
return row_major_tid() + num_threads_per_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def output_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_load.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
d_output[output_index(i)] = thread_data[i]
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_per_block * items_per_thread
64+
items_per_tile = num_threads_per_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
5+
# example-begin imports
6+
import numba
7+
import numpy as np
8+
from numba import cuda
9+
from pynvjitlink import patch
10+
11+
import cuda.cooperative.experimental as cudax
12+
13+
patch.patch_numba_linker(lto=True)
14+
# example-end imports
15+
16+
numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
17+
18+
19+
def test_block_load_store():
    """Round-trip test: a striped block load followed by a striped block
    store must reproduce the input array unchanged.

    NOTE: the code between the ``example-begin``/``example-end`` markers
    is pulled verbatim into the block load/store docs via Sphinx
    ``literalinclude`` — edit with care.
    """
    # example-begin load_store
    threads_per_block = 32
    items_per_thread = 4
    block_load = cudax.block.load(
        numba.int32, threads_per_block, items_per_thread, "striped"
    )
    block_store = cudax.block.store(
        numba.int32, threads_per_block, items_per_thread, "striped"
    )

    @cuda.jit(link=block_load.files + block_store.files)
    def kernel(input, output):
        tmp = cuda.local.array(items_per_thread, numba.int32)
        block_load(input, tmp)
        block_store(output, tmp)

    # example-end load_store

    # One full tile of random input; the kernel should copy it verbatim.
    h_input = np.random.randint(
        0, 42, threads_per_block * items_per_thread, dtype=np.int32
    )
    d_input = cuda.to_device(h_input)
    d_output = cuda.device_array_like(d_input)
    kernel[1, threads_per_block](d_input, d_output)
    h_output = d_output.copy_to_host()

    np.testing.assert_allclose(h_output, h_input)

python/cuda_cooperative/tests/test_block_store.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_per_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_per_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_store(T, threads_per_block, items_per_thread, algorithm):
3235
block_store = cudax.block.store(T, threads_per_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_store.temp_storage_bytes
3437

38+
num_threads_per_block = (
39+
threads_per_block
40+
if type(threads_per_block) is int
41+
else reduce(mul, threads_per_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def input_index(i):
39-
return cuda.threadIdx.x + threads_per_block * i
48+
return row_major_tid() + num_threads_per_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def input_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_store.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
block_store(temp_storage, d_output, thread_data)
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_per_block * items_per_thread
64+
items_per_tile = num_threads_per_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)

0 commit comments

Comments
 (0)