Skip to content

Commit 9943068

Browse files
brycelelbach and davebayer
authored and committed
[cuda.cooperative] Support multidimensional thread blocks in block load/store and improve load/store docs (NVIDIA#3161)
* [cuda.cooperative] Support multidimensional thread blocks in block load/store * [cuda.cooperative] Add tests for multidimensional block loads and stores and add documentation for block loads and stores. * [cuda.cooperative] Remove an unnecessary synchronization from the block load/store example and fix the return types of block load/store in the docs.
1 parent 00d5e0f commit 9943068

File tree

7 files changed

+196
-17
lines changed

7 files changed

+196
-17
lines changed

python/cuda_cooperative/cuda/cooperative/experimental/_common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,10 @@ def find_dim3(name, txt):
6969
find_unsigned(f"{name}_y", txt),
7070
find_unsigned(f"{name}_z", txt),
7171
)
72+
73+
74+
def normalize_dim_param(dim):
    """Normalize a block-dimension parameter to a 3-tuple ``(x, y, z)``.

    Args:
        dim: Either a single integer (the x dimension) or a sequence of
            1 to 3 integers ``(x[, y[, z]])``.

    Returns:
        A 3-tuple ``(x, y, z)`` where any dimension not supplied
        defaults to 1.
    """
    # A plain integer means a 1D block: (dim, 1, 1).
    if isinstance(dim, int):
        return (dim, 1, 1)
    # Sequence form: pad missing trailing dimensions with 1 so callers
    # always receive a full (x, y, z) triple.
    x = dim[0]
    y = dim[1] if len(dim) >= 2 else 1
    z = dim[2] if len(dim) >= 3 else 1
    return (x, y, z)

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_load_store.py

Lines changed: 100 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
import numba
77

8-
from cuda.cooperative.experimental._common import make_binary_tempfile
8+
from cuda.cooperative.experimental._common import (
9+
make_binary_tempfile,
10+
normalize_dim_param,
11+
)
912
from cuda.cooperative.experimental._types import (
1013
Algorithm,
1114
Dependency,
@@ -36,6 +39,49 @@
3639

3740

3841
def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
42+
"""Creates an operation that performs a block-wide load.
43+
44+
Returns a callable object that can be linked to and invoked from device code. It can be
45+
invoked with the following signatures:
46+
47+
- `(src: numba.types.Array, dest: numba.types.Array) -> None`: Each thread loads
48+
`items_per_thread` items from `src` into `dest`. `dest` must contain at least
49+
`items_per_thread` items.
50+
51+
Different data movement strategies can be selected via the `algorithm` parameter:
52+
53+
- `algorithm="direct"` (default): A blocked arrangement of data is read directly from memory.
54+
- `algorithm="striped"`: A striped arrangement of data is read directly from memory.
55+
- `algorithm="vectorize"`: A blocked arrangement of data is read directly from memory using CUDA's built-in vectorized loads as a coalescing optimization.
56+
- `algorithm="transpose"`: A striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
57+
- `algorithm="warp_transpose"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
58+
- `algorithm="warp_transpose_timesliced"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement one warp at a time.
59+
60+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockLoad.html).
61+
62+
Args:
63+
dtype: Data type being loaded
64+
threads_per_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
65+
items_per_thread: The number of items each thread loads
66+
algorithm: The data movement algorithm to use
67+
68+
Example:
69+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
70+
each thread handling 4 integers.
71+
72+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
73+
:language: python
74+
:dedent:
75+
:start-after: example-begin imports
76+
:end-before: example-end imports
77+
78+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
79+
:language: python
80+
:dedent:
81+
:start-after: example-begin load_store
82+
:end-before: example-end load_store
83+
"""
84+
dim = normalize_dim_param(threads_per_block)
3985
template = Algorithm(
4086
"BlockLoad",
4187
"Load",
@@ -46,6 +92,8 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
4692
TemplateParameter("BLOCK_DIM_X"),
4793
TemplateParameter("ITEMS_PER_THREAD"),
4894
TemplateParameter("ALGORITHM"),
95+
TemplateParameter("BLOCK_DIM_Y"),
96+
TemplateParameter("BLOCK_DIM_Z"),
4997
],
5098
[
5199
[
@@ -58,9 +106,11 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
58106
specialization = template.specialize(
59107
{
60108
"T": dtype,
61-
"BLOCK_DIM_X": threads_per_block,
109+
"BLOCK_DIM_X": dim[0],
62110
"ITEMS_PER_THREAD": items_per_thread,
63111
"ALGORITHM": CUB_BLOCK_LOAD_ALGOS[algorithm],
112+
"BLOCK_DIM_Y": dim[1],
113+
"BLOCK_DIM_Z": dim[2],
64114
}
65115
)
66116
return Invocable(
@@ -74,6 +124,49 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
74124

75125

76126
def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
127+
"""Creates an operation that performs a block-wide store.
128+
129+
Returns a callable object that can be linked to and invoked from device code. It can be
130+
invoked with the following signatures:
131+
132+
- `(dest: numba.types.Array, src: numba.types.Array) -> None`: Each thread stores
133+
`items_per_thread` items from `src` into `dest`. `src` must contain at least
134+
`items_per_thread` items.
135+
136+
Different data movement strategies can be selected via the `algorithm` parameter:
137+
138+
- `algorithm="direct"` (default): A blocked arrangement of data is written directly to memory.
139+
- `algorithm="striped"`: A striped arrangement of data is written directly to memory.
140+
- `algorithm="vectorize"`: A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization.
141+
- `algorithm="transpose"`: A blocked arrangement is locally transposed into a striped arrangement which is then written to memory.
142+
- `algorithm="warp_transpose"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory.
143+
- `algorithm="warp_transpose_timesliced"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps.
144+
145+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockStore.html).
146+
147+
Args:
148+
dtype: Data type being stored
149+
threads_per_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
150+
items_per_thread: The number of items each thread stores
151+
algorithm: The data movement algorithm to use
152+
153+
Example:
154+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
155+
each thread handling 4 integers.
156+
157+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
158+
:language: python
159+
:dedent:
160+
:start-after: example-begin imports
161+
:end-before: example-end imports
162+
163+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
164+
:language: python
165+
:dedent:
166+
:start-after: example-begin load_store
167+
:end-before: example-end load_store
168+
"""
169+
dim = normalize_dim_param(threads_per_block)
77170
template = Algorithm(
78171
"BlockStore",
79172
"Store",
@@ -84,6 +177,8 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
84177
TemplateParameter("BLOCK_DIM_X"),
85178
TemplateParameter("ITEMS_PER_THREAD"),
86179
TemplateParameter("ALGORITHM"),
180+
TemplateParameter("BLOCK_DIM_Y"),
181+
TemplateParameter("BLOCK_DIM_Z"),
87182
],
88183
[
89184
[
@@ -96,9 +191,11 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
96191
specialization = template.specialize(
97192
{
98193
"T": dtype,
99-
"BLOCK_DIM_X": threads_per_block,
194+
"BLOCK_DIM_X": dim[0],
100195
"ITEMS_PER_THREAD": items_per_thread,
101196
"ALGORITHM": CUB_BLOCK_STORE_ALGOS[algorithm],
197+
"BLOCK_DIM_Y": dim[1],
198+
"BLOCK_DIM_Z": dim[2],
102199
}
103200
)
104201
return Invocable(

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_reduce.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ def reduce(dtype, threads_per_block, binary_op, items_per_thread=1, methods=None
2828
2929
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
3030
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
31-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
32-
first `items_per_thread` items will be included in the reduction.
31+
reduction. The array must contain at least `items_per_thread` items; only the first
32+
`items_per_thread` items will be included in the reduction.
3333
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
3434
single item to the reduction. The items contributed by all other threads are ignored.
3535
@@ -135,7 +135,7 @@ def sum(dtype, threads_per_block, items_per_thread=1, methods=None):
135135
136136
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
137137
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
138-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
138+
reduction. The array must contain at least `items_per_thread` items; only the
139139
first `items_per_thread` items will be included in the reduction.
140140
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
141141
single item to the reduction. The items contributed by all other threads are ignored.

python/cuda_cooperative/tests/helpers.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

55
import numpy as np
6-
from numba import types
6+
from numba import cuda, types
77

88
NUMBA_TYPES_TO_NP = {
99
types.int8: np.int8,
@@ -21,3 +21,14 @@
2121

2222
def random_int(shape, dtype):
2323
return np.random.randint(0, 128, size=shape).astype(dtype)
24+
25+
26+
@cuda.jit(device=True)
def row_major_tid():
    """Return the linear, row-major thread ID within the current block.

    Flattens (threadIdx.x, threadIdx.y, threadIdx.z) with x varying
    fastest: ``tid = z * dim.x * dim.y + y * dim.x + x``. Works for 1D,
    2D, and 3D thread blocks.
    """
    dim = cuda.blockDim
    idx = cuda.threadIdx
    # The `dim == 1` guards skip terms that are always zero (threadIdx
    # is 0 in any unit dimension), avoiding the multiplies for 1D/2D
    # blocks.
    return (
        (0 if dim.z == 1 else idx.z * dim.x * dim.y)
        + (0 if dim.y == 1 else idx.y * dim.x)
        + idx.x
    )

python/cuda_cooperative/tests/test_block_load.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_per_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_per_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_load(T, threads_per_block, items_per_thread, algorithm):
3235
block_load = cudax.block.load(T, threads_per_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_load.temp_storage_bytes
3437

38+
num_threads_per_block = (
39+
threads_per_block
40+
if type(threads_per_block) is int
41+
else reduce(mul, threads_per_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def output_index(i):
39-
return cuda.threadIdx.x + threads_per_block * i
48+
return row_major_tid() + num_threads_per_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def output_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_load.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
d_output[output_index(i)] = thread_data[i]
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_per_block * items_per_thread
64+
items_per_tile = num_threads_per_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
5+
# example-begin imports
6+
import numba
7+
import numpy as np
8+
from numba import cuda
9+
from pynvjitlink import patch
10+
11+
import cuda.cooperative.experimental as cudax
12+
13+
patch.patch_numba_linker(lto=True)
14+
# example-end imports
15+
16+
numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
17+
18+
19+
def test_block_load_store():
    """Round-trip test: a striped block load followed by a striped block
    store must reproduce the input array unchanged.

    NOTE: the code between the ``example-begin``/``example-end`` markers
    is pulled verbatim into the block load/store docs via Sphinx
    ``literalinclude`` — edit with care.
    """
    # example-begin load_store
    threads_per_block = 32
    items_per_thread = 4
    block_load = cudax.block.load(
        numba.int32, threads_per_block, items_per_thread, "striped"
    )
    block_store = cudax.block.store(
        numba.int32, threads_per_block, items_per_thread, "striped"
    )

    @cuda.jit(link=block_load.files + block_store.files)
    def kernel(input, output):
        tmp = cuda.local.array(items_per_thread, numba.int32)
        block_load(input, tmp)
        block_store(output, tmp)

    # example-end load_store

    # One full tile of random input; the kernel should copy it verbatim.
    h_input = np.random.randint(
        0, 42, threads_per_block * items_per_thread, dtype=np.int32
    )
    d_input = cuda.to_device(h_input)
    d_output = cuda.device_array_like(d_input)
    kernel[1, threads_per_block](d_input, d_output)
    h_output = d_output.copy_to_host()

    np.testing.assert_allclose(h_output, h_input)

python/cuda_cooperative/tests/test_block_store.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_per_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_per_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_store(T, threads_per_block, items_per_thread, algorithm):
3235
block_store = cudax.block.store(T, threads_per_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_store.temp_storage_bytes
3437

38+
num_threads_per_block = (
39+
threads_per_block
40+
if type(threads_per_block) is int
41+
else reduce(mul, threads_per_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def input_index(i):
39-
return cuda.threadIdx.x + threads_per_block * i
48+
return row_major_tid() + num_threads_per_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def input_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_store.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
block_store(temp_storage, d_output, thread_data)
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_per_block * items_per_thread
64+
items_per_tile = num_threads_per_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)

0 commit comments

Comments
 (0)