Skip to content

Commit ae7b656

Browse files
committed
[cuda.cooperative] Add tests for multidimensional block loads and stores and add
documentation for block loads and stores.
1 parent 1eb8d81 commit ae7b656

File tree

7 files changed

+180
-15
lines changed

7 files changed

+180
-15
lines changed

python/cuda_cooperative/cuda/cooperative/experimental/_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def find_dim3(name, txt):
7070
find_unsigned(f"{name}_z", txt),
7171
)
7272

73+
7374
def normalize_dim_param(dim):
7475
x = dim[0] if type(dim) is not int else dim
7576
y = dim[1] if type(dim) is not int and len(dim) >= 2 else 1

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_load_store.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
import numba
77

8-
from cuda.cooperative.experimental._common import make_binary_tempfile, normalize_dim_param
8+
from cuda.cooperative.experimental._common import (
9+
make_binary_tempfile,
10+
normalize_dim_param,
11+
)
912
from cuda.cooperative.experimental._types import (
1013
Algorithm,
1114
Dependency,
@@ -34,7 +37,50 @@
3437
"warp_transpose_timesliced": "::cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED",
3538
}
3639

40+
3741
def load(dtype, threads_in_block, items_per_thread=1, algorithm="direct"):
42+
"""Creates an operation that performs a block-wide load.
43+
44+
Returns a callable object that can be linked to and invoked from device code. It can be
45+
invoked with the following signatures:
46+
47+
- `(src: numba.types.Array, dest: numba.types.Array) -> dtype`: Each thread loads
48+
`items_per_thread` items from `src` into `dest`. `dest` must contain at least
49+
`items_per_thread` items.
50+
51+
Different data movement strategies can be selected via the `algorithm` parameter:
52+
53+
- `algorithm="direct"` (default): A blocked arrangement of data is read directly from memory.
54+
- `algorithm="striped"`: A striped arrangement of data is read directly from memory.
55+
- `algorithm="vectorize"`: A blocked arrangement of data is read directly from memory using CUDA's built-in vectorized loads as a coalescing optimization.
56+
- `algorithm="transpose"`: A striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
57+
- `algorithm="warp_transpose"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement.
58+
- `algorithm="warp_transpose_timesliced"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement one warp at a time.
59+
60+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockLoad.html).
61+
62+
Args:
63+
dtype: Data type being loaded
64+
threads_in_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
65+
items_per_thread: The number of items each thread loads
66+
algorithm: The data movement algorithm to use
67+
68+
Example:
69+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
70+
each thread handling 4 integers.
71+
72+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
73+
:language: python
74+
:dedent:
75+
:start-after: example-begin imports
76+
:end-before: example-end imports
77+
78+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
79+
:language: python
80+
:dedent:
81+
:start-after: example-begin load_store
82+
:end-before: example-end load_store
83+
"""
3884
dim = normalize_dim_param(threads_in_block)
3985
template = Algorithm(
4086
"BlockLoad",
@@ -78,6 +124,48 @@ def load(dtype, threads_in_block, items_per_thread=1, algorithm="direct"):
78124

79125

80126
def store(dtype, threads_in_block, items_per_thread=1, algorithm="direct"):
127+
"""Creates an operation that performs a block-wide store.
128+
129+
Returns a callable object that can be linked to and invoked from device code. It can be
130+
invoked with the following signatures:
131+
132+
- `(dest: numba.types.Array, src: numba.types.Array) -> dtype`: Each thread stores
133+
`items_per_thread` items from `src` into `dest`. `src` must contain at least
134+
`items_per_thread` items.
135+
136+
Different data movement strategies can be selected via the `algorithm` parameter:
137+
138+
- `algorithm="direct"` (default): A blocked arrangement of data is written directly to memory.
139+
- `algorithm="striped"`: A striped arrangement of data is written directly to memory.
140+
- `algorithm="vectorize"`: A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization.
141+
- `algorithm="transpose"`: A blocked arrangement is locally transposed into a striped arrangement which is then written to memory.
142+
- `algorithm="warp_transpose"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory.
143+
- `algorithm="warp_transpose_timesliced"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory. To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently time-sliced among warps.
144+
145+
For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockStore.html).
146+
147+
Args:
148+
dtype: Data type being stored
149+
threads_in_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers
150+
items_per_thread: The number of items each thread stores
151+
algorithm: The data movement algorithm to use
152+
153+
Example:
154+
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
155+
each thread handling 4 integers.
156+
157+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
158+
:language: python
159+
:dedent:
160+
:start-after: example-begin imports
161+
:end-before: example-end imports
162+
163+
.. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py
164+
:language: python
165+
:dedent:
166+
:start-after: example-begin load_store
167+
:end-before: example-end load_store
168+
"""
81169
dim = normalize_dim_param(threads_in_block)
82170
template = Algorithm(
83171
"BlockStore",

python/cuda_cooperative/cuda/cooperative/experimental/block/_block_reduce.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ def reduce(dtype, threads_in_block, binary_op, items_per_thread=1, methods=None)
2828
2929
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
3030
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
31-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
32-
first `items_per_thread` items will be included in the reduction.
31+
reduction. The array must contain at least `items_per_thread` items; only the first
32+
`items_per_thread` items will be included in the reduction.
3333
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
3434
single item to the reduction. The items contributed by all other threads are ignored.
3535
@@ -135,7 +135,7 @@ def sum(dtype, threads_in_block, items_per_thread=1, methods=None):
135135
136136
- `(item: dtype) -> dtype)`: Each thread contributes a single item to the reduction.
137137
- `(items: numba.types.Array) -> dtype`: Each thread contributes an array of items to the
138-
reduction. The array must be 1D and contain at least `items_per_thread` items; only the
138+
reduction. The array must contain at least `items_per_thread` items; only the
139139
first `items_per_thread` items will be included in the reduction.
140140
- `(item: dtype, num_valid: int) -> dtype`: The first `num_valid` threads contribute a
141141
single item to the reduction. The items contributed by all other threads are ignored.

python/cuda_cooperative/tests/helpers.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

55
import numpy as np
6-
from numba import types
6+
from numba import cuda, types
77

88
NUMBA_TYPES_TO_NP = {
99
types.int8: np.int8,
@@ -21,3 +21,14 @@
2121

2222
def random_int(shape, dtype):
2323
return np.random.randint(0, 128, size=shape).astype(dtype)
24+
25+
26+
@cuda.jit(device=True)
27+
def row_major_tid():
28+
dim = cuda.blockDim
29+
idx = cuda.threadIdx
30+
return (
31+
(0 if dim.z == 1 else idx.z * dim.x * dim.y)
32+
+ (0 if dim.y == 1 else idx.y * dim.x)
33+
+ idx.x
34+
)

python/cuda_cooperative/tests/test_block_load.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_in_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_in_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_load(T, threads_in_block, items_per_thread, algorithm):
3235
block_load = cudax.block.load(T, threads_in_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_load.temp_storage_bytes
3437

38+
num_threads_in_block = (
39+
threads_in_block
40+
if type(threads_in_block) is int
41+
else reduce(mul, threads_in_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def output_index(i):
39-
return cuda.threadIdx.x + threads_in_block * i
48+
return row_major_tid() + num_threads_in_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def output_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_load.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
d_output[output_index(i)] = thread_data[i]
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_in_block * items_per_thread
64+
items_per_tile = num_threads_in_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
5+
# example-begin imports
6+
import numba
7+
import numpy as np
8+
from numba import cuda
9+
from pynvjitlink import patch
10+
11+
import cuda.cooperative.experimental as cudax
12+
13+
patch.patch_numba_linker(lto=True)
14+
# example-end imports
15+
16+
numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
17+
18+
19+
def test_block_load_store():
20+
# example-begin load_store
21+
threads_in_block = 32
22+
items_per_thread = 4
23+
block_load = cudax.block.load(
24+
numba.int32, threads_in_block, items_per_thread, "striped"
25+
)
26+
block_store = cudax.block.store(
27+
numba.int32, threads_in_block, items_per_thread, "striped"
28+
)
29+
30+
@cuda.jit(link=block_load.files + block_store.files)
31+
def kernel(input, output):
32+
tmp = cuda.local.array(items_per_thread, numba.int32)
33+
block_load(input, tmp)
34+
cuda.syncthreads()
35+
block_store(output, tmp)
36+
37+
# example-end load_store
38+
39+
h_input = np.random.randint(
40+
0, 42, threads_in_block * items_per_thread, dtype=np.int32
41+
)
42+
d_input = cuda.to_device(h_input)
43+
d_output = cuda.device_array_like(d_input)
44+
kernel[1, threads_in_block](d_input, d_output)
45+
h_output = d_output.copy_to_host()
46+
47+
np.testing.assert_allclose(h_output, h_input)

python/cuda_cooperative/tests/test_block_store.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5+
from functools import reduce
6+
from operator import mul
7+
58
import numba
69
import pytest
7-
from helpers import NUMBA_TYPES_TO_NP, random_int
10+
from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
811
from numba import cuda, types
912
from pynvjitlink import patch
1013

@@ -15,7 +18,7 @@
1518

1619

1720
@pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
18-
@pytest.mark.parametrize("threads_in_block", [32, 128, 256])
21+
@pytest.mark.parametrize("threads_in_block", [32, 128, 256, (4, 8), (2, 4, 8)])
1922
@pytest.mark.parametrize("items_per_thread", [1, 3])
2023
@pytest.mark.parametrize(
2124
"algorithm",
@@ -32,16 +35,22 @@ def test_block_store(T, threads_in_block, items_per_thread, algorithm):
3235
block_store = cudax.block.store(T, threads_in_block, items_per_thread, algorithm)
3336
temp_storage_bytes = block_store.temp_storage_bytes
3437

38+
num_threads_in_block = (
39+
threads_in_block
40+
if type(threads_in_block) is int
41+
else reduce(mul, threads_in_block)
42+
)
43+
3544
if algorithm == "striped":
3645

3746
@cuda.jit(device=True)
3847
def input_index(i):
39-
return cuda.threadIdx.x + threads_in_block * i
48+
return row_major_tid() + num_threads_in_block * i
4049
else:
4150

4251
@cuda.jit(device=True)
4352
def input_index(i):
44-
return cuda.threadIdx.x * items_per_thread + i
53+
return row_major_tid() * items_per_thread + i
4554

4655
@cuda.jit(link=block_store.files)
4756
def kernel(d_input, d_output):
@@ -52,7 +61,7 @@ def kernel(d_input, d_output):
5261
block_store(temp_storage, d_output, thread_data)
5362

5463
dtype = NUMBA_TYPES_TO_NP[T]
55-
items_per_tile = threads_in_block * items_per_thread
64+
items_per_tile = num_threads_in_block * items_per_thread
5665
h_input = random_int(items_per_tile, dtype)
5766
d_input = cuda.to_device(h_input)
5867
d_output = cuda.device_array(items_per_tile, dtype=dtype)

0 commit comments

Comments
 (0)