Add multi-dimensional support to block_radix_sort routines. (NVIDIA#4035)

tpn · davebayer · commit 8c7e72c70546 · 2025-04-07T10:04:46.000+02:00
* Implement CudaSharedMemConfig enum with supporting tests.

* Relocate CUB_BLOCK_SCAN_ALGOS to _common.py.

We will be using it from _block_radix_sort.py imminently.

* Add multi-dimensional support to block_radix_sort routines.
diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/_common.py b/python/cuda_cooperative/cuda/cooperative/experimental/_common.py
@@ -5,6 +5,7 @@
 import re
 import tempfile
 from collections import namedtuple
+from enum import Enum
 from typing import TYPE_CHECKING, Union
 
 # Import for type checking only
@@ -18,6 +19,27 @@
 dim3 = namedtuple("dim3", ("x", "y", "z"))
 
 
+CUB_BLOCK_SCAN_ALGOS = {
+    "raking": "::cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING",
+    "raking_memoize": "::cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE",
+    "warp_scans": "::cub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS",
+}
+
+
+class CudaSharedMemConfig(Enum):
+    """
+    CUDA shared memory configuration.  This is intended to mirror the C++
+    equivalent `cudaSharedMemConfig` enum.
+    """
+
+    BankSizeDefault = 0
+    BankSizeFourByte = 1
+    BankSizeEightByte = 2
+
+    def __str__(self):
+        return f"cudaSharedMem{self.name}"
+
+
 def make_binary_tempfile(content, suffix):
     tmp = tempfile.NamedTemporaryFile(mode="w+b", suffix=suffix, buffering=0)
     tmp.write(content)
diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/block/_block_radix_sort.py b/python/cuda_cooperative/cuda/cooperative/experimental/block/_block_radix_sort.py
@@ -2,10 +2,16 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from typing import TYPE_CHECKING, Tuple, Union
+
 import numba
 
 from cuda.cooperative.experimental._common import (
+    CUB_BLOCK_SCAN_ALGOS,
+    CudaSharedMemConfig,
+    dim3,
     make_binary_tempfile,
+    normalize_dim_param,
     normalize_dtype_param,
 )
 from cuda.cooperative.experimental._types import (
@@ -18,6 +24,112 @@
     Value,
 )
 
+if TYPE_CHECKING:
+    import numpy as np
+
+
+TEMPLATE_PARAMETERS = [
+    TemplateParameter("KeyT"),
+    TemplateParameter("BLOCK_DIM_X"),
+    TemplateParameter("ITEMS_PER_THREAD"),
+    TemplateParameter("ValueT"),
+    TemplateParameter("RADIX_BITS"),
+    TemplateParameter("MEMOIZE_OUTER_SCAN"),
+    TemplateParameter("INNER_SCAN_ALGORITHM"),
+    TemplateParameter("SMEM_CONFIG"),
+    TemplateParameter("BLOCK_DIM_Y"),
+    TemplateParameter("BLOCK_DIM_Z"),
+]
+
+
+METHOD_PARAMETERS_VARIANTS = [
+    [
+        Pointer(numba.uint8),
+        DependentArray(Dependency("KeyT"), Dependency("ITEMS_PER_THREAD")),
+    ],
+    [
+        Pointer(numba.uint8),
+        DependentArray(Dependency("KeyT"), Dependency("ITEMS_PER_THREAD")),
+        Value(numba.int32),
+        Value(numba.int32),
+    ],
+]
+
+
+# N.B. In order to support multi-dimensional block dimensions, we have to
+#      supply defaults for all the template parameters preceding the final
+#      Y and Z dimensions.  This will be improved in the future, allowing
+#      users to provide overrides for the default values.
+
+TEMPLATE_PARAMETER_DEFAULTS = {
+    "ValueT": "::cub::NullType",  # Indicates keys-only sort
+    "RADIX_BITS": 4,
+    "MEMOIZE_OUTER_SCAN": "true",
+    "INNER_SCAN_ALGORITHM": CUB_BLOCK_SCAN_ALGOS["warp_scans"],
+    "SMEM_CONFIG": str(CudaSharedMemConfig.BankSizeFourByte),
+}
+
+
+def _get_template_parameter_specializations(
+    dtype: numba.types.Type, dim: dim3, items_per_thread: int
+) -> dict:
+    """
+    Returns a dictionary of template parameter specializations for the block
+    radix sort algorithm.
+
+    Args:
+        dtype: Supplies the Numba data type.
+
+        dim: Supplies the block dimensions.
+
+        items_per_thread: Supplies the number of items each thread owns.
+
+    Returns:
+        A dictionary of template parameter specializations.
+    """
+    specialization = {
+        "KeyT": dtype,
+        "BLOCK_DIM_X": dim[0],
+        "ITEMS_PER_THREAD": items_per_thread,
+        "BLOCK_DIM_Y": dim[1],
+        "BLOCK_DIM_Z": dim[2],
+    }
+
+    specialization.update(TEMPLATE_PARAMETER_DEFAULTS)
+
+    return specialization
+
+
+def _radix_sort(
+    dtype: Union[str, type, "np.dtype", "numba.types.Type"],
+    threads_per_block: Union[int, Tuple[int, int], Tuple[int, int, int], dim3],
+    items_per_thread: int,
+    descending: bool,
+) -> Invocable:
+    dim = normalize_dim_param(threads_per_block)
+    dtype = normalize_dtype_param(dtype)
+
+    method_name = "SortDescending" if descending else "Sort"
+    template = Algorithm(
+        "BlockRadixSort",
+        method_name,
+        "block_radix_sort",
+        ["cub/block/block_radix_sort.cuh"],
+        TEMPLATE_PARAMETERS,
+        METHOD_PARAMETERS_VARIANTS,
+    )
+    specialization = template.specialize(
+        _get_template_parameter_specializations(dtype, dim, items_per_thread)
+    )
+    return Invocable(
+        temp_files=[
+            make_binary_tempfile(ltoir, ".ltoir")
+            for ltoir in specialization.get_lto_ir()
+        ],
+        temp_storage_bytes=specialization.get_temp_storage_bytes(),
+        algorithm=specialization,
+    )
+
 
 def radix_sort_keys(dtype, threads_per_block, items_per_thread):
     """Performs an ascending block-wide radix sort over a :ref:`blocked arrangement <flexible-data-arrangement>` of keys.
@@ -47,54 +159,17 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
         ``{ [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }``.
 
     Args:
-        dtype: Numba data type of the keys to be sorted
-        threads_per_block: The number of threads in a block
+        dtype: Data type of the keys to be sorted
+
+        threads_per_block: The number of threads in a block, either an integer
+            or a tuple of 2 or 3 integers
+
         items_per_thread: The number of items each thread owns
 
     Returns:
         A callable object that can be linked to and invoked from a CUDA kernel
     """
-    # Normalize the dtype parameter.
-    dtype = normalize_dtype_param(dtype)
-
-    template = Algorithm(
-        "BlockRadixSort",
-        "Sort",
-        "block_radix_sort",
-        ["cub/block/block_radix_sort.cuh"],
-        [
-            TemplateParameter("KeyT"),
-            TemplateParameter("BLOCK_DIM_X"),
-            TemplateParameter("ITEMS_PER_THREAD"),
-        ],
-        [
-            [
-                Pointer(numba.uint8),
-                DependentArray(Dependency("KeyT"), Dependency("ITEMS_PER_THREAD")),
-            ],
-            [
-                Pointer(numba.uint8),
-                DependentArray(Dependency("KeyT"), Dependency("ITEMS_PER_THREAD")),
-                Value(numba.int32),
-                Value(numba.int32),
-            ],
-        ],
-    )
-    specialization = template.specialize(
-        {
-            "KeyT": dtype,
-            "BLOCK_DIM_X": threads_per_block,
-            "ITEMS_PER_THREAD": items_per_thread,
-        }
-    )
-    return Invocable(
-        temp_files=[
-            make_binary_tempfile(ltoir, ".ltoir")
-            for ltoir in specialization.get_lto_ir()
-        ],
-        temp_storage_bytes=specialization.get_temp_storage_bytes(),
-        algorithm=specialization,
-    )
+    return _radix_sort(dtype, threads_per_block, items_per_thread, descending=False)
 
 
 def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
@@ -125,49 +200,14 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
         ``{ [511, 510, 509, 508], [507, 506, 505, 504], ..., [3, 2, 1, 0] }``.
 
     Args:
-        dtype: Numba data type of the keys to be sorted
-        threads_per_block: The number of threads in a block
+        dtype: Data type of the keys to be sorted
+
+        threads_per_block: The number of threads in a block, either an integer
+            or a tuple of 2 or 3 integers
+
         items_per_thread: The number of items each thread owns
 
     Returns:
         A callable object that can be linked to and invoked from a CUDA kernel
     """
-    template = Algorithm(
-        "BlockRadixSort",
-        "SortDescending",
-        "block_radix_sort",
-        ["cub/block/block_radix_sort.cuh"],
-        [
-            TemplateParameter("KeyT"),
-            TemplateParameter("BLOCK_DIM_X"),
-            TemplateParameter("ITEMS_PER_THREAD"),
-        ],
-        [
-            [
-                Pointer(numba.uint8),
-                DependentArray(Dependency("KeyT"), Dependency("ITEMS_PER_THREAD")),
-            ],
-            [
-                Pointer(numba.uint8),
-                DependentArray(Dependency("KeyT"), Dependency("ITEMS_PER_THREAD")),
-                Value(numba.int32),
-                Value(numba.int32),
-            ],
-        ],
-    )
-    specialization = template.specialize(
-        {
-            "KeyT": dtype,
-            "BLOCK_DIM_X": threads_per_block,
-            "ITEMS_PER_THREAD": items_per_thread,
-        }
-    )
-
-    return Invocable(
-        temp_files=[
-            make_binary_tempfile(ltoir, ".ltoir")
-            for ltoir in specialization.get_lto_ir()
-        ],
-        temp_storage_bytes=specialization.get_temp_storage_bytes(),
-        algorithm=specialization,
-    )
+    return _radix_sort(dtype, threads_per_block, items_per_thread, descending=True)
diff --git a/python/cuda_cooperative/cuda/cooperative/experimental/block/_block_scan.py b/python/cuda_cooperative/cuda/cooperative/experimental/block/_block_scan.py
@@ -7,6 +7,7 @@
 import numba
 
 from cuda.cooperative.experimental._common import (
+    CUB_BLOCK_SCAN_ALGOS,
     make_binary_tempfile,
     normalize_dtype_param,
 )
@@ -24,12 +25,6 @@
 if TYPE_CHECKING:
     import numpy as np
 
-CUB_BLOCK_SCAN_ALGOS = {
-    "raking": "::cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING",
-    "raking_memoize": "::cub::BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE",
-    "warp_scans": "::cub::BlockScanAlgorithm::BLOCK_SCAN_WARP_SCANS",
-}
-
 
 def _scan(
     dtype: Union[str, type, "np.dtype", "numba.types.Type"],
diff --git a/python/cuda_cooperative/tests/test_block_radix_sort.py b/python/cuda_cooperative/tests/test_block_radix_sort.py
@@ -2,9 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from functools import reduce
+from operator import mul
+
 import numba
 import pytest
-from helpers import NUMBA_TYPES_TO_NP, random_int
+from helpers import NUMBA_TYPES_TO_NP, random_int, row_major_tid
 from numba import cuda, types
 from pynvjitlink import patch
 
@@ -15,19 +18,26 @@
 
 
 @pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
-@pytest.mark.parametrize("threads_per_block", [32, 128, 256, 1024])
+@pytest.mark.parametrize("threads_per_block", [32, 128, 256, 1024, (4, 8), (2, 4, 8)])
 @pytest.mark.parametrize("items_per_thread", [1, 3])
 def test_block_radix_sort_descending(T, threads_per_block, items_per_thread):
     begin_bit = numba.int32(0)
     end_bit = numba.int32(T.bitwidth)
+
+    num_threads_per_block = (
+        threads_per_block
+        if type(threads_per_block) is int
+        else reduce(mul, threads_per_block)
+    )
+
     block_radix_sort = cudax.block.radix_sort_keys_descending(
         dtype=T, threads_per_block=threads_per_block, items_per_thread=items_per_thread
     )
     temp_storage_bytes = block_radix_sort.temp_storage_bytes
 
     @cuda.jit(link=block_radix_sort.files)
     def kernel(input, output):
-        tid = cuda.threadIdx.x
+        tid = row_major_tid()
         temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype="uint8")
         thread_data = cuda.local.array(shape=items_per_thread, dtype=dtype)
         for i in range(items_per_thread):
@@ -37,7 +47,7 @@ def kernel(input, output):
             output[tid * items_per_thread + i] = thread_data[i]
 
     dtype = NUMBA_TYPES_TO_NP[T]
-    items_per_tile = threads_per_block * items_per_thread
+    items_per_tile = num_threads_per_block * items_per_thread
     input = random_int(items_per_tile, dtype)
     d_input = cuda.to_device(input)
     d_output = cuda.device_array(items_per_tile, dtype=dtype)
@@ -57,10 +67,14 @@ def kernel(input, output):
 
 
 @pytest.mark.parametrize("T", [types.int8, types.int16, types.uint32, types.uint64])
-@pytest.mark.parametrize("threads_per_block", [32, 128, 256, 1024])
+@pytest.mark.parametrize("threads_per_block", [32, 128, 256, 1024, (4, 8), (2, 4, 8)])
 @pytest.mark.parametrize("items_per_thread", [1, 3])
 def test_block_radix_sort(T, threads_per_block, items_per_thread):
-    items_per_tile = threads_per_block * items_per_thread
+    items_per_tile = (
+        threads_per_block * items_per_thread
+        if type(threads_per_block) is int
+        else reduce(mul, threads_per_block) * items_per_thread
+    )
 
     block_radix_sort = cudax.block.radix_sort_keys(
         dtype=T, threads_per_block=threads_per_block, items_per_thread=items_per_thread
@@ -69,7 +83,7 @@ def test_block_radix_sort(T, threads_per_block, items_per_thread):
 
     @cuda.jit(link=block_radix_sort.files)
     def kernel(input, output):
-        tid = cuda.threadIdx.x
+        tid = row_major_tid()
         temp_storage = cuda.shared.array(shape=temp_storage_bytes, dtype="uint8")
         thread_data = cuda.local.array(shape=items_per_thread, dtype=dtype)
         for i in range(items_per_thread):
diff --git a/python/cuda_cooperative/tests/test_common.py b/python/cuda_cooperative/tests/test_common.py