|
5 | 5 |
|
6 | 6 | import numba |
7 | 7 |
|
8 | | -from cuda.cooperative.experimental._common import make_binary_tempfile, normalize_dim_param |
| 8 | +from cuda.cooperative.experimental._common import ( |
| 9 | + make_binary_tempfile, |
| 10 | + normalize_dim_param, |
| 11 | +) |
9 | 12 | from cuda.cooperative.experimental._types import ( |
10 | 13 | Algorithm, |
11 | 14 | Dependency, |
|
34 | 37 | "warp_transpose_timesliced": "::cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED", |
35 | 38 | } |
36 | 39 |
|
| 40 | + |
37 | 41 | def load(dtype, threads_in_block, items_per_thread=1, algorithm="direct"): |
| 42 | + """Creates an operation that performs a block-wide load. |
| 43 | +
|
| 44 | + Returns a callable object that can be linked to and invoked from device code. It can be |
| 45 | + invoked with the following signatures: |
| 46 | +
|
| 47 | + - `(src: numba.types.Array, dest: numba.types.Array) -> dtype`: Each thread loads |
| 48 | + `items_per_thread` items from `src` into `dest`. `dest` must contain at least |
| 49 | + `items_per_thread` items. |
| 50 | +
|
| 51 | + Different data movement strategies can be selected via the `algorithm` parameter: |
| 52 | +
|
| 53 | + - `algorithm="direct"` (default): A blocked arrangement of data is read directly from memory. |
| 54 | + - `algorithm="striped"`: A striped arrangement of data is read directly from memory. |
| 55 | + - `algorithm="vectorize"`: A blocked arrangement of data is read directly from memory using CUDA's built-in vectorized loads as a coalescing optimization. |
| 56 | + - `algorithm="transpose"`: A striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement. |
| 57 | + - `algorithm="warp_transpose"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement. |
| 58 | + - `algorithm="warp_transpose_timesliced"`: A warp-striped arrangement of data is read directly from memory and is then locally transposed into a blocked arrangement one warp at a time. |
| 59 | +
|
| 60 | + For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockLoad.html). |
| 61 | +
|
| 62 | + Args: |
| 63 | + dtype: Data type being loaded |
| 64 | + threads_in_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers |
| 65 | + items_per_thread: The number of items each thread loads |
| 66 | + algorithm: The data movement algorithm to use |
| 67 | +
|
| 68 | + Example: |
| 69 | + The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with |
| 70 | + each thread handling 4 integers. |
| 71 | +
|
| 72 | + .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py |
| 73 | + :language: python |
| 74 | + :dedent: |
| 75 | + :start-after: example-begin imports |
| 76 | + :end-before: example-end imports |
| 77 | +
|
| 78 | + .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py |
| 79 | + :language: python |
| 80 | + :dedent: |
| 81 | + :start-after: example-begin load_store |
| 82 | + :end-before: example-end load_store |
| 83 | + """ |
38 | 84 | dim = normalize_dim_param(threads_in_block) |
39 | 85 | template = Algorithm( |
40 | 86 | "BlockLoad", |
@@ -78,6 +124,48 @@ def load(dtype, threads_in_block, items_per_thread=1, algorithm="direct"): |
78 | 124 |
|
79 | 125 |
|
80 | 126 | def store(dtype, threads_in_block, items_per_thread=1, algorithm="direct"): |
| 127 | + """Creates an operation that performs a block-wide store. |
| 128 | +
|
| 129 | + Returns a callable object that can be linked to and invoked from device code. It can be |
| 130 | + invoked with the following signatures: |
| 131 | +
|
| 132 | + - `(dest: numba.types.Array, src: numba.types.Array) -> dtype`: Each thread stores |
| 133 | + `items_per_thread` items from `src` into `dest`. `src` must contain at least |
| 134 | + `items_per_thread` items. |
| 135 | +
|
| 136 | + Different data movement strategies can be selected via the `algorithm` parameter: |
| 137 | +
|
| 138 | + - `algorithm="direct"` (default): A blocked arrangement of data is written directly to memory. |
| 139 | + - `algorithm="striped"`: A striped arrangement of data is written directly to memory. |
| 140 | + - `algorithm="vectorize"`: A blocked arrangement of data is written directly to memory using CUDA's built-in vectorized stores as a coalescing optimization. |
| 141 | + - `algorithm="transpose"`: A blocked arrangement is locally transposed into a striped arrangement which is then written to memory. |
| 142 | + - `algorithm="warp_transpose"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory. |
| 143 | + - `algorithm="warp_transpose_timesliced"`: A blocked arrangement is locally transposed into a warp-striped arrangement which is then written to memory. To reduce the shared memory requirement, only one warp’s worth of shared memory is provisioned and is subsequently time-sliced among warps. |
| 144 | +
|
| 145 | + For more details, [read the corresponding CUB C++ documentation](https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockStore.html). |
| 146 | +
|
| 147 | + Args: |
| 148 | + dtype: Data type being stored |
| 149 | + threads_in_block: The number of threads in a block, either an integer or a tuple of 2 or 3 integers |
| 150 | + items_per_thread: The number of items each thread stores |
| 151 | + algorithm: The data movement algorithm to use |
| 152 | +
|
| 153 | + Example: |
| 154 | + The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with |
| 155 | + each thread handling 4 integers. |
| 156 | +
|
| 157 | + .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py |
| 158 | + :language: python |
| 159 | + :dedent: |
| 160 | + :start-after: example-begin imports |
| 161 | + :end-before: example-end imports |
| 162 | +
|
| 163 | + .. literalinclude:: ../../python/cuda_cooperative/tests/test_block_load_store_api.py |
| 164 | + :language: python |
| 165 | + :dedent: |
| 166 | + :start-after: example-begin load_store |
| 167 | + :end-before: example-end load_store |
| 168 | + """ |
81 | 169 | dim = normalize_dim_param(threads_in_block) |
82 | 170 | template = Algorithm( |
83 | 171 | "BlockStore", |
|
0 commit comments