Skip to content

Commit 3a70e49

Browse files
NaderAlAwar authored and davebayer committed
cuda.parallel: Add documentation for the current iterators along with examples and tests (NVIDIA#3311)
* Add tests demonstrating usage of different iterators
* Update documentation of reduce_into by merging import code snippet with the rest of the example
* Add documentation for current iterators
* Run pre-commit checks and update accordingly
* Fix comments to refer to the proper lines in the code snippets in the docs
1 parent 6b82e05 commit 3a70e49

File tree

3 files changed

+228
-21
lines changed

3 files changed

+228
-21
lines changed

python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -157,16 +157,7 @@ def reduce_into(
157157
"""Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``.
158158
159159
Example:
160-
The code snippet below illustrates a user-defined min-reduction of a
161-
device vector of ``int`` data elements.
162-
163-
.. literalinclude:: ../../python/cuda_parallel/tests/test_reduce_api.py
164-
:language: python
165-
:dedent:
166-
:start-after: example-begin imports
167-
:end-before: example-end imports
168-
169-
Below is the code snippet that demonstrates the usage of the ``reduce_into`` API:
160+
The code snippet below demonstrates the usage of the ``reduce_into`` API:
170161
171162
.. literalinclude:: ../../python/cuda_parallel/tests/test_reduce_api.py
172163
:language: python

python/cuda_parallel/cuda/parallel/experimental/iterators/__init__.py

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,22 @@ def CacheModifiedInputIterator(device_array, modifier):
99
Similar to https://nvidia.github.io/cccl/cub/api/classcub_1_1CacheModifiedInputIterator.html
1010
1111
Currently the only supported modifier is "stream" (LOAD_CS).
12+
13+
Example:
14+
The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
15+
16+
.. literalinclude:: ../../python/cuda_parallel/tests/test_reduce_api.py
17+
:language: python
18+
:dedent:
19+
:start-after: example-begin cache-iterator
20+
:end-before: example-end cache-iterator
21+
22+
Args:
23+
device_array: CUDA device array storing the input sequence of data items
24+
modifier: The PTX cache load modifier
25+
26+
Returns:
27+
A ``CacheModifiedInputIterator`` object initialized with ``device_array``
1228
"""
1329
if modifier != "stream":
1430
raise NotImplementedError("Only stream modifier is supported")
@@ -19,15 +35,74 @@ def CacheModifiedInputIterator(device_array, modifier):
1935

2036

2137
def ConstantIterator(value):
    """Build an Iterator whose every item is the same constant value.

    Counterpart of
    https://nvidia.github.io/cccl/thrust/api/classthrust_1_1constant__iterator.html

    Example:
        The snippet below uses a ``ConstantIterator`` that represents the
        sequence ``[10, 10, 10]``:

        .. literalinclude:: ../../python/cuda_parallel/tests/test_reduce_api.py
            :language: python
            :dedent:
            :start-after: example-begin constant-iterator
            :end-before: example-end constant-iterator

    Args:
        value: The value taken by every item of the sequence

    Returns:
        A ``ConstantIterator`` object initialized to ``value``
    """
    # Thin public wrapper: construction is delegated to the internal module.
    constant_it = _iterators.ConstantIterator(value)
    return constant_it
2459

2560

2661
def CountingIterator(offset):
    """Build an Iterator over a sequence of incrementing values.

    Counterpart of
    https://nvidia.github.io/cccl/thrust/api/classthrust_1_1counting__iterator.html

    Example:
        The snippet below uses a ``CountingIterator`` that represents the
        sequence ``[10, 11, 12]``:

        .. literalinclude:: ../../python/cuda_parallel/tests/test_reduce_api.py
            :language: python
            :dedent:
            :start-after: example-begin counting-iterator
            :end-before: example-end counting-iterator

    Args:
        offset: The first value of the sequence

    Returns:
        A ``CountingIterator`` object initialized to ``offset``
    """
    # Thin public wrapper: construction is delegated to the internal module.
    counting_it = _iterators.CountingIterator(offset)
    return counting_it
2983

3084

3185
def TransformIterator(it, op):
    """Build an Iterator that applies ``op`` to the items of another iterator.

    Counterpart of
    https://nvidia.github.io/cccl/cub/api/classcub_1_1TransformInputIterator.html

    Example:
        The snippet below composes a ``TransformIterator`` with a
        ``CountingIterator``: the sequence ``[10, 11, 12]`` has each item
        squared before the output is reduced:

        .. literalinclude:: ../../python/cuda_parallel/tests/test_reduce_api.py
            :language: python
            :dedent:
            :start-after: example-begin transform-iterator
            :end-before: example-end transform-iterator

    Args:
        it: The iterator object whose items are transformed
        op: The transform operation

    Returns:
        A ``TransformIterator`` object that transforms the items in ``it`` using ``op``
    """
    # Thin public wrapper: construction is delegated to the internal factory.
    transform_it = _iterators.make_transform_iterator(it, op)
    return transform_it

python/cuda_parallel/tests/test_reduce_api.py

Lines changed: 149 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,14 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

5-
# example-begin imports
6-
import cupy as cp
7-
import numpy as np
8-
9-
import cuda.parallel.experimental.algorithms as algorithms
10-
11-
# example-end imports
12-
135

146
def test_device_reduce():
157
# example-begin reduce-min
8+
import cupy as cp
9+
import numpy as np
10+
11+
import cuda.parallel.experimental.algorithms as algorithms
12+
1613
def min_op(a, b):
1714
return a if a < b else b
1815

@@ -37,3 +34,147 @@ def min_op(a, b):
3734
expected_output = 0
3835
assert (d_output == expected_output).all()
3936
# example-end reduce-min
37+
38+
39+
def test_cache_modified_input_iterator():
    """Sum a device array through a ``CacheModifiedInputIterator`` with ``reduce_into``.

    The region between the ``example-begin`` / ``example-end`` markers is
    pulled into the iterator documentation via ``literalinclude``, so the
    marker comments must stay exactly as written.
    """
    # example-begin cache-iterator
    import functools

    import cupy as cp
    import numpy as np

    import cuda.parallel.experimental.algorithms as algorithms
    import cuda.parallel.experimental.iterators as iterators

    def add_op(a, b):
        return a + b

    values = [8, 6, 7, 5, 3, 0, 9]
    d_input = cp.array(values, dtype=np.int32)

    iterator = iterators.CacheModifiedInputIterator(
        d_input, modifier="stream"
    )  # Input sequence
    h_init = np.array([0], dtype=np.int32)  # Initial value for the reduction
    d_output = cp.empty(1, dtype=np.int32)  # Storage for output

    # Instantiate reduction, determine storage requirements, and allocate storage
    reduce_into = algorithms.reduce_into(iterator, d_output, add_op, h_init)
    temp_storage_size = reduce_into(None, iterator, d_output, len(values), h_init)
    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

    # Run reduction
    reduce_into(d_temp_storage, iterator, d_output, len(values), h_init)

    expected_output = functools.reduce(lambda a, b: a + b, values)
    assert (d_output == expected_output).all()
    # example-end cache-iterator
73+
74+
75+
def test_constant_iterator():
    """Reduce a ``ConstantIterator`` with addition and compare against a host-side sum.

    The region between the ``example-begin`` / ``example-end`` markers is
    pulled into the iterator documentation via ``literalinclude``, so the
    marker comments must stay exactly as written.
    """
    # example-begin constant-iterator
    import functools

    import cupy as cp
    import numpy as np

    import cuda.parallel.experimental.algorithms as algorithms
    import cuda.parallel.experimental.iterators as iterators

    def add_op(a, b):
        return a + b

    value = 10
    num_items = 3

    # Input sequence
    input_it = iterators.ConstantIterator(np.int32(value))
    # Initial value for the reduction
    h_init = np.array([0], dtype=np.int32)
    # Storage for output
    d_output = cp.empty(1, dtype=np.int32)

    # Instantiate reduction
    reducer = algorithms.reduce_into(input_it, d_output, add_op, h_init)

    # Determine storage requirements, and allocate storage
    temp_storage_size = reducer(None, input_it, d_output, num_items, h_init)
    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

    # Run reduction
    reducer(d_temp_storage, input_it, d_output, num_items, h_init)

    # Host-side reference: the same operator folded over the same sequence
    expected_output = functools.reduce(add_op, [value] * num_items)
    assert (d_output == expected_output).all()
    # example-end constant-iterator
106+
107+
108+
def test_counting_iterator():
    """Reduce a ``CountingIterator`` with addition and compare against a host-side sum.

    The region between the ``example-begin`` / ``example-end`` markers is
    pulled into the iterator documentation via ``literalinclude``, so the
    marker comments must stay exactly as written.
    """
    # example-begin counting-iterator
    import functools

    import cupy as cp
    import numpy as np

    import cuda.parallel.experimental.algorithms as algorithms
    import cuda.parallel.experimental.iterators as iterators

    def add_op(a, b):
        return a + b

    first_item = 10
    num_items = 3

    # Input sequence
    input_it = iterators.CountingIterator(np.int32(first_item))
    # Initial value for the reduction
    h_init = np.array([0], dtype=np.int32)
    # Storage for output
    d_output = cp.empty(1, dtype=np.int32)

    # Instantiate reduction
    reducer = algorithms.reduce_into(input_it, d_output, add_op, h_init)

    # Determine storage requirements, and allocate storage
    temp_storage_size = reducer(None, input_it, d_output, num_items, h_init)
    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

    # Run reduction
    reducer(d_temp_storage, input_it, d_output, num_items, h_init)

    # Host-side reference: the same operator folded over the same sequence
    host_sequence = range(first_item, first_item + num_items)
    expected_output = functools.reduce(add_op, host_sequence)
    assert (d_output == expected_output).all()
    # example-end counting-iterator
141+
142+
143+
def test_transform_iterator():
    """Reduce a squaring ``TransformIterator`` and compare against a host-side sum of squares.

    The region between the ``example-begin`` / ``example-end`` markers is
    pulled into the iterator documentation via ``literalinclude``, so the
    marker comments must stay exactly as written.
    """
    # example-begin transform-iterator
    import functools

    import cupy as cp
    import numpy as np

    import cuda.parallel.experimental.algorithms as algorithms
    import cuda.parallel.experimental.iterators as iterators

    def add_op(a, b):
        return a + b

    def square_op(a):
        return a**2

    first_item = 10
    num_items = 3

    # Input sequence: a counting sequence with every item squared
    counting_it = iterators.CountingIterator(np.int32(first_item))
    input_it = iterators.TransformIterator(counting_it, square_op)
    # Initial value for the reduction
    h_init = np.array([0], dtype=np.int32)
    # Storage for output
    d_output = cp.empty(1, dtype=np.int32)

    # Instantiate reduction
    reducer = algorithms.reduce_into(input_it, d_output, add_op, h_init)

    # Determine storage requirements, and allocate storage
    temp_storage_size = reducer(None, input_it, d_output, num_items, h_init)
    d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

    # Run reduction
    reducer(d_temp_storage, input_it, d_output, num_items, h_init)

    # Host-side reference: square each item, then fold with the same operator
    host_squares = map(square_op, range(first_item, first_item + num_items))
    expected_output = functools.reduce(add_op, host_squares)
    assert (d_output == expected_output).all()
    # example-end transform-iterator

0 commit comments

Comments
 (0)