From ed53f8266210bad6b87dcf15af8349631a643077 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Mon, 29 Jul 2024 17:09:55 +0000 Subject: [PATCH 01/17] Add host-bulk for_each API for static_map --- .../cuco/detail/open_addressing/kernels.cuh | 41 ++++++ .../open_addressing/open_addressing_impl.cuh | 67 ++++++++++ include/cuco/detail/static_map/static_map.inl | 64 +++++++++ .../cuco/detail/static_map/static_map_ref.inl | 73 +++++++++++ include/cuco/static_map.cuh | 70 ++++++++++ tests/CMakeLists.txt | 1 + tests/static_map/for_each_test.cu | 122 ++++++++++++++++++ 7 files changed, 438 insertions(+) create mode 100644 tests/static_map/for_each_test.cu diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 266335a50..649165486 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -182,6 +182,47 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, } } +/** + * @brief Asynchronously executes a callback on every element in the container whose key matches + * with a key from the input key sequence. + * + * @note Passes an un-incrementable input iterator to the element whose key matches with + * a key from the input key sequence to the callback. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the `key_type` of the data structure + * @tparam CallbackOp Unary callback functor or device lambda + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param callback_op Function to call on every element found in the container + * @param ref Non-owning container device ref used to access the slot storage + */ +template +CUCO_KERNEL __launch_bounds__(BlockSize) void for_each(InputIt first, + cuco::detail::index_type n, + CallbackOp callback_op, + Ref ref) +{ + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename std::iterator_traits::value_type const& key{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.for_each(key, callback_op); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.for_each(tile, key, callback_op); + } + idx += loop_stride; + } +} + /** * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data * structure if `pred` of the corresponding stencil returns true. diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 9dabff990..87215e207 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -681,6 +681,73 @@ class open_addressing_impl { return output_begin + h_num_out; } + /** + * @brief Executes a callback on every filled element in the container. + * + * @note Passes an un-incrementable input iterator to the element whose key is filled + * to the callback. + * + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param callback_op Function to call on every filled element in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) + { + using const_iterator = typename storage_ref_type::const_iterator; + + auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + this->empty_key_sentinel(), this->erased_key_sentinel()}; + + thrust::for_each( + thrust::cuda::par.on(stream.get()), + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(this->capacity()), + [callback_op, is_filled, storage_ = this->storage_ref()] __device__(auto const idx) { + auto const window_idx = idx / storage_ref_type::window_size; + auto const intra_idx = idx % storage_ref_type::window_size; + auto const slot_ptr = const_iterator{&(storage_[window_idx][intra_idx])}; + + if (is_filled(*slot_ptr)) { callback_op(slot_ptr); } + }); + } + + /** + * @brief Asynchronously executes a callback on every element in the container whose key matches + * with a key from the input key sequence. + * + * @note Passes an un-incrementable input iterator to the element whose key matches with + * a key from the input key sequence to the callback. + * + * @tparam InputIt Device accessible random access input iterator whose `value_type` is + * convertible to key type of the map. + * @tparam CallbackOp Unary callback functor or device lambda + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to call on every element found in the container + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + Ref container_ref, + cuda::stream_ref stream) + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + detail::for_each + <<>>( + first, num_keys, std::forward(callback_op), container_ref); + } + /** * @brief Gets the number of elements in the container * diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index e2f7aae71..ea3ff9e28 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -143,6 +143,70 @@ void static_mapclear_async(stream); } +template +template +void static_map::for_each( + CallbackOp&& callback_op, cuda::stream_ref stream) +{ + impl_->for_each_async(std::forward(callback_op), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + CallbackOp&& callback_op, cuda::stream_ref stream) +{ + impl_->for_each_async(std::forward(callback_op), stream); +} + +template +template +void static_map::for_each( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); +} + template +class operator_impl< + op::for_each_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Executes a callback on every element in the container with key equivalent to the probe + * key. + * + * @note Passes an un-incrementable input iterator to the element whose key is equivalent to + * `key` to the callback. + * + * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param key The key to search for + * @param callback_op Function to call on every element found + */ + template + __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + ref_.impl_.for_each(key, std::forward(callback_op)); + } + + /** + * @brief Executes a callback on every element in the container with key equivalent to the probe + * key. + * + * @note Passes an un-incrementable input iterator to the element whose key is equivalent to + * `key` to the callback. + * + * @note This function uses cooperative group semantics, meaning that any thread may call the + * callback if it finds a matching element. If multiple elements are found within the same group, + * each thread with a match will call the callback with its associated element. + * + * @note Synchronizing `group` within `callback_op` is undefined behavior. + * + * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param callback_op Function to call on every element found + */ + template + __device__ void for_each(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + CallbackOp&& callback_op) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + ref_.impl_.for_each(group, key, std::forward(callback_op)); + } +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 0ef12ea5d..a22742a4d 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -279,6 +279,76 @@ class static_map { */ void clear_async(cuda::stream_ref stream = {}) noexcept; + /** + * @brief Executes a callback on every filled element in the container. + * + * @note Passes an un-incrementable input iterator to the element whose key is filled + * + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param callback_op Function to call on every filled element in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously executes a callback on every filled element in the container. + * + * @note Passes an un-incrementable input iterator to the element whose key is filled + * + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param callback_op Function to call on every filled element in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}); + + /** + * @brief Executes a callback on every element in the container whose key matches with + * a key from the input key sequence. + * + * @note Passes an un-incrementable input iterator to the element whose key matches with + * a key from the input key sequence to the callback. + * + * @tparam InputIt Device accessible random access input iterator whose `value_type` is + * convertible to key type of the map. + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to call on every element found in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously executes a callback on every element in the container whose key matches + * with a key from the input key sequence. + * + * @note Passes an un-incrementable input iterator to the element whose key matches with + * a key from the input key sequence to the callback. + * + * @tparam InputIt Device accessible random access input iterator whose `value_type` is + * convertible to key type of the map. + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to call on every element found in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}); + /** * @brief Inserts all keys in the range `[first, last)` and returns the number of successful * insertions. diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 06321dea6..639c7f3eb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -77,6 +77,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/custom_type_test.cu static_map/duplicate_keys_test.cu static_map/erase_test.cu + static_map/for_each_test.cu static_map/hash_test.cu static_map/heterogeneous_lookup_test.cu static_map/insert_and_find_test.cu diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu new file mode 100644 index 000000000..2b5d50b7d --- /dev/null +++ b/tests/static_map/for_each_test.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +using size_type = std::size_t; + +template +void test_for_each(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + REQUIRE(num_keys % 2 == 0); + + // Insert pairs + auto pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type>([] __device__(auto i) { + // use payload as 1 for even keys and 2 for odd keys + return cuco::pair{i, i % 2 + 1}; + })); + + cuda::stream_ref stream{}; + + map.insert(pairs_begin, pairs_begin + num_keys, stream); + + using Allocator = cuco::cuda_allocator>; + cuco::detail::counter_storage counter_storage( + Allocator{}); + counter_storage.reset(stream); + + // count all the keys which are even and whose payload has value 1 + map.for_each( + [counter = counter_storage.data()] __device__(auto const slot_ptr) { + auto const& [key, value] = *slot_ptr; + if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } + }, + stream); + + auto const res = counter_storage.load_to_host(stream); + REQUIRE(res == num_keys / 2); + + counter_storage.reset(stream); + + map.for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(2 * num_keys), // test for false-positives + [counter = counter_storage.data()] __device__(auto const slot_ptr) { + auto const& [key, value] = *slot_ptr; + if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } + }, + stream); + REQUIRE(res == num_keys / 2); +} + +TEMPLATE_TEST_CASE_SIG( + "static_map for_each tests", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{100}; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; + + using map_type = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_for_each(map, num_keys); +} From ded6180e14564ff1e88a2a9022d5e0dbda85b7b8 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Mon, 29 Jul 2024 20:27:11 +0000 Subject: [PATCH 02/17] use par_nosync execution policy --- include/cuco/detail/open_addressing/open_addressing_impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 87215e207..09d7471ba 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -701,7 +701,7 @@ class open_addressing_impl { this->empty_key_sentinel(), this->erased_key_sentinel()}; thrust::for_each( - thrust::cuda::par.on(stream.get()), + thrust::cuda::par_nosync.on(stream.get()), thrust::make_counting_iterator(static_cast(0)), thrust::make_counting_iterator(this->capacity()), [callback_op, is_filled, storage_ = this->storage_ref()] __device__(auto const idx) { From a845c2ae8609d16be38fc9154c0de2ebf09cfa0e Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 31 Jul 2024 18:23:47 +0000 Subject: [PATCH 03/17] pass copy of the slot to the callback_op --- .../open_addressing/open_addressing_impl.cuh | 11 +++++------ .../open_addressing_ref_impl.cuh | 18 +++++++++--------- .../cuco/detail/static_map/static_map_ref.inl | 4 ++-- .../static_multiset/static_multiset_ref.inl | 12 ++++++------ include/cuco/static_map.cuh | 10 +++++----- tests/static_map/for_each_test.cu | 8 ++++---- tests/static_multiset/for_each_test.cu | 12 ++++++------ 7 files changed, 37 insertions(+), 38 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 09d7471ba..383e5cd00 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -684,8 +684,7 @@ class open_addressing_impl { /** * @brief Executes a callback on every filled element in the container. * - * @note Passes an un-incrementable input iterator to the element whose key is filled - * to the callback. + * @note Passes a copy of the filled element to the callback. * * @tparam CallbackOp Unary callback functor or device lambda * @@ -707,9 +706,9 @@ class open_addressing_impl { [callback_op, is_filled, storage_ = this->storage_ref()] __device__(auto const idx) { auto const window_idx = idx / storage_ref_type::window_size; auto const intra_idx = idx % storage_ref_type::window_size; - auto const slot_ptr = const_iterator{&(storage_[window_idx][intra_idx])}; + auto const slot = storage_[window_idx][intra_idx]; - if (is_filled(*slot_ptr)) { callback_op(slot_ptr); } + if (is_filled(slot)) { callback_op(slot); } }); } @@ -717,8 +716,8 @@ class open_addressing_impl { * @brief Asynchronously executes a callback on every element in the container whose key matches * with a key from the input key sequence. * - * @note Passes an un-incrementable input iterator to the element whose key matches with - * a key from the input key sequence to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @tparam InputIt Device accessible random access input iterator whose `value_type` is * convertible to key type of the map. diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 5396f318a..2e8f038a6 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -969,8 +969,8 @@ class open_addressing_ref_impl { * @brief Executes a callback on every element in the container with key equivalent to the probe * key. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @tparam ProbeKey Input type which is convertible to 'key_type' * @tparam CallbackOp Unary callback functor or device lambda @@ -995,7 +995,7 @@ class open_addressing_ref_impl { return; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: continue; @@ -1009,8 +1009,8 @@ class open_addressing_ref_impl { * @brief Executes a callback on every element in the container with key equivalent to the probe * key. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -1045,7 +1045,7 @@ class open_addressing_ref_impl { continue; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: { @@ -1064,8 +1064,8 @@ class open_addressing_ref_impl { * key and can additionally perform work that requires synchronizing the Cooperative Group * performing this operation. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -1108,7 +1108,7 @@ class open_addressing_ref_impl { continue; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: { diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 6a7e4a0a1..ca5567f3c 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -1010,8 +1010,8 @@ class operator_impl< * @brief Executes a callback on every element in the container with key equivalent to the probe * key. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @tparam ProbeKey Input type which is convertible to 'key_type' * @tparam CallbackOp Unary callback functor or device lambda diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl index d08e50f0e..0ba47fbb8 100644 --- a/include/cuco/detail/static_multiset/static_multiset_ref.inl +++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl @@ -469,8 +469,8 @@ class operator_impl< * @brief Executes a callback on every element in the container with key equivalent to the probe * key. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @tparam ProbeKey Input type which is convertible to 'key_type' * @tparam CallbackOp Unary callback functor or device lambda @@ -490,8 +490,8 @@ class operator_impl< * @brief Executes a callback on every element in the container with key equivalent to the probe * key. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -521,8 +521,8 @@ class operator_impl< * key and can additionally perform work that requires synchronizing the Cooperative Group * performing this operation. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index a22742a4d..fd434f175 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -282,7 +282,7 @@ class static_map { /** * @brief Executes a callback on every filled element in the container. * - * @note Passes an un-incrementable input iterator to the element whose key is filled + * @note Passes a copy of the filled element to the callback. * * @tparam CallbackOp Unary callback functor or device lambda * @@ -309,8 +309,8 @@ class static_map { * @brief Executes a callback on every element in the container whose key matches with * a key from the input key sequence. * - * @note Passes an un-incrementable input iterator to the element whose key matches with - * a key from the input key sequence to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @tparam InputIt Device accessible random access input iterator whose `value_type` is * convertible to key type of the map. @@ -331,8 +331,8 @@ class static_map { * @brief Asynchronously executes a callback on every element in the container whose key matches * with a key from the input key sequence. * - * @note Passes an un-incrementable input iterator to the element whose key matches with - * a key from the input key sequence to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @tparam InputIt Device accessible random access input iterator whose `value_type` is * convertible to key type of the map. diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu index 2b5d50b7d..1c72a2e58 100644 --- a/tests/static_map/for_each_test.cu +++ b/tests/static_map/for_each_test.cu @@ -55,8 +55,8 @@ void test_for_each(Map& map, size_type num_keys) // count all the keys which are even and whose payload has value 1 map.for_each( - [counter = counter_storage.data()] __device__(auto const slot_ptr) { - auto const& [key, value] = *slot_ptr; + [counter = counter_storage.data()] __device__(auto const slot) { + auto const& [key, value] = slot; if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } }, stream); @@ -69,8 +69,8 @@ void test_for_each(Map& map, size_type num_keys) map.for_each( thrust::counting_iterator(0), thrust::counting_iterator(2 * num_keys), // test for false-positives - [counter = counter_storage.data()] __device__(auto const slot_ptr) { - auto const& [key, value] = *slot_ptr; + [counter = counter_storage.data()] __device__(auto const slot) { + auto const& [key, value] = slot; if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } }, stream); diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu index 1872586b7..b987ba660 100644 --- a/tests/static_multiset/for_each_test.cu +++ b/tests/static_multiset/for_each_test.cu @@ -45,8 +45,8 @@ CUCO_KERNEL void for_each_check_scalar(Ref ref, while (idx < n) { auto const& key = *(first + idx); std::size_t matches = 0; - ref.for_each(key, [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { matches++; } + ref.for_each(key, [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { matches++; } }); if (matches != multiplicity) { error_counter->fetch_add(1, cuda::memory_order_relaxed); } idx += loop_stride; @@ -73,13 +73,13 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref, ref.for_each( tile, key, - [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { thread_matches++; } + [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { thread_matches++; } }, [] __device__(auto const& group) { group.sync(); }); } else { - ref.for_each(tile, key, [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { thread_matches++; } + ref.for_each(tile, key, [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { thread_matches++; } }); } auto const tile_matches = From f867a0a153ffcca29edb1adaced7169173f7522f Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 31 Jul 2024 21:13:36 +0000 Subject: [PATCH 04/17] Replace thrust::for_each with cub::DeviceFor::ForEachN --- .../open_addressing/open_addressing_impl.cuh | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 383e5cd00..23f4e4c3e 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -694,22 +695,39 @@ class open_addressing_impl { template void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) { - using const_iterator = typename storage_ref_type::const_iterator; + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto temp_allocator = temp_allocator_type{this->allocator()}; auto const is_filled = open_addressing_ns::detail::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; - thrust::for_each( - thrust::cuda::par_nosync.on(stream.get()), - thrust::make_counting_iterator(static_cast(0)), - thrust::make_counting_iterator(this->capacity()), + auto const op = [callback_op, is_filled, storage_ = this->storage_ref()] __device__(auto const idx) { auto const window_idx = idx / storage_ref_type::window_size; auto const intra_idx = idx % storage_ref_type::window_size; auto const slot = storage_[window_idx][intra_idx]; if (is_filled(slot)) { callback_op(slot); } - }); + }; + + CUCO_CUDA_TRY(cub::DeviceFor::ForEachN(nullptr, + temp_storage_bytes, + thrust::make_counting_iterator(static_cast(0)), + this->capacity(), + op, + stream.get())); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceFor::ForEachN(d_temp_storage, + temp_storage_bytes, + thrust::make_counting_iterator(static_cast(0)), + this->capacity(), + op, + stream.get())); } /** From eeecedb06b935c1b9d4f4e302c7ab52c998659a9 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 31 Jul 2024 21:15:56 +0000 Subject: [PATCH 05/17] update docs --- include/cuco/detail/open_addressing/kernels.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 649165486..a306b4c55 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -186,8 +186,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, * @brief Asynchronously executes a callback on every element in the container whose key matches * with a key from the input key sequence. * - * @note Passes an un-incrementable input iterator to the element whose key matches with - * a key from the input key sequence to the callback. + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. * * @tparam CGSize Number of threads in each CG * @tparam BlockSize Number of threads in each block From 48158483d1e2870718376b5ad68456fee75b1bba Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 2 Aug 2024 17:46:36 +0000 Subject: [PATCH 06/17] deallocate temp storage --- include/cuco/detail/open_addressing/open_addressing_impl.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 23f4e4c3e..8ba3724cc 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -728,6 +728,8 @@ class open_addressing_impl { this->capacity(), op, stream.get())); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); } /** From df28e18685e05d2893ab53581608b2e383c7216e Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 7 Aug 2024 23:10:40 +0000 Subject: [PATCH 07/17] use cub ForEachCopyN --- .../open_addressing/open_addressing_impl.cuh | 34 +++---------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 8ba3724cc..52be9f106 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -695,41 +695,17 @@ class open_addressing_impl { template void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) { - std::size_t temp_storage_bytes = 0; - using temp_allocator_type = - typename std::allocator_traits::template rebind_alloc; - auto temp_allocator = temp_allocator_type{this->allocator()}; - auto const is_filled = open_addressing_ns::detail::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; auto const op = - [callback_op, is_filled, storage_ = this->storage_ref()] __device__(auto const idx) { - auto const window_idx = idx / storage_ref_type::window_size; - auto const intra_idx = idx % storage_ref_type::window_size; - auto const slot = storage_[window_idx][intra_idx]; - - if (is_filled(slot)) { callback_op(slot); } + [callback_op, is_filled, storage_ = this->storage_ref()] __device__(auto const window_slots) { + for (auto const slot : window_slots) { + if (is_filled(slot)) { callback_op(slot); } + } }; - CUCO_CUDA_TRY(cub::DeviceFor::ForEachN(nullptr, - temp_storage_bytes, - thrust::make_counting_iterator(static_cast(0)), - this->capacity(), - op, - stream.get())); - - // Allocate temporary storage - auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); - - CUCO_CUDA_TRY(cub::DeviceFor::ForEachN(d_temp_storage, - temp_storage_bytes, - thrust::make_counting_iterator(static_cast(0)), - this->capacity(), - op, - stream.get())); - - temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + cub::DeviceFor::ForEachCopyN(storage_.data(), storage_.num_windows(), op, stream.get()); } /** From 0d2a64918ebdfba77bf542efef022648ec2f1a59 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 7 Aug 2024 23:42:28 +0000 Subject: [PATCH 08/17] minor nits --- .../open_addressing/open_addressing_impl.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 52be9f106..690fc5ad0 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -698,14 +698,14 @@ class open_addressing_impl { auto const is_filled = open_addressing_ns::detail::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; - auto const op = - [callback_op, is_filled, storage_ = this->storage_ref()] __device__(auto const window_slots) { - for (auto const slot : window_slots) { - if (is_filled(slot)) { callback_op(slot); } - } - }; - - cub::DeviceFor::ForEachCopyN(storage_.data(), storage_.num_windows(), op, stream.get()); + auto storage_ref = this->storage_ref(); + auto const op = [callback_op, is_filled, storage_ref] __device__(auto const window_slots) { + for (auto const slot : window_slots) { + if (is_filled(slot)) { callback_op(slot); } + } + }; + + cub::DeviceFor::ForEachCopyN(storage_ref.data(), storage_ref.num_windows(), op, stream.get()); } /** From 61e6200e07bc38949bde47fed9672e4462fb8457 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Aug 2024 14:07:50 -0700 Subject: [PATCH 09/17] Make for_each const --- .../open_addressing/open_addressing_impl.cuh | 4 +- include/cuco/detail/static_map/static_map.inl | 128 ++++++++-------- include/cuco/static_map.cuh | 140 +++++++++--------- 3 files changed, 136 insertions(+), 136 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 690fc5ad0..741568688 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -693,7 +693,7 @@ class open_addressing_impl { * @param stream CUDA stream used for this operation */ template - void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const { auto const is_filled = open_addressing_ns::detail::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; @@ -731,7 +731,7 @@ class open_addressing_impl { InputIt last, CallbackOp&& callback_op, Ref container_ref, - cuda::stream_ref stream) + cuda::stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 554953f69..e575114de 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -145,70 +145,6 @@ void static_mapclear_async(stream); } -template -template -void static_map::for_each( - CallbackOp&& callback_op, cuda::stream_ref stream) -{ - impl_->for_each_async(std::forward(callback_op), stream); - stream.wait(); -} - -template -template -void static_map::for_each_async( - CallbackOp&& callback_op, cuda::stream_ref stream) -{ - impl_->for_each_async(std::forward(callback_op), stream); -} - -template -template -void static_map::for_each( - InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) -{ - impl_->for_each_async( - first, last, std::forward(callback_op), ref(op::for_each), stream); - stream.wait(); -} - -template -template -void static_map::for_each_async( - InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) -{ - impl_->for_each_async( - first, last, std::forward(callback_op), ref(op::for_each), stream); -} - template find_async(first, last, output_begin, ref(op::find), stream); } +template +template +void static_map::for_each( + CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); +} + +template +template +void static_map::for_each( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const noexcept +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); +} + template - void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}); - - /** - * @brief Asynchronously executes a callback on every filled element in the container. - * - * @note Passes an un-incrementable input iterator to the element whose key is filled - * - * @tparam CallbackOp Unary callback functor or device lambda - * - * @param callback_op Function to call on every filled element in the container - * @param stream CUDA stream used for this operation - */ - template - void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}); - - /** - * @brief Executes a callback on every element in the container whose key matches with - * a key from the input key sequence. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. - * - * @tparam InputIt Device accessible random access input iterator whose `value_type` is - * convertible to key type of the map. - * @tparam CallbackOp Unary callback functor or device lambda - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param callback_op Function to call on every element found in the container - * @param stream CUDA stream used for this operation - */ - template - void for_each(InputIt first, - InputIt last, - CallbackOp&& callback_op, - cuda::stream_ref stream = {}); - - /** - * @brief Asynchronously executes a callback on every element in the container whose key matches - * with a key from the input key sequence. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. - * - * @tparam InputIt Device accessible random access input iterator whose `value_type` is - * convertible to key type of the map. - * @tparam CallbackOp Unary callback functor or device lambda - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param callback_op Function to call on every element found in the container - * @param stream CUDA stream used for this operation - */ - template - void for_each_async(InputIt first, - InputIt last, - CallbackOp&& callback_op, - cuda::stream_ref stream = {}); - /** * @brief Inserts all keys in the range `[first, last)` and returns the number of successful * insertions. @@ -832,6 +762,76 @@ class static_map { OutputIt output_begin, cuda::stream_ref stream = {}) const; + /** + * @brief Executes a callback on every filled element in the container. + * + * @note Passes a copy of the filled element to the callback. + * + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param callback_op Function to call on every filled element in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief Asynchronously executes a callback on every filled element in the container. + * + * @note Passes an un-incrementable input iterator to the element whose key is filled + * + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param callback_op Function to call on every filled element in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief Executes a callback on every element in the container whose key matches with + * a key from the input key sequence. + * + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. + * + * @tparam InputIt Device accessible random access input iterator whose `value_type` is + * convertible to key type of the map. + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to call on every element found in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const; + + /** + * @brief Asynchronously executes a callback on every element in the container whose key matches + * with a key from the input key sequence. + * + * @note Passes a copy of the element whose `key` matches with a key from the input key sequence + * to the callback. + * + * @tparam InputIt Device accessible random access input iterator whose `value_type` is + * convertible to key type of the map. + * @tparam CallbackOp Unary callback functor or device lambda + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to call on every element found in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const noexcept; + /** * @brief Retrieves all of the keys and their associated values. * From 1e96ba2e2cc76753a96064207c4988824b7ccca9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Aug 2024 14:09:13 -0700 Subject: [PATCH 10/17] Catch ForEachCopyN CUDA error --- include/cuco/detail/open_addressing/open_addressing_impl.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 741568688..c4dbea782 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -705,7 +705,8 @@ class open_addressing_impl { } }; - cub::DeviceFor::ForEachCopyN(storage_ref.data(), storage_ref.num_windows(), op, stream.get()); + CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN( + storage_ref.data(), storage_ref.num_windows(), op, stream.get())); } /** From a495f96f86554dddab995cf868770231ea636b43 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Aug 2024 14:25:50 -0700 Subject: [PATCH 11/17] Update docs --- .../open_addressing/open_addressing_impl.cuh | 23 ++++------ include/cuco/static_map.cuh | 46 ++++++++----------- 2 files changed, 27 insertions(+), 42 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index c4dbea782..01c8ef5d9 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -683,13 +683,12 @@ class open_addressing_impl { } /** - * @brief Executes a callback on every filled element in the container. + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container * - * @note Passes a copy of the filled element to the callback. + * @tparam CallbackOp Type of unary callback function object * - * @tparam CallbackOp Unary callback functor or device lambda - * - * @param callback_op Function to call on every filled element in the container + * @param callback_op Function to call on every filled slot in the container * @param stream CUDA stream used for this operation */ template @@ -710,20 +709,16 @@ class open_addressing_impl { } /** - * @brief Asynchronously executes a callback on every element in the container whose key matches - * with a key from the input key sequence. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. * - * @tparam InputIt Device accessible random access input iterator whose `value_type` is - * convertible to key type of the map. - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object * @tparam Ref Type of non-owning device container ref allowing access to storage * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param callback_op Function to call on every element found in the container + * @param callback_op Function to call on every match found in the container * @param container_ref Non-owning device container ref used to access the slot storage * @param stream CUDA stream used for this operation */ diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index bd1bdcbe2..30711f171 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -763,45 +763,39 @@ class static_map { cuda::stream_ref stream = {}) const; /** - * @brief Executes a callback on every filled element in the container. - * - * @note Passes a copy of the filled element to the callback. + * @brief Applies the given function object `callback_op` to the copy of every filled slot in the + * container * - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam CallbackOp Type of unary callback function object * - * @param callback_op Function to call on every filled element in the container + * @param callback_op Function to call on every filled slot in the container * @param stream CUDA stream used for this operation */ template void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; /** - * @brief Asynchronously executes a callback on every filled element in the container. + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container * - * @note Passes an un-incrementable input iterator to the element whose key is filled + * @tparam CallbackOp Type of unary callback function object * - * @tparam CallbackOp Unary callback functor or device lambda - * - * @param callback_op Function to call on every filled element in the container + * @param callback_op Function to call on every filled slot in the container * @param stream CUDA stream used for this operation */ template void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; /** - * @brief Executes a callback on every element in the container whose key matches with - * a key from the input key sequence. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @brief For each key in the range [first, last), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. * - * @tparam InputIt Device accessible random access input iterator whose `value_type` is - * convertible to key type of the map. - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param callback_op Function to call on every element found in the container + * @param callback_op Function to call on every match found in the container * @param stream CUDA stream used for this operation */ template @@ -811,19 +805,15 @@ class static_map { cuda::stream_ref stream = {}) const; /** - * @brief Asynchronously executes a callback on every element in the container whose key matches - * with a key from the input key sequence. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. * - * @tparam InputIt Device accessible random access input iterator whose `value_type` is - * convertible to key type of the map. - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param callback_op Function to call on every element found in the container + * @param callback_op Function to call on every match found in the container * @param stream CUDA stream used for this operation */ template From a1b39f14bb5e1aaa3bdacb34ed0efc54f8c0e06f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Aug 2024 14:32:29 -0700 Subject: [PATCH 12/17] Rename the kernel as for_each_n --- include/cuco/detail/open_addressing/kernels.cuh | 17 +++++++---------- .../open_addressing/open_addressing_impl.cuh | 2 +- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index a306b4c55..adb8c7f13 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -183,17 +183,14 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, } /** - * @brief Asynchronously executes a callback on every element in the container whose key matches - * with a key from the input key sequence. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @brief For each key in the range [first, first + n), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. * * @tparam CGSize Number of threads in each CG * @tparam BlockSize Number of threads in each block * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the `key_type` of the data structure - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam CallbackOp Type of unary callback function object * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of input elements @@ -202,10 +199,10 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, * @param ref Non-owning container device ref used to access the slot storage */ template -CUCO_KERNEL __launch_bounds__(BlockSize) void for_each(InputIt first, - cuco::detail::index_type n, - CallbackOp callback_op, - Ref ref) +CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first, + cuco::detail::index_type n, + CallbackOp callback_op, + Ref ref) { auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 01c8ef5d9..4f706ff8d 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -734,7 +734,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::for_each + detail::for_each_n <<>>( first, num_keys, std::forward(callback_op), container_ref); } From 0105682b5d9c012310aee2001625e21946cef965 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Aug 2024 14:49:25 -0700 Subject: [PATCH 13/17] Update ref docs --- .../cuco/detail/static_map/static_map_ref.inl | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 20f53aa58..f0ef6cc66 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -1271,17 +1271,14 @@ class operator_impl< public: /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. + * @brief For a given key, applies the function object `callback_op` to its match found in the + * container. * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. - * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to the match */ template __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept @@ -1292,11 +1289,8 @@ class operator_impl< } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. - * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @brief For a given key, applies the function object `callback_op` to its match found in the + * container. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -1304,12 +1298,12 @@ class operator_impl< * * @note Synchronizing `group` within `callback_op` is undefined behavior. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to the match */ template __device__ void for_each(cooperative_groups::thread_block_tile const& group, From c081b0efcc846ddae8c6f3e79ac4bafa0c4796a1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Aug 2024 14:51:17 -0700 Subject: [PATCH 14/17] Revert multiset doc changes --- .../detail/static_multiset/static_multiset_ref.inl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl index ddf1d036e..fc1f3db9d 100644 --- a/include/cuco/detail/static_multiset/static_multiset_ref.inl +++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl @@ -495,8 +495,8 @@ class operator_impl< * @brief Executes a callback on every element in the container with key equivalent to the probe * key. * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @note Passes an un-incrementable input iterator to the element whose key is equivalent to + * `key` to the callback. * * @tparam ProbeKey Input type which is convertible to 'key_type' * @tparam CallbackOp Unary callback functor or device lambda @@ -516,8 +516,8 @@ class operator_impl< * @brief Executes a callback on every element in the container with key equivalent to the probe * key. * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @note Passes an un-incrementable input iterator to the element whose key is equivalent to + * `key` to the callback. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -547,8 +547,8 @@ class operator_impl< * key and can additionally perform work that requires synchronizing the Cooperative Group * performing this operation. * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @note Passes an un-incrementable input iterator to the element whose key is equivalent to + * `key` to the callback. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, From 3f29affed421e539e3bd51bcf87a14e76b2e2c15 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Aug 2024 10:26:28 -0700 Subject: [PATCH 15/17] Update OA docs --- .../open_addressing_ref_impl.cuh | 38 ++++++++----------- .../cuco/detail/static_map/static_map_ref.inl | 4 +- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 2e8f038a6..1c30f305f 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -966,17 +966,14 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. + * @brief For a given key, applies the function object `callback_op` to all corresponding matches + * found in the container. * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. - * - * @tparam ProbeKey Input type which is convertible to 'key_type' - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object * * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every match */ template __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept @@ -1006,11 +1003,8 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @brief For a given key, applies the function object `callback_op` to all corresponding matches + * found in the container. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -1018,12 +1012,12 @@ class open_addressing_ref_impl { * * @note Synchronizing `group` within `callback_op` is undefined behavior. * - * @tparam ProbeKey Input type which is convertible to 'key_type' - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every match */ template __device__ void for_each(cooperative_groups::thread_block_tile const& group, @@ -1060,9 +1054,9 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key and can additionally perform work that requires synchronizing the Cooperative Group - * performing this operation. + * @brief Applies the function object `callback_op` on every slot in the container with key + * equivalent to the probe key and can additionally perform work that requires synchronizing the + * Cooperative Group performing this operation. * * @note Passes a copy of the element whose `key` matches with a key from the input key sequence * to the callback. @@ -1078,13 +1072,13 @@ class open_addressing_ref_impl { * synchronization points is capped by `window_size * cg_size`. The functor will be called right * after the current probing window has been traversed. * - * @tparam ProbeKey Input type which is convertible to 'key_type' - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object * @tparam SyncOp Functor or device lambda which accepts the current `group` object * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every match * @param sync_op Function that is allowed to synchronize `group` inbetween probing windows */ template diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index f0ef6cc66..e37294fbe 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -1275,7 +1275,7 @@ class operator_impl< * container. * * @tparam ProbeKey Probe key type - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam CallbackOp Type of unary callback function object * * @param key The key to search for * @param callback_op Function to apply to the match @@ -1299,7 +1299,7 @@ class operator_impl< * @note Synchronizing `group` within `callback_op` is undefined behavior. * * @tparam ProbeKey Probe key type - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam CallbackOp Type of unary callback function object * * @param group The Cooperative Group used to perform this operation * @param key The key to search for From d9fd78fceb55f669a5f32ff6192bbffa870b4c4a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Aug 2024 10:29:46 -0700 Subject: [PATCH 16/17] Minor doc updates --- .../open_addressing_ref_impl.cuh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 1c30f305f..c5d3fd4df 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -966,8 +966,8 @@ class open_addressing_ref_impl { } /** - * @brief For a given key, applies the function object `callback_op` to all corresponding matches - * found in the container. + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. * * @tparam ProbeKey Probe key type * @tparam CallbackOp Type of unary callback function object @@ -1003,8 +1003,8 @@ class open_addressing_ref_impl { } /** - * @brief For a given key, applies the function object `callback_op` to all corresponding matches - * found in the container. + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -1054,12 +1054,9 @@ class open_addressing_ref_impl { } /** - * @brief Applies the function object `callback_op` on every slot in the container with key - * equivalent to the probe key and can additionally perform work that requires synchronizing the - * Cooperative Group performing this operation. - * - * @note Passes a copy of the element whose `key` matches with a key from the input key sequence - * to the callback. + * @brief Applies the function object `callback_op` to the copy of every slot in the container + * with key equivalent to the probe key and can additionally perform work that requires + * synchronizing the Cooperative Group performing this operation. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, From 680f6e3d3aed11f8045945d839b824be3e551a85 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Aug 2024 17:24:01 -0700 Subject: [PATCH 17/17] Update docs --- include/cuco/detail/open_addressing/kernels.cuh | 4 +++- .../open_addressing/open_addressing_impl.cuh | 4 ++++ .../open_addressing/open_addressing_ref_impl.cuh | 12 +++++++++--- .../cuco/detail/static_map/static_map_ref.inl | 11 +++++++---- include/cuco/static_map.cuh | 16 ++++++++++++---- 5 files changed, 35 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index adb8c7f13..24fce230c 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -186,6 +186,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, * @brief For each key in the range [first, first + n), applies the function object `callback_op` to * the copy of all corresponding matches found in the container. * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam CGSize Number of threads in each CG * @tparam BlockSize Number of threads in each block * @tparam InputIt Device accessible input iterator whose `value_type` is @@ -195,7 +197,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, * * @param first Beginning of the sequence of input elements * @param n Number of input elements - * @param callback_op Function to call on every element found in the container + * @param callback_op Function to call on every matched slot found in the container * @param ref Non-owning container device ref used to access the slot storage */ template diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 4f706ff8d..f9c35e0ff 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -686,6 +686,8 @@ class open_addressing_impl { * @brief Asynchronously applies the given function object `callback_op` to the copy of every * filled slot in the container * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam CallbackOp Type of unary callback function object * * @param callback_op Function to call on every filled slot in the container @@ -712,6 +714,8 @@ class open_addressing_impl { * @brief For each key in the range [first, last), asynchronously applies the function object * `callback_op` to the copy of all corresponding matches found in the container. * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam InputIt Device accessible random access input iterator * @tparam CallbackOp Type of unary callback function object * @tparam Ref Type of non-owning device container ref allowing access to storage diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index c5d3fd4df..ef402fc7b 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -969,11 +969,13 @@ class open_addressing_ref_impl { * @brief For a given key, applies the function object `callback_op` to the copy of all * corresponding matches found in the container. * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam ProbeKey Probe key type * @tparam CallbackOp Type of unary callback function object * * @param key The key to search for - * @param callback_op Function to apply to every match + * @param callback_op Function to apply to every matched slot */ template __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept @@ -1010,6 +1012,8 @@ class open_addressing_ref_impl { * callback if it finds a matching element. If multiple elements are found within the same group, * each thread with a match will call the callback with its associated element. * + * @note The return value of `callback_op`, if any, is ignored. + * * @note Synchronizing `group` within `callback_op` is undefined behavior. * * @tparam ProbeKey Probe key type @@ -1017,7 +1021,7 @@ class open_addressing_ref_impl { * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to apply to every match + * @param callback_op Function to apply to every matched slot */ template __device__ void for_each(cooperative_groups::thread_block_tile const& group, @@ -1064,6 +1068,8 @@ class open_addressing_ref_impl { * * @note Synchronizing `group` within `callback_op` is undefined behavior. * + * @note The return value of `callback_op`, if any, is ignored. + * * @note The `sync_op` function can be used to perform work that requires synchronizing threads in * `group` inbetween probing steps, where the number of probing steps performed between * synchronization points is capped by `window_size * cg_size`. The functor will be called right @@ -1075,7 +1081,7 @@ class open_addressing_ref_impl { * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to apply to every match + * @param callback_op Function to apply to every matched slot * @param sync_op Function that is allowed to synchronize `group` inbetween probing windows */ template diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index e37294fbe..f13847cc2 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -1274,11 +1274,13 @@ class operator_impl< * @brief For a given key, applies the function object `callback_op` to its match found in the * container. * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam ProbeKey Probe key type * @tparam CallbackOp Type of unary callback function object * * @param key The key to search for - * @param callback_op Function to apply to the match + * @param callback_op Function to apply to the copy of the matched key-value pair */ template __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept @@ -1293,8 +1295,9 @@ class operator_impl< * container. * * @note This function uses cooperative group semantics, meaning that any thread may call the - * callback if it finds a matching element. If multiple elements are found within the same group, - * each thread with a match will call the callback with its associated element. + * callback if it finds a matching key-value pair. + * + * @note The return value of `callback_op`, if any, is ignored. * * @note Synchronizing `group` within `callback_op` is undefined behavior. * @@ -1303,7 +1306,7 @@ class operator_impl< * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to apply to the match + * @param callback_op Function to apply to the copy of the matched key-value pair */ template __device__ void for_each(cooperative_groups::thread_block_tile const& group, diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 30711f171..01a39ad5d 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -766,9 +766,11 @@ class static_map { * @brief Applies the given function object `callback_op` to the copy of every filled slot in the * container * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam CallbackOp Type of unary callback function object * - * @param callback_op Function to call on every filled slot in the container + * @param callback_op Function to apply to the copy of the matched key-value pair * @param stream CUDA stream used for this operation */ template @@ -778,9 +780,11 @@ class static_map { * @brief Asynchronously applies the given function object `callback_op` to the copy of every * filled slot in the container * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam CallbackOp Type of unary callback function object * - * @param callback_op Function to call on every filled slot in the container + * @param callback_op Function to apply to the copy of the matched key-value pair * @param stream CUDA stream used for this operation */ template @@ -790,12 +794,14 @@ class static_map { * @brief For each key in the range [first, last), applies the function object `callback_op` to * the copy of all corresponding matches found in the container. * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam InputIt Device accessible random access input iterator * @tparam CallbackOp Type of unary callback function object * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param callback_op Function to call on every match found in the container + * @param callback_op Function to apply to the copy of the matched key-value pair * @param stream CUDA stream used for this operation */ template @@ -808,12 +814,14 @@ class static_map { * @brief For each key in the range [first, last), asynchronously applies the function object * `callback_op` to the copy of all corresponding matches found in the container. * + * @note The return value of `callback_op`, if any, is ignored. + * * @tparam InputIt Device accessible random access input iterator * @tparam CallbackOp Type of unary callback function object * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param callback_op Function to call on every match found in the container + * @param callback_op Function to apply to the copy of the matched key-value pair * @param stream CUDA stream used for this operation */ template