diff --git a/README.md b/README.md index f6c80ba2f..e75b8a2b2 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith ### Major Updates +__02/03/2026__ Modernized `dynamic_map`: promoted `cuco::experimental::dynamic_map` to `cuco::dynamic_map` and removed the legacy implementation + __01/30/2026__ Removed legacy `static_multimap` implementation and promoted `cuco::experimental::static_multimap` to `cuco::static_multimap` __10/08/2025__ Changed `cuda_allocator` to stream-ordered, requiring `cuda::stream_ref` parameter in `allocate`/`deallocate`. @@ -244,7 +246,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection ### `dynamic_map` -`cuco::dynamic_map` links together multiple `cuco::static_map`s to provide a hash table that can grow as key-value pairs are inserted. It currently only provides host-bulk APIs. See the Doxygen documentation in `dynamic_map.cuh` for more detailed information. +`cuco::dynamic_map` links together multiple `cuco::static_map`s to provide a hash table that can grow as key-value pairs are inserted. It supports `insert`, `insert_or_assign`, `erase`, `find`, `contains`, and `retrieve_all` operations via host-bulk APIs with kernel-based implementations for optimal performance. See the Doxygen documentation in `dynamic_map.cuh` for more detailed information. #### Examples: - [Host-bulk APIs (TODO)]() diff --git a/benchmarks/dynamic_map/contains_bench.cu b/benchmarks/dynamic_map/contains_bench.cu index 60d8af1b5..90867ee64 100644 --- a/benchmarks/dynamic_map/contains_bench.cu +++ b/benchmarks/dynamic_map/contains_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. + * Copyright (c) 2023-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,7 +62,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_contains( state.add_element_count(num_keys); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + map.contains(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); }); } diff --git a/benchmarks/dynamic_map/erase_bench.cu b/benchmarks/dynamic_map/erase_bench.cu index c6f373138..5e0a02547 100644 --- a/benchmarks/dynamic_map/erase_bench.cu +++ b/benchmarks/dynamic_map/erase_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. + * Copyright (c) 2023-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,15 +56,14 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_erase( state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - // dynamic map with erase support cuco::dynamic_map map{static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + map.insert(pairs.begin(), pairs.end(), {launch.get_stream()}); timer.start(); - map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream()); + map.erase(keys.begin(), keys.end(), {launch.get_stream()}); timer.stop(); }); } diff --git a/benchmarks/dynamic_map/find_bench.cu b/benchmarks/dynamic_map/find_bench.cu index f68987857..2fd87d27c 100644 --- a/benchmarks/dynamic_map/find_bench.cu +++ b/benchmarks/dynamic_map/find_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. + * Copyright (c) 2023-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,7 +62,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_find( state.add_element_count(num_keys); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + map.find(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); }); } diff --git a/benchmarks/dynamic_map/insert_bench.cu b/benchmarks/dynamic_map/insert_bench.cu index f157a79b4..7c3d24bcf 100644 --- a/benchmarks/dynamic_map/insert_bench.cu +++ b/benchmarks/dynamic_map/insert_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. + * Copyright (c) 2023-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,15 +57,12 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_insert( state.exec( nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - cuco::dynamic_map map{static_cast(initial_size), - cuco::empty_key{-1}, - cuco::empty_value{-1}, - {}, - launch.get_stream()}; + cuco::dynamic_map map{ + static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; timer.start(); for (int64_t i = 0; i < num_keys; i += batch_size) { - map.insert(pairs.begin() + i, pairs.begin() + i + batch_size, {}, {}, launch.get_stream()); + map.insert(pairs.begin() + i, pairs.begin() + i + batch_size, {launch.get_stream()}); } timer.stop(); }); diff --git a/benchmarks/dynamic_map/retrieve_all_bench.cu b/benchmarks/dynamic_map/retrieve_all_bench.cu index fb7d5b0bf..1dd34bbe1 100644 --- a/benchmarks/dynamic_map/retrieve_all_bench.cu +++ b/benchmarks/dynamic_map/retrieve_all_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,14 +53,14 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_retrieve_all( cuco::dynamic_map map{ static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(pairs.begin(), pairs.end()); - // Prepare output buffers - thrust::device_vector retrieved_keys(map.get_size()); - thrust::device_vector retrieved_values(map.get_size()); - state.add_element_count(map.get_size()); + thrust::device_vector retrieved_keys(map.size()); + thrust::device_vector retrieved_values(map.size()); + + state.add_element_count(map.size()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_all(retrieved_keys.begin(), retrieved_values.begin(), launch.get_stream()); + map.retrieve_all(retrieved_keys.begin(), retrieved_values.begin(), {launch.get_stream()}); }); } diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl deleted file mode 100644 index 962d95b69..000000000 --- a/include/cuco/detail/dynamic_map.inl +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -namespace cuco { - -template -dynamic_map::dynamic_map(std::size_t initial_capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - Allocator const& alloc, - cudaStream_t stream) - : empty_key_sentinel_(empty_key_sentinel.value), - empty_value_sentinel_(empty_value_sentinel.value), - erased_key_sentinel_(empty_key_sentinel.value), - size_(0), - capacity_(initial_capacity), - min_insert_size_(1E4), - max_load_factor_(0.60), - alloc_{alloc} -{ - submaps_.push_back(std::make_unique>( - initial_capacity, - empty_key{empty_key_sentinel}, - empty_value{empty_value_sentinel}, - alloc, - stream)); - submap_views_.push_back(submaps_[0]->get_device_view()); - submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->num_successes_); -} - -template -dynamic_map::dynamic_map(std::size_t initial_capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel, - Allocator const& alloc, - cudaStream_t stream) - : empty_key_sentinel_(empty_key_sentinel.value), - empty_value_sentinel_(empty_value_sentinel.value), - erased_key_sentinel_(erased_key_sentinel.value), - size_(0), - capacity_(initial_capacity), - min_insert_size_(1E4), - max_load_factor_(0.60), - alloc_{alloc} -{ - CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, - "The empty key sentinel and erased key sentinel cannot be the same value.", - std::runtime_error); - - submaps_.push_back(std::make_unique>( - initial_capacity, - empty_key{empty_key_sentinel_}, - empty_value{empty_value_sentinel_}, - erased_key{erased_key_sentinel_}, - alloc, - stream)); - submap_views_.push_back(submaps_[0]->get_device_view()); - submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->num_successes_); -} - -template -void dynamic_map::reserve(std::size_t n, cudaStream_t stream) -{ - // Calculate current total available capacity across all submaps - std::size_t total_available_capacity = 0; - for (std::size_t i = 0; i < submaps_.size(); ++i) { - std::size_t submap_usable_capacity = - static_cast(max_load_factor_ * submaps_[i]->get_capacity()); - // Only count capacity above the minimum insert threshold - if (submap_usable_capacity >= min_insert_size_) { - total_available_capacity += submap_usable_capacity - min_insert_size_; - } - } - - // Create new submaps until we have enough capacity - while (total_available_capacity < n) { - std::size_t new_submap_capacity = capacity_; - - if (erased_key_sentinel_ != empty_key_sentinel_) { - submaps_.push_back(std::make_unique>( - new_submap_capacity, - empty_key{empty_key_sentinel_}, - empty_value{empty_value_sentinel_}, - erased_key{erased_key_sentinel_}, - alloc_, - stream)); - } else { - submaps_.push_back(std::make_unique>( - new_submap_capacity, - empty_key{empty_key_sentinel_}, - empty_value{empty_value_sentinel_}, - alloc_, - stream)); - } - - std::size_t submap_idx = submaps_.size() - 1; - submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_); - submap_views_.push_back(submaps_[submap_idx]->get_device_view()); - submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); - - // Add the new submap's usable capacity - std::size_t new_usable_capacity = - static_cast(max_load_factor_ * new_submap_capacity); - if (new_usable_capacity >= min_insert_size_) { - total_available_capacity += new_usable_capacity - min_insert_size_; - } - - // Update capacity for next submap (double the size) - capacity_ *= 2; - } -} - -template -template -void dynamic_map::insert( - InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) -{ - // TODO: memset an atomic variable is unsafe - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), - "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); - - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto constexpr tile_size = 4; - - std::size_t num_to_insert = std::distance(first, last); - - reserve(size_ + num_to_insert, stream); - - std::size_t submap_idx = 0; - while (num_to_insert > 0) { - std::size_t capacity_remaining = - max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size(); - // If we are tying to insert some of the remaining keys into this submap, we can insert - // only if we meet the minimum insert size. - if (capacity_remaining >= min_insert_size_) { - CUCO_CUDA_TRY( - cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream)); - - auto const n = std::min(capacity_remaining, num_to_insert); - auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size); - - detail::insert> - <<>>(first, - first + n, - submap_views_.data().get(), - submap_mutable_views_.data().get(), - submap_num_successes_.data().get(), - submap_idx, - submaps_.size(), - hash, - key_equal); - - std::size_t h_num_successes; - CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes, - submap_num_successes_[submap_idx], - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - stream)); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - submaps_[submap_idx]->size_ += h_num_successes; - size_ += h_num_successes; - first += n; - num_to_insert -= n; - } - submap_idx++; - } -} - -template -template -void dynamic_map::erase( - InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) -{ - // TODO: memset an atomic variable is unsafe - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), - "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); - - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto constexpr tile_size = 4; - - auto const num_keys = std::distance(first, last); - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - - // zero out submap success counters - for (std::size_t i = 0; i < submaps_.size(); ++i) { - CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream)); - } - - auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); - - detail::erase - <<>>(first, - first + num_keys, - submap_mutable_views_.data().get(), - submap_num_successes_.data().get(), - submaps_.size(), - hash, - key_equal); - - for (std::size_t i = 0; i < submaps_.size(); ++i) { - std::size_t h_submap_num_successes; - CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes, - submap_num_successes_[i], - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost, - stream)); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - submaps_[i]->size_ -= h_submap_num_successes; - size_ -= h_submap_num_successes; - } -} - -template -template -void dynamic_map::find(InputIt first, - InputIt last, - OutputIt output_begin, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) -{ - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto constexpr tile_size = 4; - - auto const num_keys = std::distance(first, last); - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - - detail::find<<>>( - first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); -} - -template -template -std::pair dynamic_map::retrieve_all( - KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const -{ - auto constexpr block_size = 128; - auto constexpr stride = 1; - - auto const capacity = get_capacity(); - auto grid_size = (capacity + stride * block_size - 1) / (stride * block_size); - - std::vector submap_cap_prefix(submaps_.size()); - std::inclusive_scan( - submaps_.begin(), - submaps_.end(), - submap_cap_prefix.begin(), - [](auto const& sum, auto const& submap) { return sum + submap->get_capacity(); }, - size_t{0}); - thrust::device_vector submap_cap_prefix_d(submap_cap_prefix); - - auto counter = - detail::counter_storage{this->alloc_, cuda::stream_ref{stream}}; - counter.reset({stream}); - - detail::retrieve_all - <<>>(keys_out, - values_out, - submap_views_.data().get(), - submaps_.size(), - capacity, - counter.data(), - submap_cap_prefix_d.data().get()); - - auto const h_num_out = counter.load_to_host({stream}); - return {keys_out + h_num_out, values_out + h_num_out}; -} - -template -template -void dynamic_map::contains(InputIt first, - InputIt last, - OutputIt output_begin, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) -{ - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto constexpr tile_size = 4; - - auto const num_keys = std::distance(first, last); - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - - detail::contains<<>>( - first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); -} - -} // namespace cuco diff --git a/include/cuco/detail/dynamic_map/dynamic_map.inl b/include/cuco/detail/dynamic_map/dynamic_map.inl index fb65147f5..670239b58 100644 --- a/include/cuco/detail/dynamic_map/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map/dynamic_map.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,20 +14,22 @@ * limitations under the License. */ -#include -#include +#include #include #include #include -#include +#include #include #include #include +#include +#include +#include +#include namespace cuco { -namespace experimental { template (1E4)}, - max_load_factor_{0.60}, + max_load_factor_{0.60f}, alloc_{alloc} { - submaps_.push_back( - std::make_unique< - cuco::static_map>( - initial_capacity, - empty_key_sentinel, - empty_value_sentinel, - pred, - probing_scheme, - scope, - storage, - alloc, - stream)); + submaps_.push_back(std::make_unique(initial_capacity, + empty_key_sentinel, + empty_value_sentinel, + pred, + probing_scheme, + scope, + storage, + alloc, + stream)); +} + +template +constexpr dynamic_map:: + dynamic_map(Extent initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + cuda_thread_scope scope, + Storage storage, + Allocator const& alloc, + cuda::stream_ref stream) + : size_{0}, + capacity_{initial_capacity}, + min_insert_size_{static_cast(1E4)}, + max_load_factor_{0.60f}, + alloc_{alloc} +{ + CUCO_EXPECTS(empty_key_sentinel.value != erased_key_sentinel.value, + "The empty key sentinel and erased key sentinel cannot be the same value.", + std::runtime_error); + + submaps_.push_back(std::make_unique(initial_capacity, + empty_key_sentinel, + empty_value_sentinel, + erased_key_sentinel, + pred, + probing_scheme, + scope, + storage, + alloc, + stream)); } template reserve(size_ + num_to_insert, stream); + // Fast path: single submap, no cross-submap duplicate check needed + if (submaps_.size() == 1) { + size_ += submaps_.front()->insert(first, last, stream); + return; + } + + // Multiple submaps: use kernel to check for duplicates across all submaps + using ref_type = decltype(submaps_.front()->ref(cuco::op::contains, cuco::op::insert)); + + using ref_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto ref_allocator = ref_allocator_type{alloc_}; + + using counter_allocator_type = + typename std::allocator_traits::template rebind_alloc>; + auto counter_allocator = counter_allocator_type{alloc_}; + std::size_t submap_idx = 0; while (num_to_insert > 0) { auto& cur = submaps_[submap_idx]; auto capacity_remaining = max_load_factor_ * cur->capacity() - cur->size(); - // If we are tying to insert some of the remaining keys into this submap, we can insert - // only if we meet the minimum insert size. if (capacity_remaining >= min_insert_size_) { auto const n = std::min(static_cast(capacity_remaining), num_to_insert); - std::size_t h_num_successes = cur->insert(first, first + n, stream); + // Allocate and initialize device counter + auto* d_num_successes = counter_allocator.allocate(1, stream); + CUCO_CUDA_TRY( + cudaMemsetAsync(d_num_successes, 0, sizeof(cuda::atomic), stream.get())); + + // Allocate and copy refs for all submaps (with both contains and insert ops) + auto* d_submap_refs = ref_allocator.allocate(submaps_.size(), stream); + std::vector h_submap_refs; + h_submap_refs.reserve(submaps_.size()); + for (auto const& submap : submaps_) { + h_submap_refs.push_back(submap->ref(cuco::op::contains, cuco::op::insert)); + } + CUCO_CUDA_TRY(cudaMemcpyAsync(d_submap_refs, + h_submap_refs.data(), + sizeof(ref_type) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + + auto constexpr cg_size = ProbingScheme::cg_size; + auto constexpr block_size = cuco::detail::default_block_size(); + auto const grid_size = cuco::detail::grid_size(n, cg_size); + + detail::dynamic_map_ns::insert + <<>>(first, + n, + d_num_successes, + d_submap_refs, + static_cast(submap_idx), + static_cast(submaps_.size())); + + // Read back success count + std::size_t h_num_successes = 0; + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes, + d_num_successes, + sizeof(std::size_t), + cudaMemcpyDeviceToHost, + stream.get())); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get())); + + ref_allocator.deallocate(d_submap_refs, submaps_.size(), stream); + counter_allocator.deallocate(d_num_successes, 1, stream); size_ += h_num_successes; first += n; @@ -102,6 +197,101 @@ void dynamic_map +template +void dynamic_map:: + insert_or_assign(InputIt first, InputIt last, cuda::stream_ref stream) +{ + auto num_to_insert = cuco::detail::distance(first, last); + this->reserve(size_ + num_to_insert, stream); + + // Fast path: single submap + if (submaps_.size() == 1) { + auto const old_size = submaps_.front()->size(stream); + submaps_.front()->insert_or_assign(first, last, stream); + auto const new_size = submaps_.front()->size(stream); + size_ += (new_size - old_size); + return; + } + + // Multiple submaps: use kernel to check for existing keys across all submaps + using ref_type = decltype(submaps_.front()->ref( + cuco::op::contains, cuco::op::insert, cuco::op::insert_or_assign)); + + using ref_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto ref_allocator = ref_allocator_type{alloc_}; + + using counter_allocator_type = + typename std::allocator_traits::template rebind_alloc>; + auto counter_allocator = counter_allocator_type{alloc_}; + + std::size_t submap_idx = 0; + while (num_to_insert > 0) { + auto& cur = submaps_[submap_idx]; + + auto capacity_remaining = max_load_factor_ * cur->capacity() - cur->size(); + if (capacity_remaining >= min_insert_size_) { + auto const n = std::min(static_cast(capacity_remaining), num_to_insert); + + // Allocate and initialize device counter for new insertions + auto* d_num_insertions = counter_allocator.allocate(1, stream); + CUCO_CUDA_TRY( + cudaMemsetAsync(d_num_insertions, 0, sizeof(cuda::atomic), stream.get())); + + // Allocate and copy refs for all submaps + auto* d_submap_refs = ref_allocator.allocate(submaps_.size(), stream); + std::vector h_submap_refs; + h_submap_refs.reserve(submaps_.size()); + for (auto const& submap : submaps_) { + h_submap_refs.push_back( + submap->ref(cuco::op::contains, cuco::op::insert, cuco::op::insert_or_assign)); + } + CUCO_CUDA_TRY(cudaMemcpyAsync(d_submap_refs, + h_submap_refs.data(), + sizeof(ref_type) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + + auto constexpr cg_size = ProbingScheme::cg_size; + auto constexpr block_size = cuco::detail::default_block_size(); + auto const grid_size = cuco::detail::grid_size(n, cg_size); + + detail::dynamic_map_ns::insert_or_assign + <<>>(first, + n, + d_num_insertions, + d_submap_refs, + static_cast(submap_idx), + static_cast(submaps_.size())); + + // Read back insertion count (only new insertions, not assignments) + std::size_t h_num_insertions = 0; + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_insertions, + d_num_insertions, + sizeof(std::size_t), + cudaMemcpyDeviceToHost, + stream.get())); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get())); + + ref_allocator.deallocate(d_submap_refs, submaps_.size(), stream); + counter_allocator.deallocate(d_num_insertions, 1, stream); + + size_ += h_num_insertions; + first += n; + num_to_insert -= n; + } + submap_idx++; + } +} + template ::reserve( size_type n, cuda::stream_ref stream) { - size_type num_elements_remaining = n; - std::size_t submap_idx = 0; - while (num_elements_remaining > 0) { + auto const& ref = *submaps_.front(); + auto const empty_key_val = ref.empty_key_sentinel(); + auto const empty_value_val = ref.empty_value_sentinel(); + auto const erased_key_val = ref.erased_key_sentinel(); + auto const pred = ref.key_eq(); + auto const probing_scheme = ProbingScheme{ref.hash_function()}; + auto const has_erased_key = empty_key_val != erased_key_val; + + std::size_t submap_idx = 0; + while (n > 0) { std::size_t submap_capacity; - // if the submap already exists if (submap_idx < submaps_.size()) { submap_capacity = submaps_[submap_idx]->capacity(); - } - // if the submap does not exist yet, create it - else { - empty_key empty_key_sentinel{submaps_.front()->empty_key_sentinel()}; - empty_value empty_value_sentinel{submaps_.front()->empty_value_sentinel()}; - + } else { submap_capacity = capacity_; - submaps_.push_back(std::make_unique(submap_capacity, - empty_key_sentinel, - empty_value_sentinel, - KeyEqual{}, - ProbingScheme{}, - cuda_thread_scope{}, - Storage{}, - alloc_, - stream)); + + if (has_erased_key) { + submaps_.push_back(std::make_unique(submap_capacity, + empty_key{empty_key_val}, + empty_value{empty_value_val}, + erased_key{erased_key_val}, + pred, + probing_scheme, + cuda_thread_scope{}, + Storage{}, + alloc_, + stream)); + } else { + submaps_.push_back(std::make_unique(submap_capacity, + empty_key{empty_key_val}, + empty_value{empty_value_val}, + pred, + probing_scheme, + cuda_thread_scope{}, + Storage{}, + alloc_, + stream)); + } capacity_ *= 2; } - num_elements_remaining -= max_load_factor_ * submap_capacity - min_insert_size_; + auto const usable_capacity = + static_cast(max_load_factor_ * submap_capacity) - min_insert_size_; + if (usable_capacity >= n) { break; } + n -= usable_capacity; submap_idx++; } } +template +template +void dynamic_map::erase( + InputIt first, InputIt last, cuda::stream_ref stream) +{ + auto const& ref = *submaps_.front(); + CUCO_EXPECTS(ref.empty_key_sentinel() != ref.erased_key_sentinel(), + "Erase requires a unique erased key sentinel to be provided at construction.", + std::runtime_error); + + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + // Fast path: single submap + if (submaps_.size() == 1) { + auto const size_before = submaps_.front()->size(stream); + submaps_.front()->erase(first, last, stream); + auto const size_after = submaps_.front()->size(stream); + size_ -= (size_before - size_after); + return; + } + + // Multiple submaps: use kernel to erase from all submaps in parallel + using erase_ref_type = decltype(submaps_.front()->ref(cuco::op::erase)); + + using ref_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto ref_allocator = ref_allocator_type{alloc_}; + + using counter_allocator_type = + typename std::allocator_traits::template rebind_alloc>; + auto counter_allocator = counter_allocator_type{alloc_}; + + // Allocate and initialize device counter + auto* d_num_successes = counter_allocator.allocate(1, stream); + CUCO_CUDA_TRY( + cudaMemsetAsync(d_num_successes, 0, sizeof(cuda::atomic), stream.get())); + + // Allocate and copy erase refs for all submaps + auto* d_refs = ref_allocator.allocate(submaps_.size(), stream); + std::vector h_refs; + h_refs.reserve(submaps_.size()); + for (auto const& submap : submaps_) { + h_refs.push_back(submap->ref(cuco::op::erase)); + } + CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs, + h_refs.data(), + sizeof(erase_ref_type) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + + auto constexpr cg_size = ProbingScheme::cg_size; + auto constexpr block_size = cuco::detail::default_block_size(); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + detail::dynamic_map_ns::erase<<>>( + first, num_keys, d_num_successes, d_refs, static_cast(submaps_.size())); + + // Read back success count + std::size_t h_num_successes = 0; + CUCO_CUDA_TRY(cudaMemcpyAsync( + &h_num_successes, d_num_successes, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream.get())); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get())); + + ref_allocator.deallocate(d_refs, submaps_.size(), stream); + counter_allocator.deallocate(d_num_successes, 1, stream); + + size_ -= h_num_successes; +} + +template +template +void dynamic_map::erase_async( + InputIt first, InputIt last, cuda::stream_ref stream) +{ + auto const& ref = *submaps_.front(); + CUCO_EXPECTS(ref.empty_key_sentinel() != ref.erased_key_sentinel(), + "Erase requires a unique erased key sentinel to be provided at construction.", + std::runtime_error); + + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + // Fast path: single submap + if (submaps_.size() == 1) { + submaps_.front()->erase_async(first, last, stream); + return; + } + + // Multiple submaps: use kernel to erase from all submaps in parallel + using erase_ref_type = decltype(submaps_.front()->ref(cuco::op::erase)); + + using ref_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto ref_allocator = ref_allocator_type{alloc_}; + + // Allocate and copy erase refs for all submaps + auto* d_refs = ref_allocator.allocate(submaps_.size(), stream); + std::vector h_refs; + h_refs.reserve(submaps_.size()); + for (auto const& submap : submaps_) { + h_refs.push_back(submap->ref(cuco::op::erase)); + } + CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs, + h_refs.data(), + sizeof(erase_ref_type) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + + auto constexpr cg_size = ProbingScheme::cg_size; + auto constexpr block_size = cuco::detail::default_block_size(); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + // For async, we don't track success count + using counter_allocator_type = + typename std::allocator_traits::template rebind_alloc>; + auto counter_allocator = counter_allocator_type{alloc_}; + auto* d_num_successes = counter_allocator.allocate(1, stream); + CUCO_CUDA_TRY( + cudaMemsetAsync(d_num_successes, 0, sizeof(cuda::atomic), stream.get())); + + detail::dynamic_map_ns::erase<<>>( + first, num_keys, d_num_successes, d_refs, static_cast(submaps_.size())); + + // Deallocate asynchronously (counter value is discarded for async) + ref_allocator.deallocate(d_refs, submaps_.size(), stream); + counter_allocator.deallocate(d_num_successes, 1, stream); +} + +template +template +void dynamic_map::find( + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const +{ + find_async(first, last, output_begin, stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get())); +} + +template +template +void dynamic_map::find_async( + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + if (submaps_.size() == 1) { + submaps_.front()->find_async(first, last, output_begin, stream); + return; + } + + using ref_type = decltype(submaps_.front()->ref(cuco::op::find)); + + using ref_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto ref_allocator = ref_allocator_type{alloc_}; + auto* d_refs = ref_allocator.allocate(submaps_.size(), stream); + + std::vector h_refs; + h_refs.reserve(submaps_.size()); + for (auto const& submap : submaps_) { + h_refs.push_back(submap->ref(cuco::op::find)); + } + CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs, + h_refs.data(), + sizeof(ref_type) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + + auto constexpr cg_size = ProbingScheme::cg_size; + auto constexpr block_size = cuco::detail::default_block_size(); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + detail::dynamic_map_ns::find<<>>( + first, num_keys, output_begin, d_refs, static_cast(submaps_.size())); + + ref_allocator.deallocate(d_refs, submaps_.size(), stream); +} + template void dynamic_map::contains( InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { - auto num_keys = cuco::detail::distance(first, last); - std::size_t traversed = 0; - std::size_t submap_idx = 0; - while (num_keys > 0 && submap_idx < submaps_.size()) { - const auto& cur = submaps_[submap_idx]; - const size_t cur_size = cur->size(); - const size_t num_keys_to_process = - std::min(static_cast(cur_size), num_keys); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get())); + contains_async(first, last, output_begin, stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get())); +} - cur->contains(first, first + num_keys_to_process, output_begin + traversed, stream); +template +template +void dynamic_map:: + contains_async(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } - traversed += num_keys_to_process; - num_keys -= num_keys_to_process; - submap_idx++; - first += num_keys_to_process; + if (submaps_.size() == 1) { + submaps_.front()->contains_async(first, last, output_begin, stream); + return; } + + using ref_type = decltype(submaps_.front()->ref(cuco::op::contains)); + + using ref_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto ref_allocator = ref_allocator_type{alloc_}; + auto* d_refs = ref_allocator.allocate(submaps_.size(), stream); + + std::vector h_refs; + h_refs.reserve(submaps_.size()); + for (auto const& submap : submaps_) { + h_refs.push_back(submap->ref(cuco::op::contains)); + } + CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs, + h_refs.data(), + sizeof(ref_type) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + + auto constexpr cg_size = ProbingScheme::cg_size; + auto constexpr block_size = cuco::detail::default_block_size(); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + detail::dynamic_map_ns::contains<<>>( + first, num_keys, output_begin, d_refs, static_cast(submaps_.size())); + + ref_allocator.deallocate(d_refs, submaps_.size(), stream); +} + +template +template +std::pair +dynamic_map::retrieve_all( + KeyOut keys_out, ValueOut values_out, cuda::stream_ref stream) const +{ + if (size_ == 0) { return {keys_out, values_out}; } + + // Fast path: single submap + if (submaps_.size() == 1) { return submaps_.front()->retrieve_all(keys_out, values_out, stream); } + + // Multiple submaps: use kernel + using slot_type = typename map_type::value_type; + + // Compute capacity prefix sums and total capacity + std::vector h_capacity_prefix_sum(submaps_.size()); + detail::index_type total_capacity = 0; + for (std::size_t i = 0; i < submaps_.size(); ++i) { + total_capacity += submaps_[i]->capacity(); + h_capacity_prefix_sum[i] = total_capacity; + } + + // Collect slot pointers + std::vector h_slot_arrays(submaps_.size()); + for (std::size_t i = 0; i < submaps_.size(); ++i) { + h_slot_arrays[i] = submaps_[i]->data(); + } + + // Allocate device memory + using slot_ptr_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto slot_ptr_allocator = slot_ptr_allocator_type{alloc_}; + auto* d_slot_arrays = slot_ptr_allocator.allocate(submaps_.size(), stream); + + using index_allocator_type = + typename std::allocator_traits::template rebind_alloc; + auto index_allocator = index_allocator_type{alloc_}; + auto* d_capacity_prefix_sum = index_allocator.allocate(submaps_.size(), stream); + + using counter_allocator_type = + typename std::allocator_traits::template rebind_alloc>; + auto counter_allocator = counter_allocator_type{alloc_}; + auto* d_num_out = counter_allocator.allocate(1, stream); + + // Copy data to device + CUCO_CUDA_TRY(cudaMemcpyAsync(d_slot_arrays, + h_slot_arrays.data(), + sizeof(slot_type const*) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + CUCO_CUDA_TRY(cudaMemcpyAsync(d_capacity_prefix_sum, + h_capacity_prefix_sum.data(), + sizeof(detail::index_type) * submaps_.size(), + cudaMemcpyHostToDevice, + stream.get())); + CUCO_CUDA_TRY(cudaMemsetAsync(d_num_out, 0, sizeof(cuda::atomic), stream.get())); + + auto constexpr block_size = detail::default_block_size(); + auto const grid_size = detail::grid_size(total_capacity); + + detail::dynamic_map_ns::retrieve_all + <<>>(keys_out, + values_out, + d_slot_arrays, + static_cast(submaps_.size()), + total_capacity, + d_num_out, + d_capacity_prefix_sum, + empty_key_sentinel(), + erased_key_sentinel()); + + // Read back count + std::size_t h_num_out = 0; + CUCO_CUDA_TRY(cudaMemcpyAsync( + &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream.get())); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get())); + + // Deallocate + slot_ptr_allocator.deallocate(d_slot_arrays, submaps_.size(), stream); + index_allocator.deallocate(d_capacity_prefix_sum, submaps_.size(), stream); + counter_allocator.deallocate(d_num_out, 1, stream); + + return {keys_out + h_num_out, values_out + h_num_out}; } -} // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/dynamic_map/kernels.cuh b/include/cuco/detail/dynamic_map/kernels.cuh new file mode 100644 index 000000000..fc6253dc7 --- /dev/null +++ b/include/cuco/detail/dynamic_map/kernels.cuh @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include + +namespace cuco::detail::dynamic_map_ns { +namespace cg = cooperative_groups; + +CUCO_SUPPRESS_KERNEL_WARNINGS + +/** + * @brief Inserts key/value pairs into the map, checking all submaps for duplicates. + * + * For each key, checks all submaps except the target for existing keys. Only inserts + * if the key doesn't exist in any other submap. + * + * @tparam CGSize Cooperative group size + * @tparam BlockSize The number of threads in the thread block + * @tparam InputIt Device accessible input iterator + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of submap device ref with both contains and insert capabilities + * + * @param first Beginning of the sequence of key/value pairs + * @param n Number of keys + * @param num_successes Pointer to atomic counter for successful insertions + * @param submap_refs Array of submap refs (with contains and insert ops) + * @param insert_idx Index of the submap we're inserting into + * @param num_submaps Total number of submaps + */ +template +CUCO_KERNEL void insert(InputIt first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref* submap_refs, + uint32_t insert_idx, + uint32_t num_submaps) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + std::size_t thread_num_successes = 0; + + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename cuda::std::iterator_traits::value_type const pair{*(first + idx)}; + bool exists = false; + + if constexpr (CGSize == 1) { + for (uint32_t i = 0; i < num_submaps && !exists; ++i) { + if (i != insert_idx) { exists = submap_refs[i].contains(pair.first); } + } + + if (!exists) { + if (submap_refs[insert_idx].insert(pair)) { ++thread_num_successes; } + } + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + + for (uint32_t i = 0; i < num_submaps && !exists; ++i) { + if (i != insert_idx) { exists = submap_refs[i].contains(tile, pair.first); } + } + tile.sync(); + + if (!exists) { + if (submap_refs[insert_idx].insert(tile, pair) && tile.thread_rank() == 0) { + ++thread_num_successes; + } + } + } + idx += loop_stride; + } + + // Aggregate success count + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts or assigns key/value pairs, checking all submaps. + * + * For each key, checks all submaps. If found, assigns the new value. If not found, + * inserts into the target submap. Only counts new insertions (not assignments). + * + * @tparam CGSize Cooperative group size + * @tparam BlockSize The number of threads in the thread block + * @tparam InputIt Device accessible input iterator + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of submap device ref with contains, insert, and insert_or_assign capabilities + * + * @param first Beginning of the sequence of key/value pairs + * @param n Number of keys + * @param num_insertions Pointer to atomic counter for new insertions (not assignments) + * @param submap_refs Array of submap refs + * @param insert_idx Index of the submap to insert into if key not found + * @param num_submaps Total number of submaps + */ +template +CUCO_KERNEL void insert_or_assign(InputIt first, + cuco::detail::index_type n, + AtomicT* num_insertions, + Ref* submap_refs, + uint32_t insert_idx, + uint32_t num_submaps) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + std::size_t thread_num_insertions = 0; + + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename cuda::std::iterator_traits::value_type const pair{*(first + idx)}; + bool found = false; + + if constexpr (CGSize == 1) { + for (uint32_t i = 0; i < num_submaps && !found; ++i) { + if (submap_refs[i].contains(pair.first)) { + submap_refs[i].insert_or_assign(pair); + found = true; + } + } + + if (!found) { + if (submap_refs[insert_idx].insert(pair)) { ++thread_num_insertions; } + } + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + + for (uint32_t i = 0; i < num_submaps && !found; ++i) { + if (submap_refs[i].contains(tile, pair.first)) { + submap_refs[i].insert_or_assign(tile, pair); + found = true; + } + } + tile.sync(); + + if (!found) { + if (submap_refs[insert_idx].insert(tile, pair) && tile.thread_rank() == 0) { + ++thread_num_insertions; + } + } + } + idx += loop_stride; + } + + // Aggregate insertion count + std::size_t const block_num_insertions = BlockReduce(temp_storage).Sum(thread_num_insertions); + if (threadIdx.x == 0) { + num_insertions->fetch_add(block_num_insertions, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Erases keys from all submaps. + * + * For each key, attempts to erase from all submaps. Tracks total erased count. + * + * @tparam CGSize Cooperative group size + * @tparam BlockSize The number of threads in the thread block + * @tparam InputIt Device accessible input iterator + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of submap device ref with erase capability + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param num_successes Pointer to atomic counter for successful erasures + * @param submap_refs Array of submap refs for erase operations + * @param num_submaps Total number of submaps + */ +template +CUCO_KERNEL void erase(InputIt first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref* submap_refs, + uint32_t num_submaps) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + std::size_t thread_num_successes = 0; + + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename cuda::std::iterator_traits::value_type const key{*(first + idx)}; + + if constexpr (CGSize == 1) { + for (uint32_t i = 0; i < num_submaps; ++i) { + if (submap_refs[i].erase(key)) { + ++thread_num_successes; + break; + } + } + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + + for (uint32_t i = 0; i < num_submaps; ++i) { + if (submap_refs[i].erase(tile, key)) { + if (tile.thread_rank() == 0) { ++thread_num_successes; } + break; + } + } + } + idx += loop_stride; + } + + // Aggregate success count + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Finds the values corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in any submap, copies its associated value to `(output_begin + + * i)`. Else, copies the empty value sentinel. + * + * @tparam CGSize Cooperative group size + * @tparam BlockSize The number of threads in the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator + * @tparam Ref Type of submap device ref + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param submap_refs Array of submap device refs + * @param num_submaps The number of submaps in the map + */ +template +CUCO_KERNEL void find(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref const* submap_refs, + uint32_t num_submaps) +{ + using mapped_type = typename Ref::mapped_type; + + auto const empty_value_sentinel = submap_refs[0].empty_value_sentinel(); + + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ mapped_type write_buffer[BlockSize]; + + while (idx < n) { + typename cuda::std::iterator_traits::value_type const key{*(first + idx)}; + auto found_value = empty_value_sentinel; + bool found = false; + + if constexpr (CGSize == 1) { + for (uint32_t i = 0; i < num_submaps && !found; ++i) { + auto const result = submap_refs[i].find(key); + if (result != submap_refs[i].end()) { + found_value = result->second; + found = true; + } + } + write_buffer[threadIdx.x] = found_value; + __syncthreads(); + *(output_begin + idx) = write_buffer[threadIdx.x]; + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + + for (uint32_t i = 0; i < num_submaps && !found; ++i) { + auto const result = submap_refs[i].find(tile, key); + if (result != submap_refs[i].end()) { + found_value = result->second; + found = true; + } + } + + if (tile.thread_rank() == 0) { write_buffer[threadIdx.x / CGSize] = found_value; } + __syncthreads(); + if (tile.thread_rank() == 0) { *(output_begin + idx) = write_buffer[threadIdx.x / CGSize]; } + } + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in any submap. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * + * @tparam CGSize Cooperative group size + * @tparam BlockSize The number of threads in the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator + * @tparam Ref Type of submap device ref + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param submap_refs Array of submap device refs + * @param num_submaps The number of submaps in the map + */ +/** + * @brief Retrieves all key-value pairs from all submaps. + * + * Iterates through all slots across all submaps, outputting non-empty/non-erased pairs. + * + * @tparam BlockSize The number of threads in the thread block + * @tparam Key Key type + * @tparam Value Mapped value type + * @tparam KeyOut Device accessible output iterator for keys + * @tparam ValueOut Device accessible output iterator for values + * @tparam SlotT Slot type (cuco::pair) + * @tparam AtomicT Atomic counter type + * + * @param keys_out Beginning output iterator for keys + * @param values_out Beginning output iterator for values + * @param slot_arrays Array of pointers to each submap's slot storage + * @param num_submaps Number of submaps + * @param total_capacity Total number of slots across all submaps + * @param num_out Pointer to atomic counter for number of retrieved pairs + * @param capacity_prefix_sum Prefix sum of submap capacities + * @param empty_key_sentinel Sentinel value for empty key + * @param erased_key_sentinel Sentinel value for erased key + */ +template +CUCO_KERNEL void retrieve_all(KeyOut keys_out, + ValueOut values_out, + SlotT const* const* slot_arrays, + uint32_t num_submaps, + cuco::detail::index_type total_capacity, + AtomicT* num_out, + cuco::detail::index_type const* capacity_prefix_sum, + Key empty_key_sentinel, + Key erased_key_sentinel) +{ + using BlockScan = cub::BlockScan; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + __shared__ unsigned int block_base; + + auto idx = cuco::detail::global_thread_id(); + + while ((idx - threadIdx.x) < total_capacity) { + // Determine which submap this slot belongs to + uint32_t submap_idx = 0; + auto submap_offset = idx; + + if (idx < total_capacity) { + while (submap_idx < num_submaps && idx >= capacity_prefix_sum[submap_idx]) { + ++submap_idx; + } + if (submap_idx > 0) { submap_offset = idx - capacity_prefix_sum[submap_idx - 1]; } + } + + // Check if slot is filled (not empty and not erased) + bool is_filled = false; + Key key{}; + Value value{}; + + if (idx < total_capacity && submap_idx < num_submaps) { + auto const& slot = slot_arrays[submap_idx][submap_offset]; + // Use atomic_ref for thread-safe read + cuda::atomic_ref key_ref(slot.first); + key = key_ref.load(cuda::std::memory_order_relaxed); + is_filled = !cuco::detail::bitwise_compare(key, empty_key_sentinel) && + !cuco::detail::bitwise_compare(key, erased_key_sentinel); + if (is_filled) { + cuda::atomic_ref value_ref(slot.second); + value = value_ref.load(cuda::std::memory_order_relaxed); + } + } + + // Block scan to compute output positions + unsigned int local_idx = 0; + unsigned int block_valid = 0; + BlockScan(scan_temp_storage).ExclusiveSum(is_filled ? 1u : 0u, local_idx, block_valid); + + if (threadIdx.x == 0) { + block_base = num_out->fetch_add(block_valid, cuda::std::memory_order_relaxed); + } + __syncthreads(); + + if (is_filled) { + keys_out[block_base + local_idx] = key; + values_out[block_base + local_idx] = value; + } + + idx += cuco::detail::grid_stride(); + } +} + +template +CUCO_KERNEL void contains(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref const* submap_refs, + uint32_t num_submaps) +{ + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ bool write_buffer[BlockSize]; + + while (idx < n) { + typename cuda::std::iterator_traits::value_type const key{*(first + idx)}; + bool found = false; + + if constexpr (CGSize == 1) { + for (uint32_t i = 0; i < num_submaps && !found; ++i) { + found = submap_refs[i].contains(key); + } + write_buffer[threadIdx.x] = found; + __syncthreads(); + *(output_begin + idx) = write_buffer[threadIdx.x]; + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + + for (uint32_t i = 0; i < num_submaps && !found; ++i) { + found = submap_refs[i].contains(tile, key); + } + + if (tile.thread_rank() == 0) { write_buffer[threadIdx.x / CGSize] = found; } + __syncthreads(); + if (tile.thread_rank() == 0) { *(output_begin + idx) = write_buffer[threadIdx.x / CGSize]; } + } + idx += loop_stride; + } +} + +} // namespace cuco::detail::dynamic_map_ns diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh deleted file mode 100644 index 6d3f9b9c7..000000000 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ /dev/null @@ -1,709 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include -#include - -#include - -namespace cuco { -namespace detail { -namespace cg = cooperative_groups; - -CUCO_SUPPRESS_KERNEL_WARNINGS - -/** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. - * - * @tparam block_size - * @tparam pair_type Type of the pairs contained in the map - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam viewT Type of the `static_map` device views - * @tparam mutableViewT Type of the `static_map` device mutable views - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param submap_views Array of `static_map::device_view` objects used to - * perform `contains` operations on each underlying `static_map` - * @param submap_mutable_views Array of `static_map::device_mutable_view` objects - * used to perform an `insert` into the target `static_map` submap - * @param num_successes The number of successfully inserted key/value pairs - * @param insert_idx The index of the submap we are inserting into - * @param num_submaps The total number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function used to compare two keys for equality - */ -template -CUCO_KERNEL void insert(InputIt first, - InputIt last, - viewT* submap_views, - mutableViewT* submap_mutable_views, - atomicT* num_successes, - uint32_t insert_idx, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - - while (first + tid < last) { - pair_type insert_pair = *(first + tid); - auto exists = false; - - // manually check for duplicates in those submaps we are not inserting into - for (auto i = 0; i < num_submaps; ++i) { - if (i != insert_idx) { - exists = submap_views[i].contains(insert_pair.first, hash, key_equal); - if (exists) { break; } - } - } - if (!exists) { - if (submap_mutable_views[insert_idx].insert(insert_pair, hash, key_equal)) { - thread_num_successes++; - } - } - - tid += gridDim.x * blockDim.x; - } - - std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups - * of multiple threads to perform each key/value insertion. This provides a - * significant boost in throughput compared to the non Cooperative Group - * `insert` at moderate to high load factors. - * - * @tparam block_size - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam pair_type Type of the pairs contained in the map - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam viewT Type of the `static_map` device views - * @tparam mutableViewT Type of the `static_map` device mutable views - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param submap_views Array of `static_map::device_view` objects used to - * perform `contains` operations on each underlying `static_map` - * @param submap_mutable_views Array of `static_map::device_mutable_view` objects - * used to perform an `insert` into the target `static_map` submap - * @param submap_num_successes The number of successfully inserted key/value pairs for each submap - * @param insert_idx The index of the submap we are inserting into - * @param num_submaps The total number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function used to compare two keys for equality - */ -template -CUCO_KERNEL void insert(InputIt first, - InputIt last, - viewT* submap_views, - mutableViewT* submap_mutable_views, - atomicT** submap_num_successes, - uint32_t insert_idx, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_successes = 0; - - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; - - while (it < last) { - pair_type insert_pair = *it; - auto exists = false; - - // manually check for duplicates in those submaps we are not inserting into - for (auto i = 0; i < num_submaps; ++i) { - if (i != insert_idx) { - exists = submap_views[i].contains(tile, insert_pair.first, hash, key_equal); - if (exists) { break; } - } - } - if (!exists) { - if (submap_mutable_views[insert_idx].insert(tile, insert_pair, hash, key_equal) && - tile.thread_rank() == 0) { - thread_num_successes++; - } - } - - it += (gridDim.x * blockDim.x) / tile_size; - } - - std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - submap_num_successes[insert_idx]->fetch_add(block_num_successes, - cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, its slot is erased and made available for future - insertions. - * Else, no effect. - * - * @tparam block_size The size of the thread block - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam mutableViewT Type of device view allowing modification of hash map storage - * @tparam atomicT Type of atomic storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to - * perform `erase` operations on each underlying `static_map` - * @param num_successes The number of successfully erased key/value pairs - * @param submap_num_successes The number of successfully erased key/value pairs - * in each submap - * @param num_submaps The number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void erase(InputIt first, - InputIt last, - mutableViewT* submap_mutable_views, - atomicT** submap_num_successes, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - extern __shared__ unsigned long long submap_block_num_successes[]; - - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; - - for (auto i = threadIdx.x; i < num_submaps; i += block_size) { - submap_block_num_successes[i] = 0; - } - __syncthreads(); - - while (it < last) { - for (auto i = 0; i < num_submaps; ++i) { - if (submap_mutable_views[i].erase(*it, hash, key_equal)) { - atomicAdd(&submap_block_num_successes[i], 1); - break; - } - } - it += gridDim.x * blockDim.x; - } - __syncthreads(); - - for (auto i = 0; i < num_submaps; ++i) { - if (threadIdx.x == 0) { - submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), - cuda::std::memory_order_relaxed); - } - } -} - -/** - * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, its slot is erased and made available for future - * insertions. - * Else, no effect. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam mutableViewT Type of device view allowing modification of hash map storage - * @tparam atomicT Type of atomic storage - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to - * perform `erase` operations on each underlying `static_map` - * @param num_successes The number of successfully erased key/value pairs - * @param submap_num_successes The number of successfully erased key/value pairs - * in each submap - * @param num_submaps The number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void erase(InputIt first, - InputIt last, - mutableViewT* submap_mutable_views, - atomicT** submap_num_successes, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - extern __shared__ unsigned long long submap_block_num_successes[]; - - auto block = cg::this_thread_block(); - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * block.group_index().x + block.thread_rank(); - auto it = first + tid / tile_size; - - for (auto i = threadIdx.x; i < num_submaps; i += block_size) { - submap_block_num_successes[i] = 0; - } - block.sync(); - - while (it < last) { - auto erased = false; - int i = 0; - for (i = 0; i < num_submaps; ++i) { - erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); - if (erased) { break; } - } - if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); } - it += (gridDim.x * blockDim.x) / tile_size; - } - block.sync(); - - for (auto i = 0; i < num_submaps; ++i) { - if (threadIdx.x == 0) { - submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), - cuda::std::memory_order_relaxed); - } - } -} - -/** - * @brief Finds the values corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. - * Else, copies the empty value sentinel. - * - * @tparam block_size The number of threads in the thread block - * @tparam Value The mapped value type for the map - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of `static_map` device view - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of values retrieved for each key - * @param submap_views Array of `static_map::device_view` objects used to - * perform `find` operations on each underlying `static_map` - * @param num_submaps The number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void find(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); - __shared__ Value writeBuffer[block_size]; - - while (first + tid < last) { - auto key = *(first + tid); - auto found_value = empty_value_sentinel; - for (auto i = 0; i < num_submaps; ++i) { - auto submap_view = submap_views[i]; - auto found = submap_view.find(key, hash, key_equal); - if (found != submap_view.end()) { - found_value = found->second.load(cuda::std::memory_order_relaxed); - break; - } - } - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - writeBuffer[threadIdx.x] = found_value; - __syncthreads(); - *(output_begin + tid) = writeBuffer[threadIdx.x]; - tid += gridDim.x * blockDim.x; - } -} - -/** - * @brief Finds the values corresponding to all keys in the range `[first, last)`. - * - * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. - * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups - * of multiple threads to find each key. This provides a significant boost in throughput compared - * to the non Cooperative Group `find` at moderate to high load factors. - * - * @tparam block_size The number of threads in the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to - * perform find operations - * @tparam Value The mapped value type for the map - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of `static_map` device view - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of values retrieved for each key - * @param submap_views Array of `static_map::device_view` objects used to - * perform `find` operations on each underlying `static_map` - * @param num_submaps The number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void find(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); - __shared__ Value writeBuffer[block_size]; - - while (first + key_idx < last) { - auto key = *(first + key_idx); - auto found_value = empty_value_sentinel; - for (auto i = 0; i < num_submaps; ++i) { - auto submap_view = submap_views[i]; - auto found = submap_view.find(tile, key, hash, key_equal); - if (found != submap_view.end()) { - found_value = found->second.load(cuda::std::memory_order_relaxed); - break; - } - } - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found_value; } - __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * blockDim.x) / tile_size; - } -} - -/** - * @brief Retrieves all of the keys and their associated values. - * - * The order in which keys are returned is implementation defined and not guaranteed to be - * consistent between subsequent calls to `retrieve_all`. - * - * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than - * `get_size()` - * - * @tparam block_size The number of threads in the thread block - * @tparam KeyOutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam ValueOutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of `static_map` device view - * @tparam AtomicT Atomic counter type - * - * @param keys_out Beginning output iterator for keys - * @param values_out Beginning output iterator for values - * @param submap_views Array of `static_map::device_view` objects used to - * perform `retrieve_all` operations on each underlying `static_map` - * @param num_submaps The number of submaps in the map - * @param capacity The total number of slots of all submaps - * @param d_num_out Pointer to the device memory location where the number of keys/vals retrieved - * are stored - * @param cap_prefix_sum Array of prefix sums of the number of slots in each submap - * @return Pair of iterators indicating the last elements in the output - */ -template -CUCO_KERNEL void retrieve_all(KeyOutputIt keys_out, - ValueOutputIt values_out, - viewT* submap_views, - uint32_t num_submaps, - uint64_t capacity, - AtomicT* d_num_out, - size_t* cap_prefix_sum) -{ - using BlockScan = cub::BlockScan; - - __shared__ typename BlockScan::TempStorage scan_temp_storage; - __shared__ unsigned int block_base; - - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto const empty_key_sentinel = submap_views[0].get_empty_key_sentinel(); - auto const erased_key_sentinel = submap_views[0].get_erased_key_sentinel(); - - while ((tid - threadIdx.x) < capacity) { - uint32_t submap_idx = 0; - uint32_t submap_offset = tid; - while (tid >= cap_prefix_sum[submap_idx] && submap_idx < num_submaps) { - ++submap_idx; - } - if (submap_idx > 0) { submap_offset = tid - cap_prefix_sum[submap_idx - 1]; } - - auto const& current_slot = submap_views[submap_idx].get_slots()[submap_offset]; - auto const existing_key = current_slot.first.load(cuda::std::memory_order_relaxed); - - bool const is_filled = not(cuco::detail::bitwise_compare(existing_key, empty_key_sentinel) or - cuco::detail::bitwise_compare(existing_key, erased_key_sentinel)); - - unsigned int local_idx = 0; - unsigned int block_valid = 0; - BlockScan(scan_temp_storage).ExclusiveSum(is_filled ? 1u : 0u, local_idx, block_valid); - - if (threadIdx.x == 0) { - block_base = d_num_out->fetch_add(block_valid, cuda::memory_order_relaxed); - } - __syncthreads(); - - if (is_filled) { - auto const value = current_slot.second.load(cuda::std::memory_order_relaxed); - keys_out[block_base + local_idx] = existing_key; - values_out[block_base + local_idx] = value; - } - tid += gridDim.x * blockDim.x; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. - * - * @tparam block_size The number of threads in the thread block - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of `static_map` device view - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param submap_views Array of `static_map::device_view` objects used to - * perform `contains` operations on each underlying `static_map` - * @param num_submaps The number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void contains(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - __shared__ bool writeBuffer[block_size]; - - while (first + tid < last) { - auto key = *(first + tid); - auto found = false; - for (auto i = 0; i < num_submaps; ++i) { - found = submap_views[i].contains(key, hash, key_equal); - if (found) { break; } - } - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - writeBuffer[threadIdx.x] = found; - __syncthreads(); - *(output_begin + tid) = writeBuffer[threadIdx.x]; - tid += gridDim.x * blockDim.x; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. - * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the - * contains operation for each key. This provides a significant boost in throughput compared - * to the non Cooperative Group `contains` at moderate to high load factors. - * - * @tparam block_size The number of threads in the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to - * perform find operations - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam viewT Type of `static_map` device view - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param submap_views Array of `static_map::device_view` objects used to - * perform `contains` operations on each underlying `static_map` - * @param num_submaps The number of submaps in the map - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void contains(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; - - while (first + key_idx < last) { - auto key = *(first + key_idx); - auto found = false; - for (auto i = 0; i < num_submaps; ++i) { - found = submap_views[i].contains(tile, key, hash, key_equal); - if (found) { break; } - } - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } - __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * blockDim.x) / tile_size; - } -} -} // namespace detail -} // namespace cuco diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 9eb1fa7e4..5b7839cbd 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * Copyright (c) 2020-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,24 +16,21 @@ #pragma once -#include #include #include #include -#include #include -#include #include #include #include #include +#include #include namespace cuco { -namespace experimental { /** * @brief A GPU-accelerated, unordered, associative container of key-value * pairs with unique keys. @@ -68,6 +65,7 @@ class dynamic_map { using value_type = typename map_type::value_type; ///< Key-value pair type using size_type = typename map_type::size_type; ///< Size type using key_equal = typename map_type::key_equal; ///< Key equality comparator type + using hasher = typename map_type::hasher; ///< Hash function type using mapped_type = T; ///< Payload type dynamic_map(dynamic_map const&) = delete; @@ -84,7 +82,7 @@ class dynamic_map { ~dynamic_map() = default; /** - * @brief Constructs a dynamically-sized map with erase capability. + * @brief Constructs a dynamically-sized map. * * The capacity of the map will automatically increase as the user adds key/value pairs using * `insert`. @@ -113,178 +111,6 @@ class dynamic_map { Allocator const& alloc = {}, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); - /** - * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs. - * - * If there is already enough space for `n` key/value pairs, the capacity remains the same. - * - * @param n The number of key value pairs for which there must be space - * @param stream Stream used for executing the kernels - */ - void reserve(size_type n, cuda::stream_ref stream); - - /** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * If multiple keys in `[first, last)` compare equal, it is unspecified which - * element is inserted. - * - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stream Stream used for executing the kernels - */ - template - void insert(InputIt first, - InputIt last, - cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); - - /** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. - * - * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param stream Stream used for executing the kernels - */ - template - void contains(InputIt first, - InputIt last, - OutputIt output_begin, - cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const; - - private: - size_type size_{}; ///< Number of keys in the map - size_type capacity_{}; ///< Maximum number of keys that can be inserted - - std::vector> submaps_; ///< vector of pointers to each submap - size_type min_insert_size_{}; ///< min remaining capacity of submap for insert - float max_load_factor_{}; ///< Maximum load factor - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage -}; - -} // namespace experimental - -/** - * @brief A GPU-accelerated, unordered, associative container of key-value - * pairs with unique keys - * - * Automatically grows capacity as necessary until device memory runs out. - * - * Allows constant time concurrent inserts or concurrent find operations (not - * concurrent insert and find) from threads in device code. - * - * Current limitations: - * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true - * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. - * - Capacity does not shrink automatically - * - Requires the user to specify sentinel values for both key and mapped value - * to indicate empty slots - * - Does not support concurrent insert and find operations - * - * The `dynamic_map` supports host-side "bulk" operations which include `insert`, `find` - * and `contains`. These are to be used when there are a large number of keys to insert - * or lookup in the map. For example, given a range of keys specified by device-accessible - * iterators, the bulk `insert` function will insert all keys into the map. - * - * Example: - * \code{.cpp} - * int empty_key_sentinel = -1; - * int empty_value_sentinel = -1; - * - * // Constructs a map with 100,000 initial slots using -1 and -1 as the empty key/value - * // sentinels. Performs one bulk insert of 50,000 keys and a second bulk insert of - * // 100,000 keys. The map automatically increases capacity to accomodate the excess keys - * // within the second insert. - * - * dynamic_map m{100'000, - * empty_key{empty_key_sentinel}, - * empty_value{empty_value_sentinel}}; - * - * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}} - * thrust::device_vector> pairs_0(50'000); - * thrust::transform(thrust::make_counting_iterator(0), - * thrust::make_counting_iterator(pairs_0.size()), - * pairs_0.begin(), - * []__device__(auto i){ return cuco::pair{i,i}; }; - * - * thrust::device_vector> pairs_1(100'000); - * thrust::transform(thrust::make_counting_iterator(50'000), - * thrust::make_counting_iterator(pairs_1.size()), - * pairs_1.begin(), - * []__device__(auto i){ return cuco::pair{i,i}; }; - * - * // Inserts all pairs into the map - * m.insert(pairs_0.begin(), pairs_0.end()); - * m.insert(pairs_1.begin(), pairs_1.end()); - * \endcode - * - * @tparam Key Arithmetic type used for key - * @tparam Value Type of the mapped values - * @tparam Scope The scope in which insert/find/contains will be performed by - * individual threads. - * @tparam Allocator Type of allocator used to allocate submap device storage - */ -template > -class dynamic_map { - static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); - - public: - using value_type = cuco::pair; ///< Type of key/value pairs - using key_type = Key; ///< Key type - using mapped_type = Value; ///< Type of mapped values - using atomic_ctr_type = cuda::atomic; ///< Atomic counter type - using view_type = - typename cuco::legacy::static_map::device_view; ///< Type for submap device - ///< view - using mutable_view_type = - typename cuco::legacy::static_map::device_mutable_view; ///< Type for submap - ///< mutable device - ///< view - - dynamic_map(dynamic_map const&) = delete; - dynamic_map(dynamic_map&&) = delete; - - dynamic_map& operator=(dynamic_map const&) = delete; - dynamic_map& operator=(dynamic_map&&) = delete; - - /** - * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor - * and sentinel values. - * - * The capacity of the map will automatically increase as the user adds key/value pairs using - * `insert`. - * - * Capacity increases by a factor of growth_factor each time the size of the map exceeds a - * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the - * map's capacity grows. - * - * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and - * undefined behavior results from attempting to insert any key/value pair - * that contains either. - * - * @param initial_capacity The initial number of slots in the map - * @param empty_key_sentinel The reserved key value for empty slots - * @param empty_value_sentinel The reserved mapped value for empty slots - * @param alloc Allocator used to allocate submap device storage - * @param stream Stream used for executing the kernels - */ - dynamic_map(std::size_t initial_capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - Allocator const& alloc = Allocator{}, - cudaStream_t stream = nullptr); - /** * @brief Constructs a dynamically-sized map with erase capability. * @@ -292,7 +118,7 @@ class dynamic_map { * `insert`. * * Capacity increases by a factor of growth_factor each time the size of the map exceeds a - * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the + * threshold occupancy. The performance of `find` and `contains` gradually decreases each time the * map's capacity grows. * * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and @@ -303,24 +129,26 @@ class dynamic_map { * @param empty_key_sentinel The reserved key value for empty slots * @param empty_value_sentinel The reserved mapped value for empty slots * @param erased_key_sentinel The reserved key value for erased slots - * @param alloc Allocator used to allocate submap device storage - * @param stream Stream used for executing the kernels + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage Kind of storage to use + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map * * @throw std::runtime error if the empty key sentinel and erased key sentinel * are the same value */ - dynamic_map(std::size_t initial_capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - erased_key erased_key_sentinel, - Allocator const& alloc = Allocator{}, - cudaStream_t stream = nullptr); - - /** - * @brief Destroys the map and frees its contents - * - */ - ~dynamic_map() {} + constexpr dynamic_map(Extent initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + cuda_thread_scope scope = {}, + Storage storage = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs. @@ -330,75 +158,111 @@ class dynamic_map { * @param n The number of key value pairs for which there must be space * @param stream Stream used for executing the kernels */ - void reserve(std::size_t n, cudaStream_t stream = nullptr); + void reserve(size_type n, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** * @brief Inserts all key/value pairs in the range `[first, last)`. * + * @note This function synchronizes the given stream. + * * If multiple keys in `[first, last)` compare equal, it is unspecified which * element is inserted. * * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality * @param stream Stream used for executing the kernels */ - template , - typename KeyEqual = cuda::std::equal_to> + template void insert(InputIt first, InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = nullptr); + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` + * already exists in the map, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note This function synchronizes the given stream. + * @note If multiple pairs in `[first, last)` compare equal, it is unspecified which pair is + * inserted or assigned. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * dynamic_map::value_type> is `true` + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stream CUDA stream used for the operation + */ + template + void insert_or_assign(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** * @brief Erases keys in the range `[first, last)`. * - * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's - * associated value from the map. Else, no effect. + * @note This function synchronizes the given stream. For asynchronous execution use + * `erase_async`. * - * Side-effects: - * - `contains(k) == false` - * - `find(k) == end()` - * - `insert({k,v}) == true` - * - `get_size()` is reduced by the total number of erased keys + * For each key `k` in `[first, last)`, if `contains(k) == true`, removes `k` and its + * associated value from the map. Else, no effect. * - * This function synchronizes `stream`. + * Side-effects: + * - `contains(k) == false` + * - `find(k) == end()` + * - `insert({k,v}) == true` + * - `size()` is reduced by the total number of erased keys * * Keep in mind that `erase` does not cause the map to shrink its memory allocation. * * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `value_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type + * convertible to the map's `key_type` * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality * @param stream Stream used for executing the kernels * * @throw std::runtime_error if a unique erased key sentinel value was not * provided at construction */ - template , - typename KeyEqual = cuda::std::equal_to> + template void erase(InputIt first, InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = nullptr); + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + + /** + * @brief Asynchronously erases keys in the range `[first, last)`. + * + * For each key `k` in `[first, last)`, if `contains(k) == true`, removes `k` and its + * associated value from the map. Else, no effect. + * + * @note `size()` will not be updated. Use the synchronous `erase` if you need accurate size + * tracking. + * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream Stream used for executing the kernels + * + * @throw std::runtime_error if a unique erased key sentinel value was not + * provided at construction + */ + template + void erase_async(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. * + * @note This function synchronizes the given stream. For asynchronous execution use + * `find_async`. + * * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + * i)`. Else, copies the empty value sentinel. * @@ -406,122 +270,191 @@ class dynamic_map { * convertible to the map's `key_type` * @tparam OutputIt Device accessible output iterator whose `value_type` is * convertible to the map's `mapped_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality * @param stream Stream used for executing the kernels */ - template , - typename KeyEqual = cuda::std::equal_to> + template void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = nullptr); + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const; /** - * @brief Retrieves all of the keys and their associated values. + * @brief Asynchronously finds the values corresponding to all keys in the range `[first, last)`. * - * The order in which keys are returned is implementation defined and not guaranteed to be - * consistent between subsequent calls to `retrieve_all`. + * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + + * i)`. Else, copies the empty value sentinel. * - * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than - * `get_size()` + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam OutputIt Device accessible output iterator whose `value_type` is + * convertible to the map's `mapped_type` * - * @tparam KeyOut Device accessible random access output iterator whose `value_type` is - * convertible from `key_type`. - * @tparam ValueOut Device accessible random access output iterator whose `value_type` is - * convertible from `mapped_type`. - * @param keys_out Beginning output iterator for keys - * @param values_out Beginning output iterator for values - * @param stream CUDA stream used for this operation - * @return Pair of iterators indicating the last elements in the output + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of values retrieved for each key + * @param stream Stream used for executing the kernels */ - template - std::pair retrieve_all(KeyOut keys_out, - ValueOut values_out, - cudaStream_t stream = 0) const; + template + void find_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. + * * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. * - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * convertible to the map's `mapped_type` - * @tparam Hash Unary callable type - * @tparam KeyEqual Binary callable type + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param hash The unary function to apply to hash each key - * @param key_equal The binary function to compare two keys for equality * @param stream Stream used for executing the kernels */ - template , - typename KeyEqual = cuda::std::equal_to> + template void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = nullptr); + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the map. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const; + + /** + * @brief Retrieves all of the keys and their associated values. + * + * @note This function synchronizes the given stream. + * + * The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * + * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than + * `size()` + * + * @tparam KeyOut Device accessible random access output iterator whose `value_type` is + * convertible from `key_type`. + * @tparam ValueOut Device accessible random access output iterator whose `value_type` is + * convertible from `mapped_type`. + * @param keys_out Beginning output iterator for keys + * @param values_out Beginning output iterator for values + * @param stream CUDA stream used for this operation + * @return Pair of iterators indicating the last elements in the output + */ + template + std::pair retrieve_all(KeyOut keys_out, + ValueOut values_out, + cuda::stream_ref stream = cuda::stream_ref{ + cudaStream_t{nullptr}}) const; /** * @brief Gets the current number of elements in the map * * @return The current number of elements in the map */ - std::size_t get_size() const noexcept { return size_; } + [[nodiscard]] size_type size() const noexcept { return size_; } /** * @brief Gets the maximum number of elements the hash map can hold. * * @return The maximum number of elements the hash map can hold */ - std::size_t get_capacity() const noexcept { return capacity_; } + [[nodiscard]] size_type capacity() const noexcept { return capacity_; } /** * @brief Gets the load factor of the hash map. * * @return The load factor of the hash map */ - float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } + /** + * @brief Gets the current load factor of the map + * + * @return The current load factor of the map + */ + [[nodiscard]] float load_factor() const noexcept { return static_cast(size_) / capacity_; } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept + { + return submaps_.front()->empty_key_sentinel(); + } + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + [[nodiscard]] constexpr mapped_type empty_value_sentinel() const noexcept + { + return submaps_.front()->empty_value_sentinel(); + } + + /** + * @brief Gets the sentinel value used to represent an erased key slot. + * + * @return The sentinel value used to represent an erased key slot + */ + [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept + { + return submaps_.front()->erased_key_sentinel(); + } + + /** + * @brief Gets the function used to compare keys for equality + * + * @return The function used to compare keys for equality + */ + [[nodiscard]] constexpr key_equal key_eq() const noexcept { return submaps_.front()->key_eq(); } + + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept + { + return submaps_.front()->hash_function(); + } private: - key_type empty_key_sentinel_{}; ///< Key value that represents an empty slot - mapped_type empty_value_sentinel_{}; ///< Initial value of empty slot - key_type erased_key_sentinel_{}; ///< Key value that represents an erased slot - - // TODO: initialize this - std::size_t size_{}; ///< Number of keys in the map - std::size_t capacity_{}; ///< Maximum number of keys that can be inserted - float max_load_factor_{}; ///< Max load factor before capacity growth - - std::vector>> - submaps_; ///< vector of pointers to each submap - thrust::device_vector submap_views_; ///< vector of device views for each submap - thrust::device_vector - submap_mutable_views_; ///< vector of mutable device views for each submap - std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert - thrust::device_vector - submap_num_successes_; ///< Number of successfully erased keys for each submap - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + size_type size_; ///< Number of keys in the map + size_type capacity_; ///< Capacity for next submap (also returned by capacity()) + + std::vector> submaps_; ///< vector of pointers to each submap + size_type min_insert_size_; ///< min remaining capacity of submap for insert + float max_load_factor_; ///< Maximum load factor + Allocator alloc_; ///< Allocator passed to submaps to allocate their device storage }; + } // namespace cuco -#include #include diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 50b8771d4..dde5b249e 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * Copyright (c) 2020-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1249,7 +1249,6 @@ class static_map { mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload }; -namespace experimental { template class dynamic_map; -} - -template -class dynamic_map; namespace legacy { @@ -1356,8 +1351,6 @@ class static_map { "declared as safe for bitwise comparison via specialization of " "cuco::is_bitwise_comparable_v."); - friend class dynamic_map; ///< Dynamic map as friend class - public: using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b5d42770d..a4787f432 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2025, NVIDIA CORPORATION. +# Copyright (c) 2018-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -99,10 +99,10 @@ ConfigureTest(STATIC_MAP_TEST # - dynamic_map tests ----------------------------------------------------------------------------- ConfigureTest(DYNAMIC_MAP_TEST dynamic_map/unique_sequence_test.cu - dynamic_map/unique_sequence_test_experimental.cu dynamic_map/erase_test.cu dynamic_map/find_test.cu - dynamic_map/retrieve_all_test.cu) + dynamic_map/retrieve_all_test.cu + dynamic_map/multiplicity_test.cu) ################################################################################################### # - static_multiset tests ------------------------------------------------------------------------- diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index b723093ff..1ffc9673c 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,9 +18,13 @@ #include +#include #include +#include #include #include +#include +#include #include #include @@ -48,33 +52,33 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - auto pairs_begin = - thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) { + return cuco::pair{keys[i], values[i]}; + })); map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.get_size() == num_keys); + REQUIRE(map.size() == num_keys); map.erase(d_keys.begin(), d_keys.end()); - // delete decreases count correctly - REQUIRE(map.get_size() == 0); + REQUIRE(map.size() == 0); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - // keys were actaully deleted REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{})); - // ensures that map is reusing deleted slots map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.get_size() == num_keys); + REQUIRE(map.size() == num_keys); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{})); - // erase can act selectively map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); @@ -84,7 +88,6 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests", REQUIRE(cuco::test::all_of( d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), cuda::std::identity{})); - // clear map map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); } @@ -99,14 +102,17 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - auto pairs_begin = - thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) { + return cuco::pair{keys[i], values[i]}; + })); map.insert(pairs_begin, pairs_begin + num); - // map should resize twice if the erased slots are successfully reused - REQUIRE(map.get_capacity() == 2 * num); - // check that keys can be successfully deleted from only the first and second submaps + REQUIRE(map.capacity() == 2 * num); + map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); @@ -116,15 +122,14 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests", REQUIRE(cuco::test::all_of( d_keys_exist.begin() + 2 * num_keys, d_keys_exist.end(), cuda::std::identity{})); - REQUIRE(map.get_size() == 2 * num_keys); - // check that keys can be successfully deleted from all submaps (some will be unsuccessful - // erases) + REQUIRE(map.size() == 2 * num_keys); + map.erase(d_keys.begin(), d_keys.end()); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{})); - REQUIRE(map.get_size() == 0); + REQUIRE(map.size() == 0); } } diff --git a/tests/dynamic_map/find_test.cu b/tests/dynamic_map/find_test.cu index b83964735..ebd88a029 100644 --- a/tests/dynamic_map/find_test.cu +++ b/tests/dynamic_map/find_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -52,24 +54,25 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - auto pairs_begin = - thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) { + return cuco::pair{keys[i], values[i]}; + })); map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.get_size() == num_keys); + REQUIRE(map.size() == num_keys); - // Find all inserted keys map.find(d_keys.begin(), d_keys.end(), d_found_values.begin()); - // Verify that all keys were found with correct values auto zip_equal = cuda::proclaim_return_type( [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); }); auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_values.begin(), d_found_values.begin()}); REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); - // Test finding non-existent keys thrust::device_vector d_nonexistent_keys(100); thrust::device_vector d_nonexistent_values(100); @@ -80,19 +83,15 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests", map.find(d_nonexistent_keys.begin(), d_nonexistent_keys.end(), d_nonexistent_values.begin()); - // Verify that non-existent keys return empty value sentinel auto empty_zip = thrust::make_zip_iterator( cuda::std::tuple{d_nonexistent_values.begin(), thrust::constant_iterator{cuco::empty_value{-1}.value}}); REQUIRE(cuco::test::all_of(empty_zip, empty_zip + 100, zip_equal)); - // Test finding a mix of existing and non-existing keys thrust::device_vector d_mixed_keys(200); thrust::device_vector d_mixed_values(200); - // First half: existing keys thrust::copy(d_keys.begin(), d_keys.begin() + 100, d_mixed_keys.begin()); - // Second half: non-existing keys thrust::sequence(thrust::device, d_mixed_keys.begin() + 100, d_mixed_keys.end(), @@ -100,12 +99,10 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests", map.find(d_mixed_keys.begin(), d_mixed_keys.end(), d_mixed_values.begin()); - // Verify first half found correct values auto first_half_zip = thrust::make_zip_iterator(cuda::std::tuple{d_values.begin(), d_mixed_values.begin()}); REQUIRE(cuco::test::all_of(first_half_zip, first_half_zip + 100, zip_equal)); - // Verify second half returned empty value sentinel auto second_half_empty_zip = thrust::make_zip_iterator( cuda::std::tuple{d_mixed_values.begin() + 100, thrust::constant_iterator{cuco::empty_value{-1}.value}}); @@ -121,14 +118,17 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - auto pairs_begin = - thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) { + return cuco::pair{keys[i], values[i]}; + })); map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.get_size() == num_keys); + REQUIRE(map.size() == num_keys); - // Find all keys before erase map.find(d_keys.begin(), d_keys.end(), d_found_values.begin()); auto zip_equal = cuda::proclaim_return_type( @@ -137,73 +137,20 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests", thrust::make_zip_iterator(cuda::std::tuple{d_values.begin(), d_found_values.begin()}); REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); - // Erase first half of keys map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); - REQUIRE(map.get_size() == num_keys / 2); + REQUIRE(map.size() == num_keys / 2); - // Find all keys after erase map.find(d_keys.begin(), d_keys.end(), d_found_values.begin()); - // First half should return empty value sentinel (erased) auto first_half_empty_zip = thrust::make_zip_iterator( cuda::std::tuple{d_found_values.begin(), thrust::constant_iterator{cuco::empty_value{-1}.value}}); REQUIRE( cuco::test::all_of(first_half_empty_zip, first_half_empty_zip + num_keys / 2, zip_equal)); - // Second half should return correct values (not erased) auto second_half_zip = thrust::make_zip_iterator( cuda::std::tuple{d_values.begin() + num_keys / 2, d_found_values.begin() + num_keys / 2}); REQUIRE(cuco::test::all_of(second_half_zip, second_half_zip + num_keys / 2, zip_equal)); } - - cuco::dynamic_map indentity_hash_map{ - num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - - SECTION("Check find in a all erased submap") - { - constexpr float default_load_factor = 0.60; - constexpr std::size_t first_insert_size = num_keys * default_load_factor; - - thrust::device_vector d_keys(num_keys); - thrust::device_vector d_values(num_keys); - thrust::device_vector d_found_values(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); - thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - - auto pairs_begin = - thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); - - // To construct a map with all erased keys, we can't insert all at once - indentity_hash_map.insert( - pairs_begin, pairs_begin + first_insert_size, cuco::identity_hash()); - REQUIRE(indentity_hash_map.get_size() == first_insert_size); - - indentity_hash_map.erase( - d_keys.begin(), d_keys.begin() + first_insert_size, cuco::identity_hash()); - REQUIRE(indentity_hash_map.get_size() == 0); - - indentity_hash_map.insert( - pairs_begin + first_insert_size, pairs_begin + num_keys, cuco::identity_hash()); - REQUIRE(indentity_hash_map.get_size() == num_keys - first_insert_size); - - indentity_hash_map.erase( - d_keys.begin() + first_insert_size, d_keys.end(), cuco::identity_hash()); - REQUIRE(indentity_hash_map.get_size() == 0); - - // we've construct a dynamic_map with one submap whose keys are all erased keys (-2 in this - // case) this find would run forever if we don't check whether we have iterated all keys in a - // submap - indentity_hash_map.find( - d_keys.begin(), d_keys.end(), d_found_values.begin(), cuco::identity_hash()); - // all d_found_values should be empty value sentinel (-1 in this case) - auto empty_zip = thrust::make_zip_iterator( - cuda::std::tuple{d_found_values.begin(), - thrust::constant_iterator{cuco::empty_value{-1}.value}}); - auto zip_equal = cuda::proclaim_return_type( - [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); }); - REQUIRE(cuco::test::all_of(empty_zip, empty_zip + num_keys, zip_equal)); - } -} \ No newline at end of file +} diff --git a/tests/dynamic_map/multiplicity_test.cu b/tests/dynamic_map/multiplicity_test.cu new file mode 100644 index 000000000..5fc7eddb7 --- /dev/null +++ b/tests/dynamic_map/multiplicity_test.cu @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("dynamic_map: cross-submap duplicate handling", + "", + ((typename Key, typename T), Key, T), + (int32_t, int32_t), + (int64_t, int64_t)) +{ + // Use capacity large enough to satisfy min_insert_size_ (10,000 default) + // but small enough to force multiple submaps after several inserts + constexpr std::size_t initial_capacity{50'000}; + constexpr std::size_t num_keys{20'000}; // Fill about 2/3 of first submap (load factor ~0.6) + + cuco::dynamic_map map{initial_capacity, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + // Create pairs for first submap (keys 0 to num_keys-1) + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i); })); + + // Create pairs for second submap (keys num_keys to 2*num_keys-1) + auto pairs_begin_2 = + thrust::make_transform_iterator(thrust::make_counting_iterator(num_keys), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i); })); + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector d_contained(num_keys); + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + SECTION("insert does not insert duplicates across submaps") + { + // Fill first submap + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + // Insert new keys to trigger second submap creation + map.insert(pairs_begin_2, pairs_begin_2 + num_keys); + REQUIRE(map.size() == 2 * num_keys); + + // Try to insert duplicates of keys from first submap - should not increase size + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == 2 * num_keys); + + // Try to insert duplicates with DIFFERENT values - should still not insert and preserve + // originals + auto duplicate_pairs_diff_values = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i + 999); })); + map.insert(duplicate_pairs_diff_values, duplicate_pairs_diff_values + num_keys); + REQUIRE(map.size() == 2 * num_keys); + + // Verify original values are preserved (not overwritten by duplicate insert attempts) + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + REQUIRE(cuco::test::equal(d_results.begin(), + d_results.end(), + thrust::counting_iterator(0), + cuda::std::equal_to{})); + } + + SECTION("contains finds keys in any submap") + { + // Fill first submap + map.insert(pairs_begin, pairs_begin + num_keys); + + // Insert new keys to trigger second submap + map.insert(pairs_begin_2, pairs_begin_2 + num_keys); + + // Keys in FIRST submap should be found + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + + // Keys in SECOND submap should be found + thrust::device_vector d_keys_2(num_keys); + thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys); + map.contains(d_keys_2.begin(), d_keys_2.end(), d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + + // Non-existent keys should NOT be found + thrust::device_vector d_keys_nonexistent(num_keys); + thrust::sequence( + thrust::device, d_keys_nonexistent.begin(), d_keys_nonexistent.end(), 2 * num_keys); + map.contains(d_keys_nonexistent.begin(), d_keys_nonexistent.end(), d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("find retrieves values from any submap") + { + // Fill first submap + map.insert(pairs_begin, pairs_begin + num_keys); + + // Insert new keys to trigger second submap + map.insert(pairs_begin_2, pairs_begin_2 + num_keys); + + // Find keys from FIRST submap - should return correct values + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + REQUIRE(cuco::test::equal(d_results.begin(), + d_results.end(), + thrust::counting_iterator(0), + cuda::std::equal_to{})); + + // Find keys from SECOND submap - should return correct values + thrust::device_vector d_keys_2(num_keys); + thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys); + map.find(d_keys_2.begin(), d_keys_2.end(), d_results.begin()); + REQUIRE(cuco::test::equal(d_results.begin(), + d_results.end(), + thrust::counting_iterator(num_keys), + cuda::std::equal_to{})); + + // Non-existent keys should return empty_value_sentinel (-1) + thrust::device_vector d_keys_nonexistent(num_keys); + thrust::sequence( + thrust::device, d_keys_nonexistent.begin(), d_keys_nonexistent.end(), 2 * num_keys); + map.find(d_keys_nonexistent.begin(), d_keys_nonexistent.end(), d_results.begin()); + REQUIRE(cuco::test::all_of( + d_results.begin(), d_results.end(), [] __device__(T val) { return val == T{-1}; })); + } + + SECTION("erase removes keys from any submap") + { + // Fill first submap + map.insert(pairs_begin, pairs_begin + num_keys); + + // Insert new keys to trigger second submap + map.insert(pairs_begin_2, pairs_begin_2 + num_keys); + REQUIRE(map.size() == 2 * num_keys); + + // Erase keys from FIRST submap + map.erase(d_keys.begin(), d_keys.end()); + REQUIRE(map.size() == num_keys); + + // Verify keys from first submap are no longer contained + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + + // Verify find returns sentinel for erased keys + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + REQUIRE(cuco::test::all_of( + d_results.begin(), d_results.end(), [] __device__(T val) { return val == T{-1}; })); + + // Verify keys from SECOND submap are still there + thrust::device_vector d_keys_2(num_keys); + thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys); + map.contains(d_keys_2.begin(), d_keys_2.end(), d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + + // Erase same keys again (already erased) - should not change size + map.erase(d_keys.begin(), d_keys.end()); + REQUIRE(map.size() == num_keys); + + // Erase non-existent keys - should not change size + thrust::device_vector d_keys_nonexistent(num_keys); + thrust::sequence( + thrust::device, d_keys_nonexistent.begin(), d_keys_nonexistent.end(), 3 * num_keys); + map.erase(d_keys_nonexistent.begin(), d_keys_nonexistent.end()); + REQUIRE(map.size() == num_keys); + + // Now erase keys from SECOND submap + map.erase(d_keys_2.begin(), d_keys_2.end()); + REQUIRE(map.size() == 0); + + // Verify all keys are gone + map.contains(d_keys_2.begin(), d_keys_2.end(), d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("insert_or_assign updates in correct submap without creating duplicates") + { + // Fill first submap with values = keys + map.insert(pairs_begin, pairs_begin + num_keys); + + // Insert new keys to trigger second submap + map.insert(pairs_begin_2, pairs_begin_2 + num_keys); + REQUIRE(map.size() == 2 * num_keys); + + // Create pairs with same keys as first submap but different values (value = key + 100) + auto updated_pairs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i + 100); })); + + // insert_or_assign should UPDATE values in first submap, not insert into second + map.insert_or_assign(updated_pairs, updated_pairs + num_keys); + REQUIRE(map.size() == 2 * num_keys); // Size should not change + + // Verify values were updated in first submap + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + REQUIRE(cuco::test::equal(d_results.begin(), + d_results.end(), + thrust::counting_iterator(100), // Values should now be key + 100 + cuda::std::equal_to{})); + + // Verify second submap values are unchanged + thrust::device_vector d_keys_2(num_keys); + thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys); + map.find(d_keys_2.begin(), d_keys_2.end(), d_results.begin()); + REQUIRE(cuco::test::equal(d_results.begin(), + d_results.end(), + thrust::counting_iterator(num_keys), + cuda::std::equal_to{})); + + // Test INSERT behavior: insert_or_assign with completely NEW keys should increase size + auto new_pairs = thrust::make_transform_iterator( + thrust::make_counting_iterator(2 * num_keys), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i); })); + map.insert_or_assign(new_pairs, new_pairs + num_keys); + REQUIRE(map.size() == 3 * num_keys); // Size should increase by num_keys + + // Verify newly inserted keys exist with correct values + thrust::device_vector d_keys_new(num_keys); + thrust::sequence(thrust::device, d_keys_new.begin(), d_keys_new.end(), 2 * num_keys); + map.find(d_keys_new.begin(), d_keys_new.end(), d_results.begin()); + REQUIRE(cuco::test::equal(d_results.begin(), + d_results.end(), + thrust::counting_iterator(2 * num_keys), + cuda::std::equal_to{})); + + // Test MIXED behavior: some keys exist (update), some don't (insert) + // Use keys from 0 to num_keys/2 (exist in first submap) and + // keys from 3*num_keys to 3*num_keys + num_keys/2 (don't exist) + auto mixed_pairs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>([] __device__(auto i) { + Key key = (i < num_keys / 2) ? Key(i) : Key(3 * num_keys + i - num_keys / 2); + return cuco::pair(key, T(i + 500)); + })); + std::size_t const size_before = map.size(); + map.insert_or_assign(mixed_pairs, mixed_pairs + num_keys); + // Only num_keys/2 new keys should be inserted + REQUIRE(map.size() == size_before + num_keys / 2); + } + + SECTION("retrieve_all retrieves from all submaps") + { + // Fill first submap + map.insert(pairs_begin, pairs_begin + num_keys); + + // Insert new keys to trigger second submap + map.insert(pairs_begin_2, pairs_begin_2 + num_keys); + REQUIRE(map.size() == 2 * num_keys); + + // Retrieve all key-value pairs + thrust::device_vector d_retrieved_keys(2 * num_keys); + thrust::device_vector d_retrieved_values(2 * num_keys); + auto const end = map.retrieve_all(d_retrieved_keys.begin(), d_retrieved_values.begin()); + auto const num_retrieved = std::distance(d_retrieved_keys.begin(), end.first); + + // Should retrieve all keys from both submaps + REQUIRE(num_retrieved == 2 * num_keys); + + // Sort by key to verify all expected keys are present + thrust::sort_by_key( + thrust::device, d_retrieved_keys.begin(), d_retrieved_keys.end(), d_retrieved_values.begin()); + + // Keys should be 0 to 2*num_keys-1 + thrust::device_vector d_expected_keys(2 * num_keys); + thrust::sequence(thrust::device, d_expected_keys.begin(), d_expected_keys.end()); + REQUIRE(cuco::test::equal(d_retrieved_keys.begin(), + d_retrieved_keys.end(), + d_expected_keys.begin(), + cuda::std::equal_to{})); + + // Values should match keys (since we inserted key=value pairs) + REQUIRE(cuco::test::equal(d_retrieved_values.begin(), + d_retrieved_values.end(), + thrust::counting_iterator(0), + cuda::std::equal_to{})); + } +} diff --git a/tests/dynamic_map/retrieve_all_test.cu b/tests/dynamic_map/retrieve_all_test.cu index 72f0df329..35f057e4b 100644 --- a/tests/dynamic_map/retrieve_all_test.cu +++ b/tests/dynamic_map/retrieve_all_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include +#include #include #include #include @@ -59,7 +60,7 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests", map.insert(pairs, pairs + num_keys); - REQUIRE(map.get_size() == num_keys); + REQUIRE(map.size() == num_keys); thrust::device_vector retrieved_keys(num_keys); thrust::device_vector retrieved_values(num_keys); @@ -70,7 +71,6 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests", REQUIRE(keys_out == retrieved_keys.end()); REQUIRE(values_out == retrieved_values.end()); - // d_keys and d_values are already sorted thrust::sort(retrieved_keys.begin(), retrieved_keys.end()); thrust::sort(retrieved_values.begin(), retrieved_values.end()); @@ -96,7 +96,7 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests", map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); - REQUIRE(map.get_size() == num_keys / 2); + REQUIRE(map.size() == num_keys / 2); thrust::device_vector retrieved_keys(num_keys / 2); thrust::device_vector retrieved_values(num_keys / 2); @@ -107,7 +107,6 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests", REQUIRE(std::distance(retrieved_keys.begin(), keys_out) == num_keys / 2); REQUIRE(std::distance(retrieved_values.begin(), values_out) == num_keys / 2); - // d_keys and d_values are already sorted thrust::sort(retrieved_keys.begin(), retrieved_keys.end()); thrust::sort(retrieved_values.begin(), retrieved_values.end()); diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index afec6b82f..13cf08d0b 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * Copyright (c) 2020-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,75 +19,113 @@ #include #include -#include +#include #include #include #include #include -#include #include #include -TEMPLATE_TEST_CASE_SIG("dynamic_map unique sequence tests", +TEMPLATE_TEST_CASE_SIG("dynamic_map: unique sequence", "", - ((typename Key, typename Value), Key, Value), + ((typename Key, typename T), Key, T), (int32_t, int32_t), (int32_t, int64_t), (int64_t, int32_t), (int64_t, int64_t)) { - constexpr std::size_t num_keys{50'000'000}; + constexpr std::size_t num_keys{1'000'000}; - cuco::dynamic_map map{ - 30'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; + cuco::dynamic_map map{30'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; thrust::device_vector d_keys(num_keys); - thrust::device_vector d_values(num_keys); + thrust::device_vector d_values(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type>( - [] __device__(auto i) { return cuco::pair(i, i); })); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i); })); - thrust::device_vector d_results(num_keys); + thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); - // bulk function test cases - SECTION("All inserted keys-value pairs should be correctly recovered during find") + SECTION("All inserted keys-value pairs should be contained") { map.insert(pairs_begin, pairs_begin + num_keys); - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), d_values.begin()}); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(cuco::test::all_of(zip, zip + num_keys, [] __device__(auto const& p) { - return cuda::std::get<0>(p) == cuda::std::get<1>(p); - })); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); } - SECTION("All non-inserted keys-value pairs should have the empty sentinel value recovered") + SECTION("Non-inserted keys-value pairs should not be contained") { - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(cuco::test::all_of( - d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; })); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); } - SECTION("All inserted keys-value pairs should be contained") + SECTION("size() returns correct count after insertions") + { + REQUIRE(map.size() == 0); + + map.insert(pairs_begin, pairs_begin + num_keys / 2); + REQUIRE(map.size() == num_keys / 2); + + map.insert(pairs_begin + num_keys / 2, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + } + + SECTION("capacity() returns non-zero value") { + REQUIRE(map.capacity() > 0); + REQUIRE(map.capacity() >= 30'000'000); + } + + SECTION("load_factor() is computed correctly") + { + REQUIRE(map.load_factor() == 0.0f); + map.insert(pairs_begin, pairs_begin + num_keys); - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + float expected_load_factor = static_cast(num_keys) / map.capacity(); + REQUIRE(map.load_factor() == expected_load_factor); } - SECTION("Non-inserted keys-value pairs should not be contained") + SECTION("insert_or_assign inserts new keys and updates existing") { - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + // Insert initial keys + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); - REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + // Create pairs with same keys but different values (value = key + 1) + auto updated_pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i + 1); })); + + // insert_or_assign should update existing keys, size should stay the same + map.insert_or_assign(updated_pairs_begin, updated_pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + // Verify values were updated + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + REQUIRE(cuco::test::equal(d_results.begin(), + d_results.end(), + thrust::counting_iterator(1), // Values should now be key + 1 + cuda::std::equal_to{})); + + // Insert new keys with insert_or_assign (keys from num_keys to 2*num_keys) + auto new_pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(num_keys), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair(i, i); })); + + map.insert_or_assign(new_pairs_begin, new_pairs_begin + num_keys); + REQUIRE(map.size() == 2 * num_keys); } } diff --git a/tests/dynamic_map/unique_sequence_test_experimental.cu b/tests/dynamic_map/unique_sequence_test_experimental.cu deleted file mode 100644 index b04f8c182..000000000 --- a/tests/dynamic_map/unique_sequence_test_experimental.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include - -TEMPLATE_TEST_CASE_SIG("experimental::dynamic_map: unique sequence", - "", - ((typename Key, typename T), Key, T), - (int32_t, int32_t), - (int32_t, int64_t), - (int64_t, int32_t), - (int64_t, int64_t)) -{ - constexpr std::size_t num_keys{1'000'000}; - - cuco::experimental::dynamic_map map{ - 30'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; - - thrust::device_vector d_keys(num_keys); - thrust::device_vector d_values(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - - auto pairs_begin = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - cuda::proclaim_return_type>( - [] __device__(auto i) { return cuco::pair(i, i); })); - - thrust::device_vector d_results(num_keys); - thrust::device_vector d_contained(num_keys); - - // bulk function test cases - - SECTION("All inserted keys-value pairs should be contained") - { - map.insert(pairs_begin, pairs_begin + num_keys); - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } - - SECTION("Non-inserted keys-value pairs should not be contained") - { - // segfaults - map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); - - REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } -}