diff --git a/README.md b/README.md
index f6c80ba2f..e75b8a2b2 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith
 
 ### Major Updates
 
+__02/03/2026__ Modernized `dynamic_map`: promoted `cuco::experimental::dynamic_map` to `cuco::dynamic_map` and removed the legacy implementation
+
 __01/30/2026__ Removed legacy `static_multimap` implementation and promoted `cuco::experimental::static_multimap` to `cuco::static_multimap`
 
 __10/08/2025__ Changed `cuda_allocator` to stream-ordered, requiring `cuda::stream_ref` parameter in `allocate`/`deallocate`.
@@ -244,7 +246,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 
 ### `dynamic_map`
 
-`cuco::dynamic_map` links together multiple `cuco::static_map`s to provide a hash table that can grow as key-value pairs are inserted. It currently only provides host-bulk APIs. See the Doxygen documentation in `dynamic_map.cuh` for more detailed information.
+`cuco::dynamic_map` links together multiple `cuco::static_map`s to provide a hash table that can grow as key-value pairs are inserted. It supports `insert`, `insert_or_assign`, `erase`, `find`, `contains`, and `retrieve_all` operations via host-bulk APIs with kernel-based implementations for optimal performance. See the Doxygen documentation in `dynamic_map.cuh` for more detailed information.
 
 #### Examples:
 - [Host-bulk APIs (TODO)]()
diff --git a/benchmarks/dynamic_map/contains_bench.cu b/benchmarks/dynamic_map/contains_bench.cu
index 60d8af1b5..90867ee64 100644
--- a/benchmarks/dynamic_map/contains_bench.cu
+++ b/benchmarks/dynamic_map/contains_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,7 +62,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_contains(
   state.add_element_count(num_keys);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream());
+    map.contains(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
   });
 }
 
diff --git a/benchmarks/dynamic_map/erase_bench.cu b/benchmarks/dynamic_map/erase_bench.cu
index c6f373138..5e0a02547 100644
--- a/benchmarks/dynamic_map/erase_bench.cu
+++ b/benchmarks/dynamic_map/erase_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,15 +56,14 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_erase(
 
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
              [&](nvbench::launch& launch, auto& timer) {
-               // dynamic map with erase support
                cuco::dynamic_map<Key, Value> map{static_cast<size_t>(initial_size),
                                                  cuco::empty_key<Key>{-1},
                                                  cuco::empty_value<Value>{-1},
                                                  cuco::erased_key<Key>{-2}};
-               map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream());
+               map.insert(pairs.begin(), pairs.end(), {launch.get_stream()});
 
                timer.start();
-               map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream());
+               map.erase(keys.begin(), keys.end(), {launch.get_stream()});
                timer.stop();
              });
 }
diff --git a/benchmarks/dynamic_map/find_bench.cu b/benchmarks/dynamic_map/find_bench.cu
index f68987857..2fd87d27c 100644
--- a/benchmarks/dynamic_map/find_bench.cu
+++ b/benchmarks/dynamic_map/find_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,7 +62,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_find(
   state.add_element_count(num_keys);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream());
+    map.find(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
   });
 }
 
diff --git a/benchmarks/dynamic_map/insert_bench.cu b/benchmarks/dynamic_map/insert_bench.cu
index f157a79b4..7c3d24bcf 100644
--- a/benchmarks/dynamic_map/insert_bench.cu
+++ b/benchmarks/dynamic_map/insert_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,15 +57,12 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_insert(
 
   state.exec(
     nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
-      cuco::dynamic_map<Key, Value> map{static_cast<size_t>(initial_size),
-                                        cuco::empty_key<Key>{-1},
-                                        cuco::empty_value<Value>{-1},
-                                        {},
-                                        launch.get_stream()};
+      cuco::dynamic_map<Key, Value> map{
+        static_cast<size_t>(initial_size), cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
       timer.start();
       for (int64_t i = 0; i < num_keys; i += batch_size) {
-        map.insert(pairs.begin() + i, pairs.begin() + i + batch_size, {}, {}, launch.get_stream());
+        map.insert(pairs.begin() + i, pairs.begin() + i + batch_size, {launch.get_stream()});
       }
       timer.stop();
     });
diff --git a/benchmarks/dynamic_map/retrieve_all_bench.cu b/benchmarks/dynamic_map/retrieve_all_bench.cu
index fb7d5b0bf..1dd34bbe1 100644
--- a/benchmarks/dynamic_map/retrieve_all_bench.cu
+++ b/benchmarks/dynamic_map/retrieve_all_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,14 +53,14 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_retrieve_all(
   cuco::dynamic_map<Key, Value> map{
     static_cast<size_t>(initial_size), cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
   map.insert(pairs.begin(), pairs.end());
-  // Prepare output buffers
-  thrust::device_vector<Key> retrieved_keys(map.get_size());
-  thrust::device_vector<Value> retrieved_values(map.get_size());
 
-  state.add_element_count(map.get_size());
+  thrust::device_vector<Key> retrieved_keys(map.size());
+  thrust::device_vector<Value> retrieved_values(map.size());
+
+  state.add_element_count(map.size());
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    map.retrieve_all(retrieved_keys.begin(), retrieved_values.begin(), launch.get_stream());
+    map.retrieve_all(retrieved_keys.begin(), retrieved_values.begin(), {launch.get_stream()});
   });
 }
 
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
deleted file mode 100644
index 962d95b69..000000000
--- a/include/cuco/detail/dynamic_map.inl
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-
-namespace cuco {
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
-                                                       empty_key<Key> empty_key_sentinel,
-                                                       empty_value<Value> empty_value_sentinel,
-                                                       Allocator const& alloc,
-                                                       cudaStream_t stream)
-  : empty_key_sentinel_(empty_key_sentinel.value),
-    empty_value_sentinel_(empty_value_sentinel.value),
-    erased_key_sentinel_(empty_key_sentinel.value),
-    size_(0),
-    capacity_(initial_capacity),
-    min_insert_size_(1E4),
-    max_load_factor_(0.60),
-    alloc_{alloc}
-{
-  submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
-    initial_capacity,
-    empty_key<Key>{empty_key_sentinel},
-    empty_value<Value>{empty_value_sentinel},
-    alloc,
-    stream));
-  submap_views_.push_back(submaps_[0]->get_device_view());
-  submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  submap_num_successes_.push_back(submaps_[0]->num_successes_);
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capacity,
-                                                       empty_key<Key> empty_key_sentinel,
-                                                       empty_value<Value> empty_value_sentinel,
-                                                       erased_key<Key> erased_key_sentinel,
-                                                       Allocator const& alloc,
-                                                       cudaStream_t stream)
-  : empty_key_sentinel_(empty_key_sentinel.value),
-    empty_value_sentinel_(empty_value_sentinel.value),
-    erased_key_sentinel_(erased_key_sentinel.value),
-    size_(0),
-    capacity_(initial_capacity),
-    min_insert_size_(1E4),
-    max_load_factor_(0.60),
-    alloc_{alloc}
-{
-  CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_,
-               "The empty key sentinel and erased key sentinel cannot be the same value.",
-               std::runtime_error);
-
-  submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
-    initial_capacity,
-    empty_key<Key>{empty_key_sentinel_},
-    empty_value<Value>{empty_value_sentinel_},
-    erased_key<Key>{erased_key_sentinel_},
-    alloc,
-    stream));
-  submap_views_.push_back(submaps_[0]->get_device_view());
-  submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-  submap_num_successes_.push_back(submaps_[0]->num_successes_);
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStream_t stream)
-{
-  // Calculate current total available capacity across all submaps
-  std::size_t total_available_capacity = 0;
-  for (std::size_t i = 0; i < submaps_.size(); ++i) {
-    std::size_t submap_usable_capacity =
-      static_cast<std::size_t>(max_load_factor_ * submaps_[i]->get_capacity());
-    // Only count capacity above the minimum insert threshold
-    if (submap_usable_capacity >= min_insert_size_) {
-      total_available_capacity += submap_usable_capacity - min_insert_size_;
-    }
-  }
-
-  // Create new submaps until we have enough capacity
-  while (total_available_capacity < n) {
-    std::size_t new_submap_capacity = capacity_;
-
-    if (erased_key_sentinel_ != empty_key_sentinel_) {
-      submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
-        new_submap_capacity,
-        empty_key<Key>{empty_key_sentinel_},
-        empty_value<Value>{empty_value_sentinel_},
-        erased_key<Key>{erased_key_sentinel_},
-        alloc_,
-        stream));
-    } else {
-      submaps_.push_back(std::make_unique<cuco::legacy::static_map<Key, Value, Scope, Allocator>>(
-        new_submap_capacity,
-        empty_key<Key>{empty_key_sentinel_},
-        empty_value<Value>{empty_value_sentinel_},
-        alloc_,
-        stream));
-    }
-
-    std::size_t submap_idx = submaps_.size() - 1;
-    submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_);
-    submap_views_.push_back(submaps_[submap_idx]->get_device_view());
-    submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
-
-    // Add the new submap's usable capacity
-    std::size_t new_usable_capacity =
-      static_cast<std::size_t>(max_load_factor_ * new_submap_capacity);
-    if (new_usable_capacity >= min_insert_size_) {
-      total_available_capacity += new_usable_capacity - min_insert_size_;
-    }
-
-    // Update capacity for next submap (double the size)
-    capacity_ *= 2;
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::insert(
-  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
-{
-  // TODO: memset an atomic variable is unsafe
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type),
-                "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
-
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto constexpr tile_size  = 4;
-
-  std::size_t num_to_insert = std::distance(first, last);
-
-  reserve(size_ + num_to_insert, stream);
-
-  std::size_t submap_idx = 0;
-  while (num_to_insert > 0) {
-    std::size_t capacity_remaining =
-      max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size();
-    // If we are tying to insert some of the remaining keys into this submap, we can insert
-    // only if we meet the minimum insert size.
-    if (capacity_remaining >= min_insert_size_) {
-      CUCO_CUDA_TRY(
-        cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream));
-
-      auto const n         = std::min(capacity_remaining, num_to_insert);
-      auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size);
-
-      detail::insert<block_size, tile_size, cuco::pair<key_type, mapped_type>>
-        <<<grid_size, block_size, 0, stream>>>(first,
-                                               first + n,
-                                               submap_views_.data().get(),
-                                               submap_mutable_views_.data().get(),
-                                               submap_num_successes_.data().get(),
-                                               submap_idx,
-                                               submaps_.size(),
-                                               hash,
-                                               key_equal);
-
-      std::size_t h_num_successes;
-      CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
-                                    submap_num_successes_[submap_idx],
-                                    sizeof(atomic_ctr_type),
-                                    cudaMemcpyDeviceToHost,
-                                    stream));
-      CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-      submaps_[submap_idx]->size_ += h_num_successes;
-      size_ += h_num_successes;
-      first += n;
-      num_to_insert -= n;
-    }
-    submap_idx++;
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::erase(
-  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
-{
-  // TODO: memset an atomic variable is unsafe
-  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type),
-                "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
-
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto constexpr tile_size  = 4;
-
-  auto const num_keys  = std::distance(first, last);
-  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-
-  // zero out submap success counters
-  for (std::size_t i = 0; i < submaps_.size(); ++i) {
-    CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream));
-  }
-
-  auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
-
-  detail::erase<block_size, tile_size>
-    <<<grid_size, block_size, temp_storage_size, stream>>>(first,
-                                                           first + num_keys,
-                                                           submap_mutable_views_.data().get(),
-                                                           submap_num_successes_.data().get(),
-                                                           submaps_.size(),
-                                                           hash,
-                                                           key_equal);
-
-  for (std::size_t i = 0; i < submaps_.size(); ++i) {
-    std::size_t h_submap_num_successes;
-    CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes,
-                                  submap_num_successes_[i],
-                                  sizeof(atomic_ctr_type),
-                                  cudaMemcpyDeviceToHost,
-                                  stream));
-    CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-    submaps_[i]->size_ -= h_submap_num_successes;
-    size_ -= h_submap_num_successes;
-  }
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::find(InputIt first,
-                                                     InputIt last,
-                                                     OutputIt output_begin,
-                                                     Hash hash,
-                                                     KeyEqual key_equal,
-                                                     cudaStream_t stream)
-{
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto constexpr tile_size  = 4;
-
-  auto const num_keys  = std::distance(first, last);
-  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-
-  detail::find<block_size, tile_size, Value><<<grid_size, block_size, 0, stream>>>(
-    first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
-  CUCO_CUDA_TRY(cudaDeviceSynchronize());
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename KeyOut, typename ValueOut>
-std::pair<KeyOut, ValueOut> dynamic_map<Key, Value, Scope, Allocator>::retrieve_all(
-  KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const
-{
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-
-  auto const capacity = get_capacity();
-  auto grid_size      = (capacity + stride * block_size - 1) / (stride * block_size);
-
-  std::vector<size_t> submap_cap_prefix(submaps_.size());
-  std::inclusive_scan(
-    submaps_.begin(),
-    submaps_.end(),
-    submap_cap_prefix.begin(),
-    [](auto const& sum, auto const& submap) { return sum + submap->get_capacity(); },
-    size_t{0});
-  thrust::device_vector<size_t> submap_cap_prefix_d(submap_cap_prefix);
-
-  auto counter =
-    detail::counter_storage<size_t, Scope, Allocator>{this->alloc_, cuda::stream_ref{stream}};
-  counter.reset({stream});
-
-  detail::retrieve_all<block_size>
-    <<<grid_size, block_size, 0, stream>>>(keys_out,
-                                           values_out,
-                                           submap_views_.data().get(),
-                                           submaps_.size(),
-                                           capacity,
-                                           counter.data(),
-                                           submap_cap_prefix_d.data().get());
-
-  auto const h_num_out = counter.load_to_host({stream});
-  return {keys_out + h_num_out, values_out + h_num_out};
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::contains(InputIt first,
-                                                         InputIt last,
-                                                         OutputIt output_begin,
-                                                         Hash hash,
-                                                         KeyEqual key_equal,
-                                                         cudaStream_t stream)
-{
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto constexpr tile_size  = 4;
-
-  auto const num_keys  = std::distance(first, last);
-  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
-
-  detail::contains<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
-    first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
-  CUCO_CUDA_TRY(cudaDeviceSynchronize());
-}
-
-}  // namespace cuco
diff --git a/include/cuco/detail/dynamic_map/dynamic_map.inl b/include/cuco/detail/dynamic_map/dynamic_map.inl
index fb65147f5..670239b58 100644
--- a/include/cuco/detail/dynamic_map/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map/dynamic_map.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,20 +14,22 @@
  * limitations under the License.
  */
 
-#include <cuco/detail/bitwise_compare.cuh>
-#include <cuco/detail/static_map/kernels.cuh>
+#include <cuco/detail/dynamic_map/kernels.cuh>
 #include <cuco/detail/utility/cuda.hpp>
 #include <cuco/detail/utils.hpp>
 #include <cuco/operator.hpp>
-#include <cuco/static_map_ref.cuh>
 
+#include <cuda/std/atomic>
 #include <cuda/stream_ref>
 
 #include <algorithm>
 #include <cstddef>
+#include <memory>
+#include <stdexcept>
+#include <utility>
+#include <vector>
 
 namespace cuco {
-namespace experimental {
 
 template <typename Key,
           typename T,
@@ -50,21 +52,59 @@ constexpr dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator,
   : size_{0},
     capacity_{initial_capacity},
     min_insert_size_{static_cast<size_type>(1E4)},
-    max_load_factor_{0.60},
+    max_load_factor_{0.60f},
     alloc_{alloc}
 {
-  submaps_.push_back(
-    std::make_unique<
-      cuco::static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>>(
-      initial_capacity,
-      empty_key_sentinel,
-      empty_value_sentinel,
-      pred,
-      probing_scheme,
-      scope,
-      storage,
-      alloc,
-      stream));
+  submaps_.push_back(std::make_unique<map_type>(initial_capacity,
+                                                empty_key_sentinel,
+                                                empty_value_sentinel,
+                                                pred,
+                                                probing_scheme,
+                                                scope,
+                                                storage,
+                                                alloc,
+                                                stream));
+}
+
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+constexpr dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  dynamic_map(Extent initial_capacity,
+              empty_key<Key> empty_key_sentinel,
+              empty_value<T> empty_value_sentinel,
+              erased_key<Key> erased_key_sentinel,
+              KeyEqual const& pred,
+              ProbingScheme const& probing_scheme,
+              cuda_thread_scope<Scope> scope,
+              Storage storage,
+              Allocator const& alloc,
+              cuda::stream_ref stream)
+  : size_{0},
+    capacity_{initial_capacity},
+    min_insert_size_{static_cast<size_type>(1E4)},
+    max_load_factor_{0.60f},
+    alloc_{alloc}
+{
+  CUCO_EXPECTS(empty_key_sentinel.value != erased_key_sentinel.value,
+               "The empty key sentinel and erased key sentinel cannot be the same value.",
+               std::runtime_error);
+
+  submaps_.push_back(std::make_unique<map_type>(initial_capacity,
+                                                empty_key_sentinel,
+                                                empty_value_sentinel,
+                                                erased_key_sentinel,
+                                                pred,
+                                                probing_scheme,
+                                                scope,
+                                                storage,
+                                                alloc,
+                                                stream));
 }
 
 template <typename Key,
@@ -82,17 +122,72 @@ void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
   auto num_to_insert = cuco::detail::distance(first, last);
   this->reserve(size_ + num_to_insert, stream);
 
+  // Fast path: single submap, no cross-submap duplicate check needed
+  if (submaps_.size() == 1) {
+    size_ += submaps_.front()->insert(first, last, stream);
+    return;
+  }
+
+  // Multiple submaps: use kernel to check for duplicates across all submaps
+  using ref_type = decltype(submaps_.front()->ref(cuco::op::contains, cuco::op::insert));
+
+  using ref_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<ref_type>;
+  auto ref_allocator = ref_allocator_type{alloc_};
+
+  using counter_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::atomic<std::size_t>>;
+  auto counter_allocator = counter_allocator_type{alloc_};
+
   std::size_t submap_idx = 0;
   while (num_to_insert > 0) {
     auto& cur = submaps_[submap_idx];
 
     auto capacity_remaining = max_load_factor_ * cur->capacity() - cur->size();
-    // If we are tying to insert some of the remaining keys into this submap, we can insert
-    // only if we meet the minimum insert size.
     if (capacity_remaining >= min_insert_size_) {
       auto const n = std::min(static_cast<detail::index_type>(capacity_remaining), num_to_insert);
 
-      std::size_t h_num_successes = cur->insert(first, first + n, stream);
+      // Allocate and initialize device counter
+      auto* d_num_successes = counter_allocator.allocate(1, stream);
+      CUCO_CUDA_TRY(
+        cudaMemsetAsync(d_num_successes, 0, sizeof(cuda::atomic<std::size_t>), stream.get()));
+
+      // Allocate and copy refs for all submaps (with both contains and insert ops)
+      auto* d_submap_refs = ref_allocator.allocate(submaps_.size(), stream);
+      std::vector<ref_type> h_submap_refs;
+      h_submap_refs.reserve(submaps_.size());
+      for (auto const& submap : submaps_) {
+        h_submap_refs.push_back(submap->ref(cuco::op::contains, cuco::op::insert));
+      }
+      CUCO_CUDA_TRY(cudaMemcpyAsync(d_submap_refs,
+                                    h_submap_refs.data(),
+                                    sizeof(ref_type) * submaps_.size(),
+                                    cudaMemcpyHostToDevice,
+                                    stream.get()));
+
+      auto constexpr cg_size    = ProbingScheme::cg_size;
+      auto constexpr block_size = cuco::detail::default_block_size();
+      auto const grid_size      = cuco::detail::grid_size(n, cg_size);
+
+      detail::dynamic_map_ns::insert<cg_size, block_size>
+        <<<grid_size, block_size, 0, stream.get()>>>(first,
+                                                     n,
+                                                     d_num_successes,
+                                                     d_submap_refs,
+                                                     static_cast<uint32_t>(submap_idx),
+                                                     static_cast<uint32_t>(submaps_.size()));
+
+      // Read back success count
+      std::size_t h_num_successes = 0;
+      CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
+                                    d_num_successes,
+                                    sizeof(std::size_t),
+                                    cudaMemcpyDeviceToHost,
+                                    stream.get()));
+      CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));
+
+      ref_allocator.deallocate(d_submap_refs, submaps_.size(), stream);
+      counter_allocator.deallocate(d_num_successes, 1, stream);
 
       size_ += h_num_successes;
       first += n;
@@ -102,6 +197,101 @@ void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
   }
 }
 
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+template <typename InputIt>
+void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_assign(InputIt first, InputIt last, cuda::stream_ref stream)
+{
+  auto num_to_insert = cuco::detail::distance(first, last);
+  this->reserve(size_ + num_to_insert, stream);
+
+  // Fast path: single submap
+  if (submaps_.size() == 1) {
+    auto const old_size = submaps_.front()->size(stream);
+    submaps_.front()->insert_or_assign(first, last, stream);
+    auto const new_size = submaps_.front()->size(stream);
+    size_ += (new_size - old_size);
+    return;
+  }
+
+  // Multiple submaps: use kernel to check for existing keys across all submaps
+  using ref_type = decltype(submaps_.front()->ref(
+    cuco::op::contains, cuco::op::insert, cuco::op::insert_or_assign));
+
+  using ref_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<ref_type>;
+  auto ref_allocator = ref_allocator_type{alloc_};
+
+  using counter_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::atomic<std::size_t>>;
+  auto counter_allocator = counter_allocator_type{alloc_};
+
+  std::size_t submap_idx = 0;
+  while (num_to_insert > 0) {
+    auto& cur = submaps_[submap_idx];
+
+    auto capacity_remaining = max_load_factor_ * cur->capacity() - cur->size();
+    if (capacity_remaining >= min_insert_size_) {
+      auto const n = std::min(static_cast<detail::index_type>(capacity_remaining), num_to_insert);
+
+      // Allocate and initialize device counter for new insertions
+      auto* d_num_insertions = counter_allocator.allocate(1, stream);
+      CUCO_CUDA_TRY(
+        cudaMemsetAsync(d_num_insertions, 0, sizeof(cuda::atomic<std::size_t>), stream.get()));
+
+      // Allocate and copy refs for all submaps
+      auto* d_submap_refs = ref_allocator.allocate(submaps_.size(), stream);
+      std::vector<ref_type> h_submap_refs;
+      h_submap_refs.reserve(submaps_.size());
+      for (auto const& submap : submaps_) {
+        h_submap_refs.push_back(
+          submap->ref(cuco::op::contains, cuco::op::insert, cuco::op::insert_or_assign));
+      }
+      CUCO_CUDA_TRY(cudaMemcpyAsync(d_submap_refs,
+                                    h_submap_refs.data(),
+                                    sizeof(ref_type) * submaps_.size(),
+                                    cudaMemcpyHostToDevice,
+                                    stream.get()));
+
+      auto constexpr cg_size    = ProbingScheme::cg_size;
+      auto constexpr block_size = cuco::detail::default_block_size();
+      auto const grid_size      = cuco::detail::grid_size(n, cg_size);
+
+      detail::dynamic_map_ns::insert_or_assign<cg_size, block_size>
+        <<<grid_size, block_size, 0, stream.get()>>>(first,
+                                                     n,
+                                                     d_num_insertions,
+                                                     d_submap_refs,
+                                                     static_cast<uint32_t>(submap_idx),
+                                                     static_cast<uint32_t>(submaps_.size()));
+
+      // Read back insertion count (only new insertions, not assignments)
+      std::size_t h_num_insertions = 0;
+      CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_insertions,
+                                    d_num_insertions,
+                                    sizeof(std::size_t),
+                                    cudaMemcpyDeviceToHost,
+                                    stream.get()));
+      CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));
+
+      ref_allocator.deallocate(d_submap_refs, submaps_.size(), stream);
+      counter_allocator.deallocate(d_num_insertions, 1, stream);
+
+      size_ += h_num_insertions;
+      first += n;
+      num_to_insert -= n;
+    }
+    submap_idx++;
+  }
+}
+
 template <typename Key,
           typename T,
           typename Extent,
@@ -113,38 +303,263 @@ template <typename Key,
 void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::reserve(
   size_type n, cuda::stream_ref stream)
 {
-  size_type num_elements_remaining = n;
-  std::size_t submap_idx           = 0;
-  while (num_elements_remaining > 0) {
+  auto const& ref            = *submaps_.front();
+  auto const empty_key_val   = ref.empty_key_sentinel();
+  auto const empty_value_val = ref.empty_value_sentinel();
+  auto const erased_key_val  = ref.erased_key_sentinel();
+  auto const pred            = ref.key_eq();
+  auto const probing_scheme  = ProbingScheme{ref.hash_function()};
+  auto const has_erased_key  = empty_key_val != erased_key_val;
+
+  std::size_t submap_idx = 0;
+  while (n > 0) {
     std::size_t submap_capacity;
 
-    // if the submap already exists
     if (submap_idx < submaps_.size()) {
       submap_capacity = submaps_[submap_idx]->capacity();
-    }
-    // if the submap does not exist yet, create it
-    else {
-      empty_key<Key> empty_key_sentinel{submaps_.front()->empty_key_sentinel()};
-      empty_value<T> empty_value_sentinel{submaps_.front()->empty_value_sentinel()};
-
+    } else {
       submap_capacity = capacity_;
-      submaps_.push_back(std::make_unique<map_type>(submap_capacity,
-                                                    empty_key_sentinel,
-                                                    empty_value_sentinel,
-                                                    KeyEqual{},
-                                                    ProbingScheme{},
-                                                    cuda_thread_scope<Scope>{},
-                                                    Storage{},
-                                                    alloc_,
-                                                    stream));
+
+      if (has_erased_key) {
+        submaps_.push_back(std::make_unique<map_type>(submap_capacity,
+                                                      empty_key<Key>{empty_key_val},
+                                                      empty_value<T>{empty_value_val},
+                                                      erased_key<Key>{erased_key_val},
+                                                      pred,
+                                                      probing_scheme,
+                                                      cuda_thread_scope<Scope>{},
+                                                      Storage{},
+                                                      alloc_,
+                                                      stream));
+      } else {
+        submaps_.push_back(std::make_unique<map_type>(submap_capacity,
+                                                      empty_key<Key>{empty_key_val},
+                                                      empty_value<T>{empty_value_val},
+                                                      pred,
+                                                      probing_scheme,
+                                                      cuda_thread_scope<Scope>{},
+                                                      Storage{},
+                                                      alloc_,
+                                                      stream));
+      }
       capacity_ *= 2;
     }
 
-    num_elements_remaining -= max_load_factor_ * submap_capacity - min_insert_size_;
+    auto const usable_capacity =
+      static_cast<size_type>(max_load_factor_ * submap_capacity) - min_insert_size_;
+    if (usable_capacity >= n) { break; }
+    n -= usable_capacity;
     submap_idx++;
   }
 }
 
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+template <typename InputIt>
+void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::erase(
+  InputIt first, InputIt last, cuda::stream_ref stream)
+{
+  auto const& ref = *submaps_.front();
+  CUCO_EXPECTS(ref.empty_key_sentinel() != ref.erased_key_sentinel(),
+               "Erase requires a unique erased key sentinel to be provided at construction.",
+               std::runtime_error);
+
+  auto const num_keys = cuco::detail::distance(first, last);
+  if (num_keys == 0) { return; }
+
+  // Fast path: single submap
+  if (submaps_.size() == 1) {
+    auto const size_before = submaps_.front()->size(stream);
+    submaps_.front()->erase(first, last, stream);
+    auto const size_after = submaps_.front()->size(stream);
+    size_ -= (size_before - size_after);
+    return;
+  }
+
+  // Multiple submaps: use kernel to erase from all submaps in parallel
+  using erase_ref_type = decltype(submaps_.front()->ref(cuco::op::erase));
+
+  using ref_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<erase_ref_type>;
+  auto ref_allocator = ref_allocator_type{alloc_};
+
+  using counter_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::atomic<std::size_t>>;
+  auto counter_allocator = counter_allocator_type{alloc_};
+
+  // Allocate and initialize device counter
+  auto* d_num_successes = counter_allocator.allocate(1, stream);
+  CUCO_CUDA_TRY(
+    cudaMemsetAsync(d_num_successes, 0, sizeof(cuda::atomic<std::size_t>), stream.get()));
+
+  // Allocate and copy erase refs for all submaps
+  auto* d_refs = ref_allocator.allocate(submaps_.size(), stream);
+  std::vector<erase_ref_type> h_refs;
+  h_refs.reserve(submaps_.size());
+  for (auto const& submap : submaps_) {
+    h_refs.push_back(submap->ref(cuco::op::erase));
+  }
+  CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs,
+                                h_refs.data(),
+                                sizeof(erase_ref_type) * submaps_.size(),
+                                cudaMemcpyHostToDevice,
+                                stream.get()));
+
+  auto constexpr cg_size    = ProbingScheme::cg_size;
+  auto constexpr block_size = cuco::detail::default_block_size();
+  auto const grid_size      = cuco::detail::grid_size(num_keys, cg_size);
+
+  detail::dynamic_map_ns::erase<cg_size, block_size><<<grid_size, block_size, 0, stream.get()>>>(
+    first, num_keys, d_num_successes, d_refs, static_cast<uint32_t>(submaps_.size()));
+
+  // Read back success count
+  std::size_t h_num_successes = 0;
+  CUCO_CUDA_TRY(cudaMemcpyAsync(
+    &h_num_successes, d_num_successes, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream.get()));
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));
+
+  ref_allocator.deallocate(d_refs, submaps_.size(), stream);
+  counter_allocator.deallocate(d_num_successes, 1, stream);
+
+  size_ -= h_num_successes;
+}
+
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+template <typename InputIt>
+void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::erase_async(
+  InputIt first, InputIt last, cuda::stream_ref stream)
+{
+  auto const& ref = *submaps_.front();
+  CUCO_EXPECTS(ref.empty_key_sentinel() != ref.erased_key_sentinel(),
+               "Erase requires a unique erased key sentinel to be provided at construction.",
+               std::runtime_error);
+
+  auto const num_keys = cuco::detail::distance(first, last);
+  if (num_keys == 0) { return; }
+
+  // Fast path: single submap
+  if (submaps_.size() == 1) {
+    submaps_.front()->erase_async(first, last, stream);
+    return;
+  }
+
+  // Multiple submaps: use kernel to erase from all submaps in parallel
+  using erase_ref_type = decltype(submaps_.front()->ref(cuco::op::erase));
+
+  using ref_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<erase_ref_type>;
+  auto ref_allocator = ref_allocator_type{alloc_};
+
+  // Allocate and copy erase refs for all submaps
+  auto* d_refs = ref_allocator.allocate(submaps_.size(), stream);
+  std::vector<erase_ref_type> h_refs;
+  h_refs.reserve(submaps_.size());
+  for (auto const& submap : submaps_) {
+    h_refs.push_back(submap->ref(cuco::op::erase));
+  }
+  CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs,
+                                h_refs.data(),
+                                sizeof(erase_ref_type) * submaps_.size(),
+                                cudaMemcpyHostToDevice,
+                                stream.get()));
+
+  auto constexpr cg_size    = ProbingScheme::cg_size;
+  auto constexpr block_size = cuco::detail::default_block_size();
+  auto const grid_size      = cuco::detail::grid_size(num_keys, cg_size);
+
+  // For async, we don't track success count
+  using counter_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::atomic<std::size_t>>;
+  auto counter_allocator = counter_allocator_type{alloc_};
+  auto* d_num_successes  = counter_allocator.allocate(1, stream);
+  CUCO_CUDA_TRY(
+    cudaMemsetAsync(d_num_successes, 0, sizeof(cuda::atomic<std::size_t>), stream.get()));
+
+  detail::dynamic_map_ns::erase<cg_size, block_size><<<grid_size, block_size, 0, stream.get()>>>(
+    first, num_keys, d_num_successes, d_refs, static_cast<uint32_t>(submaps_.size()));
+
+  // Deallocate asynchronously (counter value is discarded for async)
+  ref_allocator.deallocate(d_refs, submaps_.size(), stream);
+  counter_allocator.deallocate(d_num_successes, 1, stream);
+}
+
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+template <typename InputIt, typename OutputIt>
+void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find(
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
+{
+  find_async(first, last, output_begin, stream);
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));
+}
+
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+template <typename InputIt, typename OutputIt>
+void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
+{
+  auto const num_keys = cuco::detail::distance(first, last);
+  if (num_keys == 0) { return; }
+
+  if (submaps_.size() == 1) {
+    submaps_.front()->find_async(first, last, output_begin, stream);
+    return;
+  }
+
+  using ref_type = decltype(submaps_.front()->ref(cuco::op::find));
+
+  using ref_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<ref_type>;
+  auto ref_allocator = ref_allocator_type{alloc_};
+  auto* d_refs       = ref_allocator.allocate(submaps_.size(), stream);
+
+  std::vector<ref_type> h_refs;
+  h_refs.reserve(submaps_.size());
+  for (auto const& submap : submaps_) {
+    h_refs.push_back(submap->ref(cuco::op::find));
+  }
+  CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs,
+                                h_refs.data(),
+                                sizeof(ref_type) * submaps_.size(),
+                                cudaMemcpyHostToDevice,
+                                stream.get()));
+
+  auto constexpr cg_size    = ProbingScheme::cg_size;
+  auto constexpr block_size = cuco::detail::default_block_size();
+  auto const grid_size      = cuco::detail::grid_size(num_keys, cg_size);
+
+  detail::dynamic_map_ns::find<cg_size, block_size><<<grid_size, block_size, 0, stream.get()>>>(
+    first, num_keys, output_begin, d_refs, static_cast<uint32_t>(submaps_.size()));
+
+  ref_allocator.deallocate(d_refs, submaps_.size(), stream);
+}
+
 template <typename Key,
           typename T,
           typename Extent,
@@ -157,24 +572,148 @@ template <typename InputIt, typename OutputIt>
 void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains(
   InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
-  auto num_keys          = cuco::detail::distance(first, last);
-  std::size_t traversed  = 0;
-  std::size_t submap_idx = 0;
-  while (num_keys > 0 && submap_idx < submaps_.size()) {
-    const auto& cur       = submaps_[submap_idx];
-    const size_t cur_size = cur->size();
-    const size_t num_keys_to_process =
-      std::min(static_cast<detail::index_type>(cur_size), num_keys);
-    CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));
+  contains_async(first, last, output_begin, stream);
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));
+}
 
-    cur->contains(first, first + num_keys_to_process, output_begin + traversed, stream);
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+template <typename InputIt, typename OutputIt>
+void dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  contains_async(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
+{
+  auto const num_keys = cuco::detail::distance(first, last);
+  if (num_keys == 0) { return; }
 
-    traversed += num_keys_to_process;
-    num_keys -= num_keys_to_process;
-    submap_idx++;
-    first += num_keys_to_process;
+  if (submaps_.size() == 1) {
+    submaps_.front()->contains_async(first, last, output_begin, stream);
+    return;
   }
+
+  using ref_type = decltype(submaps_.front()->ref(cuco::op::contains));
+
+  using ref_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<ref_type>;
+  auto ref_allocator = ref_allocator_type{alloc_};
+  auto* d_refs       = ref_allocator.allocate(submaps_.size(), stream);
+
+  std::vector<ref_type> h_refs;
+  h_refs.reserve(submaps_.size());
+  for (auto const& submap : submaps_) {
+    h_refs.push_back(submap->ref(cuco::op::contains));
+  }
+  CUCO_CUDA_TRY(cudaMemcpyAsync(d_refs,
+                                h_refs.data(),
+                                sizeof(ref_type) * submaps_.size(),
+                                cudaMemcpyHostToDevice,
+                                stream.get()));
+
+  auto constexpr cg_size    = ProbingScheme::cg_size;
+  auto constexpr block_size = cuco::detail::default_block_size();
+  auto const grid_size      = cuco::detail::grid_size(num_keys, cg_size);
+
+  detail::dynamic_map_ns::contains<cg_size, block_size><<<grid_size, block_size, 0, stream.get()>>>(
+    first, num_keys, output_begin, d_refs, static_cast<uint32_t>(submaps_.size()));
+
+  ref_allocator.deallocate(d_refs, submaps_.size(), stream);
+}
+
+template <typename Key,
+          typename T,
+          typename Extent,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename Allocator,
+          typename Storage>
+template <typename KeyOut, typename ValueOut>
+std::pair<KeyOut, ValueOut>
+dynamic_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::retrieve_all(
+  KeyOut keys_out, ValueOut values_out, cuda::stream_ref stream) const
+{
+  if (size_ == 0) { return {keys_out, values_out}; }
+
+  // Fast path: single submap
+  if (submaps_.size() == 1) { return submaps_.front()->retrieve_all(keys_out, values_out, stream); }
+
+  // Multiple submaps: use kernel
+  using slot_type = typename map_type::value_type;
+
+  // Compute capacity prefix sums and total capacity
+  std::vector<detail::index_type> h_capacity_prefix_sum(submaps_.size());
+  detail::index_type total_capacity = 0;
+  for (std::size_t i = 0; i < submaps_.size(); ++i) {
+    total_capacity += submaps_[i]->capacity();
+    h_capacity_prefix_sum[i] = total_capacity;
+  }
+
+  // Collect slot pointers
+  std::vector<slot_type const*> h_slot_arrays(submaps_.size());
+  for (std::size_t i = 0; i < submaps_.size(); ++i) {
+    h_slot_arrays[i] = submaps_[i]->data();
+  }
+
+  // Allocate device memory
+  using slot_ptr_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<slot_type const*>;
+  auto slot_ptr_allocator = slot_ptr_allocator_type{alloc_};
+  auto* d_slot_arrays     = slot_ptr_allocator.allocate(submaps_.size(), stream);
+
+  using index_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<detail::index_type>;
+  auto index_allocator        = index_allocator_type{alloc_};
+  auto* d_capacity_prefix_sum = index_allocator.allocate(submaps_.size(), stream);
+
+  using counter_allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::atomic<std::size_t>>;
+  auto counter_allocator = counter_allocator_type{alloc_};
+  auto* d_num_out        = counter_allocator.allocate(1, stream);
+
+  // Copy data to device
+  CUCO_CUDA_TRY(cudaMemcpyAsync(d_slot_arrays,
+                                h_slot_arrays.data(),
+                                sizeof(slot_type const*) * submaps_.size(),
+                                cudaMemcpyHostToDevice,
+                                stream.get()));
+  CUCO_CUDA_TRY(cudaMemcpyAsync(d_capacity_prefix_sum,
+                                h_capacity_prefix_sum.data(),
+                                sizeof(detail::index_type) * submaps_.size(),
+                                cudaMemcpyHostToDevice,
+                                stream.get()));
+  CUCO_CUDA_TRY(cudaMemsetAsync(d_num_out, 0, sizeof(cuda::atomic<std::size_t>), stream.get()));
+
+  auto constexpr block_size = detail::default_block_size();
+  auto const grid_size      = detail::grid_size(total_capacity);
+
+  detail::dynamic_map_ns::retrieve_all<block_size, key_type, mapped_type>
+    <<<grid_size, block_size, 0, stream.get()>>>(keys_out,
+                                                 values_out,
+                                                 d_slot_arrays,
+                                                 static_cast<uint32_t>(submaps_.size()),
+                                                 total_capacity,
+                                                 d_num_out,
+                                                 d_capacity_prefix_sum,
+                                                 empty_key_sentinel(),
+                                                 erased_key_sentinel());
+
+  // Read back count
+  std::size_t h_num_out = 0;
+  CUCO_CUDA_TRY(cudaMemcpyAsync(
+    &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream.get()));
+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream.get()));
+
+  // Deallocate
+  slot_ptr_allocator.deallocate(d_slot_arrays, submaps_.size(), stream);
+  index_allocator.deallocate(d_capacity_prefix_sum, submaps_.size(), stream);
+  counter_allocator.deallocate(d_num_out, 1, stream);
+
+  return {keys_out + h_num_out, values_out + h_num_out};
 }
 
-}  // namespace experimental
 }  // namespace cuco
diff --git a/include/cuco/detail/dynamic_map/kernels.cuh b/include/cuco/detail/dynamic_map/kernels.cuh
new file mode 100644
index 000000000..fc6253dc7
--- /dev/null
+++ b/include/cuco/detail/dynamic_map/kernels.cuh
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/bitwise_compare.cuh>
+#include <cuco/detail/utility/cuda.cuh>
+
+#include <cub/block/block_reduce.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cuda/std/atomic>
+#include <cuda/std/iterator>
+
+#include <cooperative_groups.h>
+
+namespace cuco::detail::dynamic_map_ns {
+namespace cg = cooperative_groups;
+
+CUCO_SUPPRESS_KERNEL_WARNINGS
+
+/**
+ * @brief Inserts key/value pairs into the map, checking all submaps for duplicates.
+ *
+ * For each key, checks all submaps except the target for existing keys. Only inserts
+ * if the key doesn't exist in any other submap.
+ *
+ * @tparam CGSize Cooperative group size
+ * @tparam BlockSize The number of threads in the thread block
+ * @tparam InputIt Device accessible input iterator
+ * @tparam AtomicT Atomic counter type
+ * @tparam Ref Type of submap device ref with both contains and insert capabilities
+ *
+ * @param first Beginning of the sequence of key/value pairs
+ * @param n Number of keys
+ * @param num_successes Pointer to atomic counter for successful insertions
+ * @param submap_refs Array of submap refs (with contains and insert ops)
+ * @param insert_idx Index of the submap we're inserting into
+ * @param num_submaps Total number of submaps
+ */
+template <int CGSize, int BlockSize, typename InputIt, typename AtomicT, typename Ref>
+CUCO_KERNEL void insert(InputIt first,
+                        cuco::detail::index_type n,
+                        AtomicT* num_successes,
+                        Ref* submap_refs,
+                        uint32_t insert_idx,
+                        uint32_t num_submaps)
+{
+  using BlockReduce = cub::BlockReduce<std::size_t, BlockSize>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  std::size_t thread_num_successes = 0;
+
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename cuda::std::iterator_traits<InputIt>::value_type const pair{*(first + idx)};
+    bool exists = false;
+
+    if constexpr (CGSize == 1) {
+      for (uint32_t i = 0; i < num_submaps && !exists; ++i) {
+        if (i != insert_idx) { exists = submap_refs[i].contains(pair.first); }
+      }
+
+      if (!exists) {
+        if (submap_refs[insert_idx].insert(pair)) { ++thread_num_successes; }
+      }
+    } else {
+      auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+
+      for (uint32_t i = 0; i < num_submaps && !exists; ++i) {
+        if (i != insert_idx) { exists = submap_refs[i].contains(tile, pair.first); }
+      }
+      tile.sync();
+
+      if (!exists) {
+        if (submap_refs[insert_idx].insert(tile, pair) && tile.thread_rank() == 0) {
+          ++thread_num_successes;
+        }
+      }
+    }
+    idx += loop_stride;
+  }
+
+  // Aggregate success count
+  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  if (threadIdx.x == 0) {
+    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
+  }
+}
+
+/**
+ * @brief Inserts or assigns key/value pairs, checking all submaps.
+ *
+ * For each key, checks all submaps. If found, assigns the new value. If not found,
+ * inserts into the target submap. Only counts new insertions (not assignments).
+ *
+ * @tparam CGSize Cooperative group size
+ * @tparam BlockSize The number of threads in the thread block
+ * @tparam InputIt Device accessible input iterator
+ * @tparam AtomicT Atomic counter type
+ * @tparam Ref Type of submap device ref with contains, insert, and insert_or_assign capabilities
+ *
+ * @param first Beginning of the sequence of key/value pairs
+ * @param n Number of keys
+ * @param num_insertions Pointer to atomic counter for new insertions (not assignments)
+ * @param submap_refs Array of submap refs
+ * @param insert_idx Index of the submap to insert into if key not found
+ * @param num_submaps Total number of submaps
+ */
+template <int CGSize, int BlockSize, typename InputIt, typename AtomicT, typename Ref>
+CUCO_KERNEL void insert_or_assign(InputIt first,
+                                  cuco::detail::index_type n,
+                                  AtomicT* num_insertions,
+                                  Ref* submap_refs,
+                                  uint32_t insert_idx,
+                                  uint32_t num_submaps)
+{
+  using BlockReduce = cub::BlockReduce<std::size_t, BlockSize>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  std::size_t thread_num_insertions = 0;
+
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename cuda::std::iterator_traits<InputIt>::value_type const pair{*(first + idx)};
+    bool found = false;
+
+    if constexpr (CGSize == 1) {
+      for (uint32_t i = 0; i < num_submaps && !found; ++i) {
+        if (submap_refs[i].contains(pair.first)) {
+          submap_refs[i].insert_or_assign(pair);
+          found = true;
+        }
+      }
+
+      if (!found) {
+        if (submap_refs[insert_idx].insert(pair)) { ++thread_num_insertions; }
+      }
+    } else {
+      auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+
+      for (uint32_t i = 0; i < num_submaps && !found; ++i) {
+        if (submap_refs[i].contains(tile, pair.first)) {
+          submap_refs[i].insert_or_assign(tile, pair);
+          found = true;
+        }
+      }
+      tile.sync();
+
+      if (!found) {
+        if (submap_refs[insert_idx].insert(tile, pair) && tile.thread_rank() == 0) {
+          ++thread_num_insertions;
+        }
+      }
+    }
+    idx += loop_stride;
+  }
+
+  // Aggregate insertion count
+  std::size_t const block_num_insertions = BlockReduce(temp_storage).Sum(thread_num_insertions);
+  if (threadIdx.x == 0) {
+    num_insertions->fetch_add(block_num_insertions, cuda::std::memory_order_relaxed);
+  }
+}
+
+/**
+ * @brief Erases keys from all submaps.
+ *
+ * For each key, attempts to erase from all submaps. Tracks total erased count.
+ *
+ * @tparam CGSize Cooperative group size
+ * @tparam BlockSize The number of threads in the thread block
+ * @tparam InputIt Device accessible input iterator
+ * @tparam AtomicT Atomic counter type
+ * @tparam Ref Type of submap device ref with erase capability
+ *
+ * @param first Beginning of the sequence of keys
+ * @param n Number of keys
+ * @param num_successes Pointer to atomic counter for successful erasures
+ * @param submap_refs Array of submap refs for erase operations
+ * @param num_submaps Total number of submaps
+ */
+template <int CGSize, int BlockSize, typename InputIt, typename AtomicT, typename Ref>
+CUCO_KERNEL void erase(InputIt first,
+                       cuco::detail::index_type n,
+                       AtomicT* num_successes,
+                       Ref* submap_refs,
+                       uint32_t num_submaps)
+{
+  using BlockReduce = cub::BlockReduce<std::size_t, BlockSize>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  std::size_t thread_num_successes = 0;
+
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename cuda::std::iterator_traits<InputIt>::value_type const key{*(first + idx)};
+
+    if constexpr (CGSize == 1) {
+      for (uint32_t i = 0; i < num_submaps; ++i) {
+        if (submap_refs[i].erase(key)) {
+          ++thread_num_successes;
+          break;
+        }
+      }
+    } else {
+      auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+
+      for (uint32_t i = 0; i < num_submaps; ++i) {
+        if (submap_refs[i].erase(tile, key)) {
+          if (tile.thread_rank() == 0) { ++thread_num_successes; }
+          break;
+        }
+      }
+    }
+    idx += loop_stride;
+  }
+
+  // Aggregate success count
+  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  if (threadIdx.x == 0) {
+    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
+  }
+}
+
+/**
+ * @brief Finds the values corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in any submap, copies its associated value to `(output_begin +
+ * i)`. Else, copies the empty value sentinel.
+ *
+ * @tparam CGSize Cooperative group size
+ * @tparam BlockSize The number of threads in the thread block
+ * @tparam InputIt Device accessible input iterator
+ * @tparam OutputIt Device accessible output iterator
+ * @tparam Ref Type of submap device ref
+ *
+ * @param first Beginning of the sequence of keys
+ * @param n Number of keys
+ * @param output_begin Beginning of the sequence of values retrieved for each key
+ * @param submap_refs Array of submap device refs
+ * @param num_submaps The number of submaps in the map
+ */
+template <int CGSize, int BlockSize, typename InputIt, typename OutputIt, typename Ref>
+CUCO_KERNEL void find(InputIt first,
+                      cuco::detail::index_type n,
+                      OutputIt output_begin,
+                      Ref const* submap_refs,
+                      uint32_t num_submaps)
+{
+  using mapped_type = typename Ref::mapped_type;
+
+  auto const empty_value_sentinel = submap_refs[0].empty_value_sentinel();
+
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  __shared__ mapped_type write_buffer[BlockSize];
+
+  while (idx < n) {
+    typename cuda::std::iterator_traits<InputIt>::value_type const key{*(first + idx)};
+    auto found_value = empty_value_sentinel;
+    bool found       = false;
+
+    if constexpr (CGSize == 1) {
+      for (uint32_t i = 0; i < num_submaps && !found; ++i) {
+        auto const result = submap_refs[i].find(key);
+        if (result != submap_refs[i].end()) {
+          found_value = result->second;
+          found       = true;
+        }
+      }
+      write_buffer[threadIdx.x] = found_value;
+      __syncthreads();
+      *(output_begin + idx) = write_buffer[threadIdx.x];
+    } else {
+      auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+
+      for (uint32_t i = 0; i < num_submaps && !found; ++i) {
+        auto const result = submap_refs[i].find(tile, key);
+        if (result != submap_refs[i].end()) {
+          found_value = result->second;
+          found       = true;
+        }
+      }
+
+      if (tile.thread_rank() == 0) { write_buffer[threadIdx.x / CGSize] = found_value; }
+      __syncthreads();
+      if (tile.thread_rank() == 0) { *(output_begin + idx) = write_buffer[threadIdx.x / CGSize]; }
+    }
+    idx += loop_stride;
+  }
+}
+
+/**
+ * @brief Indicates whether the keys in the range `[first, first + n)` are contained in any submap.
+ *
+ * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
+ *
+ * @tparam CGSize Cooperative group size
+ * @tparam BlockSize The number of threads in the thread block
+ * @tparam InputIt Device accessible input iterator
+ * @tparam OutputIt Device accessible output iterator
+ * @tparam Ref Type of submap device ref
+ *
+ * @param first Beginning of the sequence of keys
+ * @param n Number of keys
+ * @param output_begin Beginning of the sequence of booleans for the presence of each key
+ * @param submap_refs Array of submap device refs
+ * @param num_submaps The number of submaps in the map
+ */
+/**
+ * @brief Retrieves all key-value pairs from all submaps.
+ *
+ * Iterates through all slots across all submaps, outputting non-empty/non-erased pairs.
+ *
+ * @tparam BlockSize The number of threads in the thread block
+ * @tparam Key Key type
+ * @tparam Value Mapped value type
+ * @tparam KeyOut Device accessible output iterator for keys
+ * @tparam ValueOut Device accessible output iterator for values
+ * @tparam SlotT Slot type (cuco::pair<Key, Value>)
+ * @tparam AtomicT Atomic counter type
+ *
+ * @param keys_out Beginning output iterator for keys
+ * @param values_out Beginning output iterator for values
+ * @param slot_arrays Array of pointers to each submap's slot storage
+ * @param num_submaps Number of submaps
+ * @param total_capacity Total number of slots across all submaps
+ * @param num_out Pointer to atomic counter for number of retrieved pairs
+ * @param capacity_prefix_sum Prefix sum of submap capacities
+ * @param empty_key_sentinel Sentinel value for empty key
+ * @param erased_key_sentinel Sentinel value for erased key
+ */
+template <int BlockSize,
+          typename Key,
+          typename Value,
+          typename KeyOut,
+          typename ValueOut,
+          typename SlotT,
+          typename AtomicT>
+CUCO_KERNEL void retrieve_all(KeyOut keys_out,
+                              ValueOut values_out,
+                              SlotT const* const* slot_arrays,
+                              uint32_t num_submaps,
+                              cuco::detail::index_type total_capacity,
+                              AtomicT* num_out,
+                              cuco::detail::index_type const* capacity_prefix_sum,
+                              Key empty_key_sentinel,
+                              Key erased_key_sentinel)
+{
+  using BlockScan = cub::BlockScan<unsigned int, BlockSize>;
+  __shared__ typename BlockScan::TempStorage scan_temp_storage;
+  __shared__ unsigned int block_base;
+
+  auto idx = cuco::detail::global_thread_id();
+
+  while ((idx - threadIdx.x) < total_capacity) {
+    // Determine which submap this slot belongs to
+    uint32_t submap_idx = 0;
+    auto submap_offset  = idx;
+
+    if (idx < total_capacity) {
+      while (submap_idx < num_submaps && idx >= capacity_prefix_sum[submap_idx]) {
+        ++submap_idx;
+      }
+      if (submap_idx > 0) { submap_offset = idx - capacity_prefix_sum[submap_idx - 1]; }
+    }
+
+    // Check if slot is filled (not empty and not erased)
+    bool is_filled = false;
+    Key key{};
+    Value value{};
+
+    if (idx < total_capacity && submap_idx < num_submaps) {
+      auto const& slot = slot_arrays[submap_idx][submap_offset];
+      // Use atomic_ref for thread-safe read
+      cuda::atomic_ref<Key const, cuda::thread_scope_device> key_ref(slot.first);
+      key       = key_ref.load(cuda::std::memory_order_relaxed);
+      is_filled = !cuco::detail::bitwise_compare(key, empty_key_sentinel) &&
+                  !cuco::detail::bitwise_compare(key, erased_key_sentinel);
+      if (is_filled) {
+        cuda::atomic_ref<Value const, cuda::thread_scope_device> value_ref(slot.second);
+        value = value_ref.load(cuda::std::memory_order_relaxed);
+      }
+    }
+
+    // Block scan to compute output positions
+    unsigned int local_idx   = 0;
+    unsigned int block_valid = 0;
+    BlockScan(scan_temp_storage).ExclusiveSum(is_filled ? 1u : 0u, local_idx, block_valid);
+
+    if (threadIdx.x == 0) {
+      block_base = num_out->fetch_add(block_valid, cuda::std::memory_order_relaxed);
+    }
+    __syncthreads();
+
+    if (is_filled) {
+      keys_out[block_base + local_idx]   = key;
+      values_out[block_base + local_idx] = value;
+    }
+
+    idx += cuco::detail::grid_stride();
+  }
+}
+
+template <int CGSize, int BlockSize, typename InputIt, typename OutputIt, typename Ref>
+CUCO_KERNEL void contains(InputIt first,
+                          cuco::detail::index_type n,
+                          OutputIt output_begin,
+                          Ref const* submap_refs,
+                          uint32_t num_submaps)
+{
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  __shared__ bool write_buffer[BlockSize];
+
+  while (idx < n) {
+    typename cuda::std::iterator_traits<InputIt>::value_type const key{*(first + idx)};
+    bool found = false;
+
+    if constexpr (CGSize == 1) {
+      for (uint32_t i = 0; i < num_submaps && !found; ++i) {
+        found = submap_refs[i].contains(key);
+      }
+      write_buffer[threadIdx.x] = found;
+      __syncthreads();
+      *(output_begin + idx) = write_buffer[threadIdx.x];
+    } else {
+      auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+
+      for (uint32_t i = 0; i < num_submaps && !found; ++i) {
+        found = submap_refs[i].contains(tile, key);
+      }
+
+      if (tile.thread_rank() == 0) { write_buffer[threadIdx.x / CGSize] = found; }
+      __syncthreads();
+      if (tile.thread_rank() == 0) { *(output_begin + idx) = write_buffer[threadIdx.x / CGSize]; }
+    }
+    idx += loop_stride;
+  }
+}
+
+}  // namespace cuco::detail::dynamic_map_ns
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
deleted file mode 100644
index 6d3f9b9c7..000000000
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ /dev/null
@@ -1,709 +0,0 @@
-/*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuco/detail/bitwise_compare.cuh>
-#include <cuco/detail/utility/cuda.cuh>
-
-#include <cub/block/block_reduce.cuh>
-#include <cub/block/block_scan.cuh>
-#include <cuda/std/atomic>
-
-#include <cooperative_groups.h>
-
-namespace cuco {
-namespace detail {
-namespace cg = cooperative_groups;
-
-CUCO_SUPPRESS_KERNEL_WARNINGS
-
-/**
- * @brief Inserts all key/value pairs in the range `[first, last)`.
- *
- * If multiple keys in `[first, last)` compare equal, it is unspecified which
- * element is inserted.
- *
- * @tparam block_size
- * @tparam pair_type Type of the pairs contained in the map
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `value_type`
- * @tparam viewT Type of the `static_map` device views
- * @tparam mutableViewT Type of the `static_map` device mutable views
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of key/value pairs
- * @param last End of the sequence of key/value pairs
- * @param submap_views Array of `static_map::device_view` objects used to
- * perform `contains` operations on each underlying `static_map`
- * @param submap_mutable_views Array of `static_map::device_mutable_view` objects
- * used to perform an `insert` into the target `static_map` submap
- * @param num_successes The number of successfully inserted key/value pairs
- * @param insert_idx The index of the submap we are inserting into
- * @param num_submaps The total number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function used to compare two keys for equality
- */
-template <uint32_t block_size,
-          typename pair_type,
-          typename InputIt,
-          typename viewT,
-          typename mutableViewT,
-          typename atomicT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void insert(InputIt first,
-                        InputIt last,
-                        viewT* submap_views,
-                        mutableViewT* submap_mutable_views,
-                        atomicT* num_successes,
-                        uint32_t insert_idx,
-                        uint32_t num_submaps,
-                        Hash hash,
-                        KeyEqual key_equal)
-{
-  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_successes = 0;
-
-  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-  while (first + tid < last) {
-    pair_type insert_pair = *(first + tid);
-    auto exists           = false;
-
-    // manually check for duplicates in those submaps we are not inserting into
-    for (auto i = 0; i < num_submaps; ++i) {
-      if (i != insert_idx) {
-        exists = submap_views[i].contains(insert_pair.first, hash, key_equal);
-        if (exists) { break; }
-      }
-    }
-    if (!exists) {
-      if (submap_mutable_views[insert_idx].insert(insert_pair, hash, key_equal)) {
-        thread_num_successes++;
-      }
-    }
-
-    tid += gridDim.x * blockDim.x;
-  }
-
-  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) {
-    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
-  }
-}
-
-/**
- * @brief Inserts all key/value pairs in the range `[first, last)`.
- *
- * If multiple keys in `[first, last)` compare equal, it is unspecified which
- * element is inserted. Uses the CUDA Cooperative Groups API to leverage groups
- * of multiple threads to perform each key/value insertion. This provides a
- * significant boost in throughput compared to the non Cooperative Group
- * `insert` at moderate to high load factors.
- *
- * @tparam block_size
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform
- * inserts
- * @tparam pair_type Type of the pairs contained in the map
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `value_type`
- * @tparam viewT Type of the `static_map` device views
- * @tparam mutableViewT Type of the `static_map` device mutable views
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of key/value pairs
- * @param last End of the sequence of key/value pairs
- * @param submap_views Array of `static_map::device_view` objects used to
- * perform `contains` operations on each underlying `static_map`
- * @param submap_mutable_views Array of `static_map::device_mutable_view` objects
- * used to perform an `insert` into the target `static_map` submap
- * @param submap_num_successes The number of successfully inserted key/value pairs for each submap
- * @param insert_idx The index of the submap we are inserting into
- * @param num_submaps The total number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function used to compare two keys for equality
- */
-template <uint32_t block_size,
-          uint32_t tile_size,
-          typename pair_type,
-          typename InputIt,
-          typename viewT,
-          typename mutableViewT,
-          typename atomicT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void insert(InputIt first,
-                        InputIt last,
-                        viewT* submap_views,
-                        mutableViewT* submap_mutable_views,
-                        atomicT** submap_num_successes,
-                        uint32_t insert_idx,
-                        uint32_t num_submaps,
-                        Hash hash,
-                        KeyEqual key_equal)
-{
-  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_successes = 0;
-
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  auto tid  = blockDim.x * blockIdx.x + threadIdx.x;
-  auto it   = first + tid / tile_size;
-
-  while (it < last) {
-    pair_type insert_pair = *it;
-    auto exists           = false;
-
-    // manually check for duplicates in those submaps we are not inserting into
-    for (auto i = 0; i < num_submaps; ++i) {
-      if (i != insert_idx) {
-        exists = submap_views[i].contains(tile, insert_pair.first, hash, key_equal);
-        if (exists) { break; }
-      }
-    }
-    if (!exists) {
-      if (submap_mutable_views[insert_idx].insert(tile, insert_pair, hash, key_equal) &&
-          tile.thread_rank() == 0) {
-        thread_num_successes++;
-      }
-    }
-
-    it += (gridDim.x * blockDim.x) / tile_size;
-  }
-
-  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) {
-    submap_num_successes[insert_idx]->fetch_add(block_num_successes,
-                                                cuda::std::memory_order_relaxed);
-  }
-}
-
-/**
- * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
-   insertions.
- * Else, no effect.
- *
- * @tparam block_size The size of the thread block
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam mutableViewT Type of device view allowing modification of hash map storage
- * @tparam atomicT Type of atomic storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
- * perform `erase` operations on each underlying `static_map`
- * @param num_successes The number of successfully erased key/value pairs
- * @param submap_num_successes The number of successfully erased key/value pairs
- * in each submap
- * @param num_submaps The number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          typename InputIt,
-          typename mutableViewT,
-          typename atomicT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void erase(InputIt first,
-                       InputIt last,
-                       mutableViewT* submap_mutable_views,
-                       atomicT** submap_num_successes,
-                       uint32_t num_submaps,
-                       Hash hash,
-                       KeyEqual key_equal)
-{
-  extern __shared__ unsigned long long submap_block_num_successes[];
-
-  auto tid = block_size * blockIdx.x + threadIdx.x;
-  auto it  = first + tid;
-
-  for (auto i = threadIdx.x; i < num_submaps; i += block_size) {
-    submap_block_num_successes[i] = 0;
-  }
-  __syncthreads();
-
-  while (it < last) {
-    for (auto i = 0; i < num_submaps; ++i) {
-      if (submap_mutable_views[i].erase(*it, hash, key_equal)) {
-        atomicAdd(&submap_block_num_successes[i], 1);
-        break;
-      }
-    }
-    it += gridDim.x * blockDim.x;
-  }
-  __syncthreads();
-
-  for (auto i = 0; i < num_submaps; ++i) {
-    if (threadIdx.x == 0) {
-      submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
-                                         cuda::std::memory_order_relaxed);
-    }
-  }
-}
-
-/**
- * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
- * insertions.
- * Else, no effect.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam mutableViewT Type of device view allowing modification of hash map storage
- * @tparam atomicT Type of atomic storage
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
- * perform `erase` operations on each underlying `static_map`
- * @param num_successes The number of successfully erased key/value pairs
- * @param submap_num_successes The number of successfully erased key/value pairs
- * in each submap
- * @param num_submaps The number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename mutableViewT,
-          typename atomicT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void erase(InputIt first,
-                       InputIt last,
-                       mutableViewT* submap_mutable_views,
-                       atomicT** submap_num_successes,
-                       uint32_t num_submaps,
-                       Hash hash,
-                       KeyEqual key_equal)
-{
-  extern __shared__ unsigned long long submap_block_num_successes[];
-
-  auto block = cg::this_thread_block();
-  auto tile  = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  auto tid   = block_size * block.group_index().x + block.thread_rank();
-  auto it    = first + tid / tile_size;
-
-  for (auto i = threadIdx.x; i < num_submaps; i += block_size) {
-    submap_block_num_successes[i] = 0;
-  }
-  block.sync();
-
-  while (it < last) {
-    auto erased = false;
-    int i       = 0;
-    for (i = 0; i < num_submaps; ++i) {
-      erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
-      if (erased) { break; }
-    }
-    if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); }
-    it += (gridDim.x * blockDim.x) / tile_size;
-  }
-  block.sync();
-
-  for (auto i = 0; i < num_submaps; ++i) {
-    if (threadIdx.x == 0) {
-      submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
-                                         cuda::std::memory_order_relaxed);
-    }
-  }
-}
-
-/**
- * @brief Finds the values corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`.
- * Else, copies the empty value sentinel.
- *
- * @tparam block_size The number of threads in the thread block
- * @tparam Value The mapped value type for the map
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of `static_map` device view
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param output_begin Beginning of the sequence of values retrieved for each key
- * @param submap_views Array of `static_map::device_view` objects used to
- * perform `find` operations on each underlying `static_map`
- * @param num_submaps The number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          typename Value,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void find(InputIt first,
-                      InputIt last,
-                      OutputIt output_begin,
-                      viewT* submap_views,
-                      uint32_t num_submaps,
-                      Hash hash,
-                      KeyEqual key_equal)
-{
-  auto tid                  = blockDim.x * blockIdx.x + threadIdx.x;
-  auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
-  __shared__ Value writeBuffer[block_size];
-
-  while (first + tid < last) {
-    auto key         = *(first + tid);
-    auto found_value = empty_value_sentinel;
-    for (auto i = 0; i < num_submaps; ++i) {
-      auto submap_view = submap_views[i];
-      auto found       = submap_view.find(key, hash, key_equal);
-      if (found != submap_view.end()) {
-        found_value = found->second.load(cuda::std::memory_order_relaxed);
-        break;
-      }
-    }
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    writeBuffer[threadIdx.x] = found_value;
-    __syncthreads();
-    *(output_begin + tid) = writeBuffer[threadIdx.x];
-    tid += gridDim.x * blockDim.x;
-  }
-}
-
-/**
- * @brief Finds the values corresponding to all keys in the range `[first, last)`.
- *
- * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`.
- * Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage groups
- * of multiple threads to find each key. This provides a significant boost in throughput compared
- * to the non Cooperative Group `find` at moderate to high load factors.
- *
- * @tparam block_size The number of threads in the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to
- * perform find operations
- * @tparam Value The mapped value type for the map
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of `static_map` device view
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param output_begin Beginning of the sequence of values retrieved for each key
- * @param submap_views Array of `static_map::device_view` objects used to
- * perform `find` operations on each underlying `static_map`
- * @param num_submaps The number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          uint32_t tile_size,
-          typename Value,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void find(InputIt first,
-                      InputIt last,
-                      OutputIt output_begin,
-                      viewT* submap_views,
-                      uint32_t num_submaps,
-                      Hash hash,
-                      KeyEqual key_equal)
-{
-  auto tile    = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  auto tid     = blockDim.x * blockIdx.x + threadIdx.x;
-  auto key_idx = tid / tile_size;
-  auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
-  __shared__ Value writeBuffer[block_size];
-
-  while (first + key_idx < last) {
-    auto key         = *(first + key_idx);
-    auto found_value = empty_value_sentinel;
-    for (auto i = 0; i < num_submaps; ++i) {
-      auto submap_view = submap_views[i];
-      auto found       = submap_view.find(tile, key, hash, key_equal);
-      if (found != submap_view.end()) {
-        found_value = found->second.load(cuda::std::memory_order_relaxed);
-        break;
-      }
-    }
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found_value; }
-    __syncthreads();
-    if (tile.thread_rank() == 0) {
-      *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size];
-    }
-    key_idx += (gridDim.x * blockDim.x) / tile_size;
-  }
-}
-
-/**
- * @brief Retrieves all of the keys and their associated values.
- *
- * The order in which keys are returned is implementation defined and not guaranteed to be
- * consistent between subsequent calls to `retrieve_all`.
- *
- * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than
- * `get_size()`
- *
- * @tparam block_size The number of threads in the thread block
- * @tparam KeyOutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam ValueOutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of `static_map` device view
- * @tparam AtomicT Atomic counter type
- *
- * @param keys_out Beginning output iterator for keys
- * @param values_out Beginning output iterator for values
- * @param submap_views Array of `static_map::device_view` objects used to
- * perform `retrieve_all` operations on each underlying `static_map`
- * @param num_submaps The number of submaps in the map
- * @param capacity The total number of slots of all submaps
- * @param d_num_out Pointer to the device memory location where the number of keys/vals retrieved
- * are stored
- * @param cap_prefix_sum Array of prefix sums of the number of slots in each submap
- * @return Pair of iterators indicating the last elements in the output
- */
-template <uint32_t block_size,
-          typename KeyOutputIt,
-          typename ValueOutputIt,
-          typename viewT,
-          typename AtomicT>
-CUCO_KERNEL void retrieve_all(KeyOutputIt keys_out,
-                              ValueOutputIt values_out,
-                              viewT* submap_views,
-                              uint32_t num_submaps,
-                              uint64_t capacity,
-                              AtomicT* d_num_out,
-                              size_t* cap_prefix_sum)
-{
-  using BlockScan = cub::BlockScan<unsigned int, block_size>;
-
-  __shared__ typename BlockScan::TempStorage scan_temp_storage;
-  __shared__ unsigned int block_base;
-
-  auto tid                       = blockDim.x * blockIdx.x + threadIdx.x;
-  auto const empty_key_sentinel  = submap_views[0].get_empty_key_sentinel();
-  auto const erased_key_sentinel = submap_views[0].get_erased_key_sentinel();
-
-  while ((tid - threadIdx.x) < capacity) {
-    uint32_t submap_idx    = 0;
-    uint32_t submap_offset = tid;
-    while (tid >= cap_prefix_sum[submap_idx] && submap_idx < num_submaps) {
-      ++submap_idx;
-    }
-    if (submap_idx > 0) { submap_offset = tid - cap_prefix_sum[submap_idx - 1]; }
-
-    auto const& current_slot = submap_views[submap_idx].get_slots()[submap_offset];
-    auto const existing_key  = current_slot.first.load(cuda::std::memory_order_relaxed);
-
-    bool const is_filled = not(cuco::detail::bitwise_compare(existing_key, empty_key_sentinel) or
-                               cuco::detail::bitwise_compare(existing_key, erased_key_sentinel));
-
-    unsigned int local_idx   = 0;
-    unsigned int block_valid = 0;
-    BlockScan(scan_temp_storage).ExclusiveSum(is_filled ? 1u : 0u, local_idx, block_valid);
-
-    if (threadIdx.x == 0) {
-      block_base = d_num_out->fetch_add(block_valid, cuda::memory_order_relaxed);
-    }
-    __syncthreads();
-
-    if (is_filled) {
-      auto const value                 = current_slot.second.load(cuda::std::memory_order_relaxed);
-      keys_out[block_base + local_idx] = existing_key;
-      values_out[block_base + local_idx] = value;
-    }
-    tid += gridDim.x * blockDim.x;
-  }
-}
-
-/**
- * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
- *
- * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
- *
- * @tparam block_size The number of threads in the thread block
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of `static_map` device view
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param output_begin Beginning of the sequence of booleans for the presence of each key
- * @param submap_views Array of `static_map::device_view` objects used to
- * perform `contains` operations on each underlying `static_map`
- * @param num_submaps The number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void contains(InputIt first,
-                          InputIt last,
-                          OutputIt output_begin,
-                          viewT* submap_views,
-                          uint32_t num_submaps,
-                          Hash hash,
-                          KeyEqual key_equal)
-{
-  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
-  __shared__ bool writeBuffer[block_size];
-
-  while (first + tid < last) {
-    auto key   = *(first + tid);
-    auto found = false;
-    for (auto i = 0; i < num_submaps; ++i) {
-      found = submap_views[i].contains(key, hash, key_equal);
-      if (found) { break; }
-    }
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    writeBuffer[threadIdx.x] = found;
-    __syncthreads();
-    *(output_begin + tid) = writeBuffer[threadIdx.x];
-    tid += gridDim.x * blockDim.x;
-  }
-}
-
-/**
- * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
- *
- * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the
- * contains operation for each key. This provides a significant boost in throughput compared
- * to the non Cooperative Group `contains` at moderate to high load factors.
- *
- * @tparam block_size The number of threads in the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to
- * perform find operations
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * convertible to the map's `mapped_type`
- * @tparam viewT Type of `static_map` device view
- * @tparam Hash Unary callable type
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param last End of the sequence of keys
- * @param output_begin Beginning of the sequence of booleans for the presence of each key
- * @param submap_views Array of `static_map::device_view` objects used to
- * perform `contains` operations on each underlying `static_map`
- * @param num_submaps The number of submaps in the map
- * @param hash The unary function to apply to hash each key
- * @param key_equal The binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Hash,
-          typename KeyEqual>
-CUCO_KERNEL void contains(InputIt first,
-                          InputIt last,
-                          OutputIt output_begin,
-                          viewT* submap_views,
-                          uint32_t num_submaps,
-                          Hash hash,
-                          KeyEqual key_equal)
-{
-  auto tile    = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  auto tid     = blockDim.x * blockIdx.x + threadIdx.x;
-  auto key_idx = tid / tile_size;
-  __shared__ bool writeBuffer[block_size];
-
-  while (first + key_idx < last) {
-    auto key   = *(first + key_idx);
-    auto found = false;
-    for (auto i = 0; i < num_submaps; ++i) {
-      found = submap_views[i].contains(tile, key, hash, key_equal);
-      if (found) { break; }
-    }
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; }
-    __syncthreads();
-    if (tile.thread_rank() == 0) {
-      *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size];
-    }
-    key_idx += (gridDim.x * blockDim.x) / tile_size;
-  }
-}
-}  // namespace detail
-}  // namespace cuco
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 9eb1fa7e4..5b7839cbd 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,24 +16,21 @@
 
 #pragma once
 
-#include <cuco/detail/dynamic_map_kernels.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/static_map.cuh>
 #include <cuco/types.cuh>
 
-#include <cuda/std/atomic>
 #include <cuda/std/functional>
-#include <thrust/device_vector.h>
 
 #include <cstddef>
 #include <memory>
 #include <numeric>
 #include <type_traits>
+#include <utility>
 #include <vector>
 
 namespace cuco {
 
-namespace experimental {
 /**
  * @brief A GPU-accelerated, unordered, associative container of key-value
  * pairs with unique keys.
@@ -68,6 +65,7 @@ class dynamic_map {
   using value_type  = typename map_type::value_type;  ///< Key-value pair type
   using size_type   = typename map_type::size_type;   ///< Size type
   using key_equal   = typename map_type::key_equal;   ///< Key equality comparator type
+  using hasher      = typename map_type::hasher;      ///< Hash function type
   using mapped_type = T;                              ///< Payload type
 
   dynamic_map(dynamic_map const&)            = delete;
@@ -84,7 +82,7 @@ class dynamic_map {
   ~dynamic_map()                        = default;
 
   /**
-   * @brief Constructs a dynamically-sized map with erase capability.
+   * @brief Constructs a dynamically-sized map.
    *
    * The capacity of the map will automatically increase as the user adds key/value pairs using
    * `insert`.
@@ -113,178 +111,6 @@ class dynamic_map {
                         Allocator const& alloc              = {},
                         cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
-  /**
-   * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs.
-   *
-   * If there is already enough space for `n` key/value pairs, the capacity remains the same.
-   *
-   * @param n The number of key value pairs for which there must be space
-   * @param stream Stream used for executing the kernels
-   */
-  void reserve(size_type n, cuda::stream_ref stream);
-
-  /**
-   * @brief Inserts all key/value pairs in the range `[first, last)`.
-   *
-   * If multiple keys in `[first, last)` compare equal, it is unspecified which
-   * element is inserted.
-   *
-   * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `value_type`
-   * @param first Beginning of the sequence of key/value pairs
-   * @param last End of the sequence of key/value pairs
-   * @param stream Stream used for executing the kernels
-   */
-  template <typename InputIt>
-  void insert(InputIt first,
-              InputIt last,
-              cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
-
-  /**
-   * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
-   *
-   * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
-   *
-   * @tparam InputIt Device accessible input iterator
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * convertible to the map's `mapped_type`
-   *
-   * @param first Beginning of the sequence of keys
-   * @param last End of the sequence of keys
-   * @param output_begin Beginning of the sequence of booleans for the presence of each key
-   * @param stream Stream used for executing the kernels
-   */
-  template <typename InputIt, typename OutputIt>
-  void contains(InputIt first,
-                InputIt last,
-                OutputIt output_begin,
-                cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const;
-
- private:
-  size_type size_{};      ///< Number of keys in the map
-  size_type capacity_{};  ///< Maximum number of keys that can be inserted
-
-  std::vector<std::unique_ptr<map_type>> submaps_;  ///< vector of pointers to each submap
-  size_type min_insert_size_{};                     ///< min remaining capacity of submap for insert
-  float max_load_factor_{};                         ///< Maximum load factor
-  Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
-};
-
-}  // namespace experimental
-
-/**
- * @brief A GPU-accelerated, unordered, associative container of key-value
- * pairs with unique keys
- *
- * Automatically grows capacity as necessary until device memory runs out.
- *
- * Allows constant time concurrent inserts or concurrent find operations (not
- * concurrent insert and find) from threads in device code.
- *
- * Current limitations:
- * - Requires keys and values that where `cuco::is_bitwise_comparable_v<T>` is true
- *    - Comparisons against the "sentinel" values will always be done with bitwise comparisons.
- * - Capacity does not shrink automatically
- * - Requires the user to specify sentinel values for both key and mapped value
- *   to indicate empty slots
- * - Does not support concurrent insert and find operations
- *
- * The `dynamic_map` supports host-side "bulk" operations which include `insert`, `find`
- * and `contains`. These are to be used when there are a large number of keys to insert
- * or lookup in the map. For example, given a range of keys specified by device-accessible
- * iterators, the bulk `insert` function will insert all keys into the map.
- *
- * Example:
- * \code{.cpp}
- * int empty_key_sentinel = -1;
- * int empty_value_sentinel = -1;
- *
- * // Constructs a map with 100,000 initial slots using -1 and -1 as the empty key/value
- * // sentinels. Performs one bulk insert of 50,000 keys and a second bulk insert of
- * // 100,000 keys. The map automatically increases capacity to accomodate the excess keys
- * // within the second insert.
- *
- * dynamic_map<int, int> m{100'000,
- *                         empty_key<int>{empty_key_sentinel},
- *                         empty_value<int>{empty_value_sentinel}};
- *
- * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}}
- * thrust::device_vector<cuda::std::pair<int,int>> pairs_0(50'000);
- * thrust::transform(thrust::make_counting_iterator(0),
- *                   thrust::make_counting_iterator(pairs_0.size()),
- *                   pairs_0.begin(),
- *                   []__device__(auto i){ return cuco::pair{i,i}; };
- *
- * thrust::device_vector<cuda::std::pair<int,int>> pairs_1(100'000);
- * thrust::transform(thrust::make_counting_iterator(50'000),
- *                   thrust::make_counting_iterator(pairs_1.size()),
- *                   pairs_1.begin(),
- *                   []__device__(auto i){ return cuco::pair{i,i}; };
- *
- * // Inserts all pairs into the map
- * m.insert(pairs_0.begin(), pairs_0.end());
- * m.insert(pairs_1.begin(), pairs_1.end());
- * \endcode
- *
- * @tparam Key Arithmetic type used for key
- * @tparam Value Type of the mapped values
- * @tparam Scope The scope in which insert/find/contains will be performed by
- * individual threads.
- * @tparam Allocator Type of allocator used to allocate submap device storage
- */
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope = cuda::thread_scope_device,
-          typename Allocator       = cuco::cuda_allocator<char>>
-class dynamic_map {
-  static_assert(std::is_arithmetic<Key>::value, "Unsupported, non-arithmetic key type.");
-
- public:
-  using value_type      = cuco::pair<Key, Value>;            ///< Type of key/value pairs
-  using key_type        = Key;                               ///< Key type
-  using mapped_type     = Value;                             ///< Type of mapped values
-  using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Atomic counter type
-  using view_type =
-    typename cuco::legacy::static_map<Key, Value, Scope>::device_view;  ///< Type for submap device
-                                                                        ///< view
-  using mutable_view_type =
-    typename cuco::legacy::static_map<Key, Value, Scope>::device_mutable_view;  ///< Type for submap
-                                                                                ///< mutable device
-                                                                                ///< view
-
-  dynamic_map(dynamic_map const&) = delete;
-  dynamic_map(dynamic_map&&)      = delete;
-
-  dynamic_map& operator=(dynamic_map const&) = delete;
-  dynamic_map& operator=(dynamic_map&&)      = delete;
-
-  /**
-   * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor
-   * and sentinel values.
-   *
-   * The capacity of the map will automatically increase as the user adds key/value pairs using
-   * `insert`.
-   *
-   * Capacity increases by a factor of growth_factor each time the size of the map exceeds a
-   * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the
-   * map's capacity grows.
-   *
-   * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
-   * undefined behavior results from attempting to insert any key/value pair
-   * that contains either.
-   *
-   * @param initial_capacity The initial number of slots in the map
-   * @param empty_key_sentinel The reserved key value for empty slots
-   * @param empty_value_sentinel The reserved mapped value for empty slots
-   * @param alloc Allocator used to allocate submap device storage
-   * @param stream Stream used for executing the kernels
-   */
-  dynamic_map(std::size_t initial_capacity,
-              empty_key<Key> empty_key_sentinel,
-              empty_value<Value> empty_value_sentinel,
-              Allocator const& alloc = Allocator{},
-              cudaStream_t stream    = nullptr);
-
   /**
    * @brief Constructs a dynamically-sized map with erase capability.
    *
@@ -292,7 +118,7 @@ class dynamic_map {
    * `insert`.
    *
    * Capacity increases by a factor of growth_factor each time the size of the map exceeds a
-   * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the
+   * threshold occupancy. The performance of `find` and `contains` gradually decreases each time the
    * map's capacity grows.
    *
    * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
@@ -303,24 +129,26 @@ class dynamic_map {
    * @param empty_key_sentinel The reserved key value for empty slots
    * @param empty_value_sentinel The reserved mapped value for empty slots
    * @param erased_key_sentinel The reserved key value for erased slots
-   * @param alloc Allocator used to allocate submap device storage
-   * @param stream Stream used for executing the kernels
+   * @param pred Key equality binary predicate
+   * @param probing_scheme Probing scheme
+   * @param scope The scope in which operations will be performed
+   * @param storage Kind of storage to use
+   * @param alloc Allocator used for allocating device storage
+   * @param stream CUDA stream used to initialize the map
    *
    * @throw std::runtime error if the empty key sentinel and erased key sentinel
    * are the same value
    */
-  dynamic_map(std::size_t initial_capacity,
-              empty_key<Key> empty_key_sentinel,
-              empty_value<Value> empty_value_sentinel,
-              erased_key<Key> erased_key_sentinel,
-              Allocator const& alloc = Allocator{},
-              cudaStream_t stream    = nullptr);
-
-  /**
-   * @brief Destroys the map and frees its contents
-   *
-   */
-  ~dynamic_map() {}
+  constexpr dynamic_map(Extent initial_capacity,
+                        empty_key<Key> empty_key_sentinel,
+                        empty_value<T> empty_value_sentinel,
+                        erased_key<Key> erased_key_sentinel,
+                        KeyEqual const& pred                = {},
+                        ProbingScheme const& probing_scheme = {},
+                        cuda_thread_scope<Scope> scope      = {},
+                        Storage storage                     = {},
+                        Allocator const& alloc              = {},
+                        cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
    * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs.
@@ -330,75 +158,111 @@ class dynamic_map {
    * @param n The number of key value pairs for which there must be space
    * @param stream Stream used for executing the kernels
    */
-  void reserve(std::size_t n, cudaStream_t stream = nullptr);
+  void reserve(size_type n, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
    * @brief Inserts all key/value pairs in the range `[first, last)`.
    *
+   * @note This function synchronizes the given stream.
+   *
    * If multiple keys in `[first, last)` compare equal, it is unspecified which
    * element is inserted.
    *
    * @tparam InputIt Device accessible input iterator whose `value_type` is
    * convertible to the map's `value_type`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
    * @param first Beginning of the sequence of key/value pairs
    * @param last End of the sequence of key/value pairs
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
    * @param stream Stream used for executing the kernels
    */
-  template <typename InputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
+  template <typename InputIt>
   void insert(InputIt first,
               InputIt last,
-              Hash hash           = Hash{},
-              KeyEqual key_equal  = KeyEqual{},
-              cudaStream_t stream = nullptr);
+              cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
+
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k`
+   * already exists in the map, assigns `v` to the mapped_type corresponding to the key `k`.
+   * If the key does not exist, inserts the pair as if by insert.
+   *
+   * @note This function synchronizes the given stream.
+   * @note If multiple pairs in `[first, last)` compare equal, it is unspecified which pair is
+   * inserted or assigned.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * dynamic_map<K, V>::value_type></tt> is `true`
+   *
+   * @param first Beginning of the sequence of key/value pairs
+   * @param last End of the sequence of key/value pairs
+   * @param stream CUDA stream used for the operation
+   */
+  template <typename InputIt>
+  void insert_or_assign(InputIt first,
+                        InputIt last,
+                        cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
    * @brief Erases keys in the range `[first, last)`.
    *
-   * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's
-   * associated value from the map. Else, no effect.
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `erase_async`.
    *
-   *  Side-effects:
-   *  - `contains(k) == false`
-   *  - `find(k) == end()`
-   *  - `insert({k,v}) == true`
-   *  - `get_size()` is reduced by the total number of erased keys
+   * For each key `k` in `[first, last)`, if `contains(k) == true`, removes `k` and its
+   * associated value from the map. Else, no effect.
    *
-   * This function synchronizes `stream`.
+   * Side-effects:
+   * - `contains(k) == false`
+   * - `find(k) == end()`
+   * - `insert({k,v}) == true`
+   * - `size()` is reduced by the total number of erased keys
    *
    * Keep in mind that `erase` does not cause the map to shrink its memory allocation.
    *
    * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `value_type`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
+   * convertible to the map's `key_type`
    *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
    * @param stream Stream used for executing the kernels
    *
    * @throw std::runtime_error if a unique erased key sentinel value was not
    * provided at construction
    */
-  template <typename InputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
+  template <typename InputIt>
   void erase(InputIt first,
              InputIt last,
-             Hash hash           = Hash{},
-             KeyEqual key_equal  = KeyEqual{},
-             cudaStream_t stream = nullptr);
+             cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
+
+  /**
+   * @brief Asynchronously erases keys in the range `[first, last)`.
+   *
+   * For each key `k` in `[first, last)`, if `contains(k) == true`, removes `k` and its
+   * associated value from the map. Else, no effect.
+   *
+   * @note `size()` will not be updated. Use the synchronous `erase` if you need accurate size
+   * tracking.
+   *
+   * @tparam InputIt Device accessible input iterator whose `value_type` is
+   * convertible to the map's `key_type`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param stream Stream used for executing the kernels
+   *
+   * @throw std::runtime_error if a unique erased key sentinel value was not
+   * provided at construction
+   */
+  template <typename InputIt>
+  void erase_async(InputIt first,
+                   InputIt last,
+                   cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
    * @brief Finds the values corresponding to all keys in the range `[first, last)`.
    *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `find_async`.
+   *
    * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin +
    * i)`. Else, copies the empty value sentinel.
    *
@@ -406,122 +270,191 @@ class dynamic_map {
    * convertible to the map's `key_type`
    * @tparam OutputIt Device accessible output iterator whose `value_type` is
    * convertible to the map's `mapped_type`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
    *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param output_begin Beginning of the sequence of values retrieved for each key
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
    * @param stream Stream used for executing the kernels
    */
-  template <typename InputIt,
-            typename OutputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
+  template <typename InputIt, typename OutputIt>
   void find(InputIt first,
             InputIt last,
             OutputIt output_begin,
-            Hash hash           = Hash{},
-            KeyEqual key_equal  = KeyEqual{},
-            cudaStream_t stream = nullptr);
+            cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const;
 
   /**
-   * @brief Retrieves all of the keys and their associated values.
+   * @brief Asynchronously finds the values corresponding to all keys in the range `[first, last)`.
    *
-   * The order in which keys are returned is implementation defined and not guaranteed to be
-   * consistent between subsequent calls to `retrieve_all`.
+   * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin +
+   * i)`. Else, copies the empty value sentinel.
    *
-   * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than
-   * `get_size()`
+   * @tparam InputIt Device accessible input iterator whose `value_type` is
+   * convertible to the map's `key_type`
+   * @tparam OutputIt Device accessible output iterator whose `value_type` is
+   * convertible to the map's `mapped_type`
    *
-   * @tparam KeyOut Device accessible random access output iterator whose `value_type` is
-   * convertible from `key_type`.
-   * @tparam ValueOut Device accessible random access output iterator whose `value_type` is
-   * convertible from `mapped_type`.
-   * @param keys_out Beginning output iterator for keys
-   * @param values_out Beginning output iterator for values
-   * @param stream CUDA stream used for this operation
-   * @return Pair of iterators indicating the last elements in the output
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param output_begin Beginning of the sequence of values retrieved for each key
+   * @param stream Stream used for executing the kernels
    */
-  template <typename KeyOut, typename ValueOut>
-  std::pair<KeyOut, ValueOut> retrieve_all(KeyOut keys_out,
-                                           ValueOut values_out,
-                                           cudaStream_t stream = 0) const;
+  template <typename InputIt, typename OutputIt>
+  void find_async(InputIt first,
+                  InputIt last,
+                  OutputIt output_begin,
+                  cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const;
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
    *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `contains_async`.
+   *
    * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
    *
-   * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `key_type`
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * convertible to the map's `mapped_type`
-   * @tparam Hash Unary callable type
-   * @tparam KeyEqual Binary callable type
+   * @tparam InputIt Device accessible input iterator
+   * @tparam OutputIt Device accessible output iterator assignable from `bool`
    *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param output_begin Beginning of the sequence of booleans for the presence of each key
-   * @param hash The unary function to apply to hash each key
-   * @param key_equal The binary function to compare two keys for equality
    * @param stream Stream used for executing the kernels
    */
-  template <typename InputIt,
-            typename OutputIt,
-            typename Hash     = cuco::default_hash_function<key_type>,
-            typename KeyEqual = cuda::std::equal_to<key_type>>
+  template <typename InputIt, typename OutputIt>
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                Hash hash           = Hash{},
-                KeyEqual key_equal  = KeyEqual{},
-                cudaStream_t stream = nullptr);
+                cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const;
+
+  /**
+   * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
+   * the map.
+   *
+   * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the map.
+   *
+   * @tparam InputIt Device accessible input iterator
+   * @tparam OutputIt Device accessible output iterator assignable from `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param output_begin Beginning of the sequence of booleans for the presence of each key
+   * @param stream Stream used for executing the kernels
+   */
+  template <typename InputIt, typename OutputIt>
+  void contains_async(InputIt first,
+                      InputIt last,
+                      OutputIt output_begin,
+                      cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}) const;
+
+  /**
+   * @brief Retrieves all of the keys and their associated values.
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * The order in which keys are returned is implementation defined and not guaranteed to be
+   * consistent between subsequent calls to `retrieve_all`.
+   *
+   * Behavior is undefined if the range beginning at `keys_out` or `values_out` is less than
+   * `size()`
+   *
+   * @tparam KeyOut Device accessible random access output iterator whose `value_type` is
+   * convertible from `key_type`.
+   * @tparam ValueOut Device accessible random access output iterator whose `value_type` is
+   * convertible from `mapped_type`.
+   * @param keys_out Beginning output iterator for keys
+   * @param values_out Beginning output iterator for values
+   * @param stream CUDA stream used for this operation
+   * @return Pair of iterators indicating the last elements in the output
+   */
+  template <typename KeyOut, typename ValueOut>
+  std::pair<KeyOut, ValueOut> retrieve_all(KeyOut keys_out,
+                                           ValueOut values_out,
+                                           cuda::stream_ref stream = cuda::stream_ref{
+                                             cudaStream_t{nullptr}}) const;
 
   /**
    * @brief Gets the current number of elements in the map
    *
    * @return The current number of elements in the map
    */
-  std::size_t get_size() const noexcept { return size_; }
+  [[nodiscard]] size_type size() const noexcept { return size_; }
 
   /**
    * @brief Gets the maximum number of elements the hash map can hold.
    *
    * @return The maximum number of elements the hash map can hold
    */
-  std::size_t get_capacity() const noexcept { return capacity_; }
+  [[nodiscard]] size_type capacity() const noexcept { return capacity_; }
 
   /**
    * @brief Gets the load factor of the hash map.
    *
    * @return The load factor of the hash map
    */
-  float get_load_factor() const noexcept { return static_cast<float>(size_) / capacity_; }
+  /**
+   * @brief Gets the current load factor of the map
+   *
+   * @return The current load factor of the map
+   */
+  [[nodiscard]] float load_factor() const noexcept { return static_cast<float>(size_) / capacity_; }
+
+  /**
+   * @brief Gets the sentinel value used to represent an empty key slot.
+   *
+   * @return The sentinel value used to represent an empty key slot
+   */
+  [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept
+  {
+    return submaps_.front()->empty_key_sentinel();
+  }
+
+  /**
+   * @brief Gets the sentinel value used to represent an empty value slot.
+   *
+   * @return The sentinel value used to represent an empty value slot
+   */
+  [[nodiscard]] constexpr mapped_type empty_value_sentinel() const noexcept
+  {
+    return submaps_.front()->empty_value_sentinel();
+  }
+
+  /**
+   * @brief Gets the sentinel value used to represent an erased key slot.
+   *
+   * @return The sentinel value used to represent an erased key slot
+   */
+  [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept
+  {
+    return submaps_.front()->erased_key_sentinel();
+  }
+
+  /**
+   * @brief Gets the function used to compare keys for equality
+   *
+   * @return The function used to compare keys for equality
+   */
+  [[nodiscard]] constexpr key_equal key_eq() const noexcept { return submaps_.front()->key_eq(); }
+
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] constexpr hasher hash_function() const noexcept
+  {
+    return submaps_.front()->hash_function();
+  }
 
  private:
-  key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
-  mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
-  key_type erased_key_sentinel_{};      ///< Key value that represents an erased slot
-
-  // TODO: initialize this
-  std::size_t size_{};       ///< Number of keys in the map
-  std::size_t capacity_{};   ///< Maximum number of keys that can be inserted
-  float max_load_factor_{};  ///< Max load factor before capacity growth
-
-  std::vector<std::unique_ptr<cuco::legacy::static_map<key_type, mapped_type, Scope>>>
-    submaps_;                                      ///< vector of pointers to each submap
-  thrust::device_vector<view_type> submap_views_;  ///< vector of device views for each submap
-  thrust::device_vector<mutable_view_type>
-    submap_mutable_views_;         ///< vector of mutable device views for each submap
-  std::size_t min_insert_size_{};  ///< min remaining capacity of submap for insert
-  thrust::device_vector<atomic_ctr_type*>
-    submap_num_successes_;  ///< Number of successfully erased keys for each submap
-  Allocator alloc_{};       ///< Allocator passed to submaps to allocate their device storage
+  size_type size_;      ///< Number of keys in the map
+  size_type capacity_;  ///< Capacity for next submap (also returned by capacity())
+
+  std::vector<std::unique_ptr<map_type>> submaps_;  ///< vector of pointers to each submap
+  size_type min_insert_size_;                       ///< min remaining capacity of submap for insert
+  float max_load_factor_;                           ///< Maximum load factor
+  Allocator alloc_;  ///< Allocator passed to submaps to allocate their device storage
 };
+
 }  // namespace cuco
 
-#include <cuco/detail/dynamic_map.inl>
 #include <cuco/detail/dynamic_map/dynamic_map.inl>
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 50b8771d4..dde5b249e 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1249,7 +1249,6 @@ class static_map {
   mapped_type empty_value_sentinel_;  ///< Sentinel value that indicates an empty payload
 };
 
-namespace experimental {
 template <class Key,
           class T,
           class Extent,
@@ -1259,10 +1258,6 @@ template <class Key,
           class Allocator,
           class Storage>
 class dynamic_map;
-}
-
-template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-class dynamic_map;
 
 namespace legacy {
 
@@ -1356,8 +1351,6 @@ class static_map {
                 "declared as safe for bitwise comparison via specialization of "
                 "cuco::is_bitwise_comparable_v<Value>.");
 
-  friend class dynamic_map<Key, Value, Scope, Allocator>;  ///< Dynamic map as friend class
-
  public:
   using value_type         = cuco::pair<Key, Value>;            ///< Type of key/value pairs
   using key_type           = Key;                               ///< Key type
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b5d42770d..a4787f432 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2018-2025, NVIDIA CORPORATION.
+# Copyright (c) 2018-2026, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -99,10 +99,10 @@ ConfigureTest(STATIC_MAP_TEST
 # - dynamic_map tests -----------------------------------------------------------------------------
 ConfigureTest(DYNAMIC_MAP_TEST
     dynamic_map/unique_sequence_test.cu
-    dynamic_map/unique_sequence_test_experimental.cu
     dynamic_map/erase_test.cu
     dynamic_map/find_test.cu
-    dynamic_map/retrieve_all_test.cu)
+    dynamic_map/retrieve_all_test.cu
+    dynamic_map/multiplicity_test.cu)
 
 ###################################################################################################
 # - static_multiset tests -------------------------------------------------------------------------
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
index b723093ff..1ffc9673c 100644
--- a/tests/dynamic_map/erase_test.cu
+++ b/tests/dynamic_map/erase_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,9 +18,13 @@
 
 #include <cuco/dynamic_map.cuh>
 
+#include <cuda/functional>
 #include <cuda/std/functional>
+#include <cuda/std/tuple>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/sequence.h>
 
 #include <catch2/catch_template_test_macros.hpp>
@@ -48,33 +52,33 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests",
     thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
     thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
 
-    auto pairs_begin =
-      thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()});
+    auto pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<std::size_t>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, Value>>(
+        [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) {
+          return cuco::pair<Key, Value>{keys[i], values[i]};
+        }));
 
     map.insert(pairs_begin, pairs_begin + num_keys);
 
-    REQUIRE(map.get_size() == num_keys);
+    REQUIRE(map.size() == num_keys);
 
     map.erase(d_keys.begin(), d_keys.end());
 
-    // delete decreases count correctly
-    REQUIRE(map.get_size() == 0);
+    REQUIRE(map.size() == 0);
 
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
-    // keys were actaully deleted
     REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{}));
 
-    // ensures that map is reusing deleted slots
     map.insert(pairs_begin, pairs_begin + num_keys);
 
-    REQUIRE(map.get_size() == num_keys);
+    REQUIRE(map.size() == num_keys);
 
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
     REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{}));
 
-    // erase can act selectively
     map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2);
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
@@ -84,7 +88,6 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests",
     REQUIRE(cuco::test::all_of(
       d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), cuda::std::identity{}));
 
-    // clear map
     map.erase(d_keys.begin() + num_keys / 2, d_keys.end());
   }
 
@@ -99,14 +102,17 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests",
     thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
     thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
 
-    auto pairs_begin =
-      thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()});
+    auto pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<std::size_t>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, Value>>(
+        [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) {
+          return cuco::pair<Key, Value>{keys[i], values[i]};
+        }));
 
     map.insert(pairs_begin, pairs_begin + num);
 
-    // map should resize twice if the erased slots are successfully reused
-    REQUIRE(map.get_capacity() == 2 * num);
-    // check that keys can be successfully deleted from only the first and second submaps
+    REQUIRE(map.capacity() == 2 * num);
+
     map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys);
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
@@ -116,15 +122,14 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map erase tests",
     REQUIRE(cuco::test::all_of(
       d_keys_exist.begin() + 2 * num_keys, d_keys_exist.end(), cuda::std::identity{}));
 
-    REQUIRE(map.get_size() == 2 * num_keys);
-    // check that keys can be successfully deleted from all submaps (some will be unsuccessful
-    // erases)
+    REQUIRE(map.size() == 2 * num_keys);
+
     map.erase(d_keys.begin(), d_keys.end());
 
     map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
 
     REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{}));
 
-    REQUIRE(map.get_size() == 0);
+    REQUIRE(map.size() == 0);
   }
 }
diff --git a/tests/dynamic_map/find_test.cu b/tests/dynamic_map/find_test.cu
index b83964735..ebd88a029 100644
--- a/tests/dynamic_map/find_test.cu
+++ b/tests/dynamic_map/find_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,8 @@
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sequence.h>
 
@@ -52,24 +54,25 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests",
     thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
     thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
 
-    auto pairs_begin =
-      thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()});
+    auto pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<std::size_t>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, Value>>(
+        [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) {
+          return cuco::pair<Key, Value>{keys[i], values[i]};
+        }));
 
     map.insert(pairs_begin, pairs_begin + num_keys);
 
-    REQUIRE(map.get_size() == num_keys);
+    REQUIRE(map.size() == num_keys);
 
-    // Find all inserted keys
     map.find(d_keys.begin(), d_keys.end(), d_found_values.begin());
 
-    // Verify that all keys were found with correct values
     auto zip_equal = cuda::proclaim_return_type<bool>(
       [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); });
     auto zip =
       thrust::make_zip_iterator(cuda::std::tuple{d_values.begin(), d_found_values.begin()});
     REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal));
 
-    // Test finding non-existent keys
     thrust::device_vector<Key> d_nonexistent_keys(100);
     thrust::device_vector<Value> d_nonexistent_values(100);
 
@@ -80,19 +83,15 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests",
 
     map.find(d_nonexistent_keys.begin(), d_nonexistent_keys.end(), d_nonexistent_values.begin());
 
-    // Verify that non-existent keys return empty value sentinel
     auto empty_zip = thrust::make_zip_iterator(
       cuda::std::tuple{d_nonexistent_values.begin(),
                        thrust::constant_iterator<Value>{cuco::empty_value<Value>{-1}.value}});
     REQUIRE(cuco::test::all_of(empty_zip, empty_zip + 100, zip_equal));
 
-    // Test finding a mix of existing and non-existing keys
     thrust::device_vector<Key> d_mixed_keys(200);
     thrust::device_vector<Value> d_mixed_values(200);
 
-    // First half: existing keys
     thrust::copy(d_keys.begin(), d_keys.begin() + 100, d_mixed_keys.begin());
-    // Second half: non-existing keys
     thrust::sequence(thrust::device,
                      d_mixed_keys.begin() + 100,
                      d_mixed_keys.end(),
@@ -100,12 +99,10 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests",
 
     map.find(d_mixed_keys.begin(), d_mixed_keys.end(), d_mixed_values.begin());
 
-    // Verify first half found correct values
     auto first_half_zip =
       thrust::make_zip_iterator(cuda::std::tuple{d_values.begin(), d_mixed_values.begin()});
     REQUIRE(cuco::test::all_of(first_half_zip, first_half_zip + 100, zip_equal));
 
-    // Verify second half returned empty value sentinel
     auto second_half_empty_zip = thrust::make_zip_iterator(
       cuda::std::tuple{d_mixed_values.begin() + 100,
                        thrust::constant_iterator<Value>{cuco::empty_value<Value>{-1}.value}});
@@ -121,14 +118,17 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests",
     thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
     thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
 
-    auto pairs_begin =
-      thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()});
+    auto pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<std::size_t>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, Value>>(
+        [keys = d_keys.begin(), values = d_values.begin()] __device__(auto i) {
+          return cuco::pair<Key, Value>{keys[i], values[i]};
+        }));
 
     map.insert(pairs_begin, pairs_begin + num_keys);
 
-    REQUIRE(map.get_size() == num_keys);
+    REQUIRE(map.size() == num_keys);
 
-    // Find all keys before erase
     map.find(d_keys.begin(), d_keys.end(), d_found_values.begin());
 
     auto zip_equal = cuda::proclaim_return_type<bool>(
@@ -137,73 +137,20 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map find tests",
       thrust::make_zip_iterator(cuda::std::tuple{d_values.begin(), d_found_values.begin()});
     REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal));
 
-    // Erase first half of keys
     map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2);
 
-    REQUIRE(map.get_size() == num_keys / 2);
+    REQUIRE(map.size() == num_keys / 2);
 
-    // Find all keys after erase
     map.find(d_keys.begin(), d_keys.end(), d_found_values.begin());
 
-    // First half should return empty value sentinel (erased)
     auto first_half_empty_zip = thrust::make_zip_iterator(
       cuda::std::tuple{d_found_values.begin(),
                        thrust::constant_iterator<Value>{cuco::empty_value<Value>{-1}.value}});
     REQUIRE(
       cuco::test::all_of(first_half_empty_zip, first_half_empty_zip + num_keys / 2, zip_equal));
 
-    // Second half should return correct values (not erased)
     auto second_half_zip = thrust::make_zip_iterator(
       cuda::std::tuple{d_values.begin() + num_keys / 2, d_found_values.begin() + num_keys / 2});
     REQUIRE(cuco::test::all_of(second_half_zip, second_half_zip + num_keys / 2, zip_equal));
   }
-
-  cuco::dynamic_map<Key, Value> indentity_hash_map{
-    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}, cuco::erased_key<Key>{-2}};
-
-  SECTION("Check find in a all erased submap")
-  {
-    constexpr float default_load_factor     = 0.60;
-    constexpr std::size_t first_insert_size = num_keys * default_load_factor;
-
-    thrust::device_vector<Key> d_keys(num_keys);
-    thrust::device_vector<Value> d_values(num_keys);
-    thrust::device_vector<Value> d_found_values(num_keys);
-
-    thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
-    thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
-
-    auto pairs_begin =
-      thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()});
-
-    // To construct a map with all erased keys, we can't insert all at once
-    indentity_hash_map.insert(
-      pairs_begin, pairs_begin + first_insert_size, cuco::identity_hash<Key>());
-    REQUIRE(indentity_hash_map.get_size() == first_insert_size);
-
-    indentity_hash_map.erase(
-      d_keys.begin(), d_keys.begin() + first_insert_size, cuco::identity_hash<Key>());
-    REQUIRE(indentity_hash_map.get_size() == 0);
-
-    indentity_hash_map.insert(
-      pairs_begin + first_insert_size, pairs_begin + num_keys, cuco::identity_hash<Key>());
-    REQUIRE(indentity_hash_map.get_size() == num_keys - first_insert_size);
-
-    indentity_hash_map.erase(
-      d_keys.begin() + first_insert_size, d_keys.end(), cuco::identity_hash<Key>());
-    REQUIRE(indentity_hash_map.get_size() == 0);
-
-    // we've construct a dynamic_map with one submap whose keys are all erased keys (-2 in this
-    // case) this find would run forever if we don't check whether we have iterated all keys in a
-    // submap
-    indentity_hash_map.find(
-      d_keys.begin(), d_keys.end(), d_found_values.begin(), cuco::identity_hash<Key>());
-    // all d_found_values should be empty value sentinel (-1 in this case)
-    auto empty_zip = thrust::make_zip_iterator(
-      cuda::std::tuple{d_found_values.begin(),
-                       thrust::constant_iterator<Value>{cuco::empty_value<Value>{-1}.value}});
-    auto zip_equal = cuda::proclaim_return_type<bool>(
-      [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); });
-    REQUIRE(cuco::test::all_of(empty_zip, empty_zip + num_keys, zip_equal));
-  }
-}
\ No newline at end of file
+}
diff --git a/tests/dynamic_map/multiplicity_test.cu b/tests/dynamic_map/multiplicity_test.cu
new file mode 100644
index 000000000..5fc7eddb7
--- /dev/null
+++ b/tests/dynamic_map/multiplicity_test.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <test_utils.hpp>
+
+#include <cuco/dynamic_map.cuh>
+
+#include <cuda/functional>
+#include <cuda/std/functional>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+
+TEMPLATE_TEST_CASE_SIG("dynamic_map: cross-submap duplicate handling",
+                       "",
+                       ((typename Key, typename T), Key, T),
+                       (int32_t, int32_t),
+                       (int64_t, int64_t))
+{
+  // Use capacity large enough to satisfy min_insert_size_ (10,000 default)
+  // but small enough to force multiple submaps after several inserts
+  constexpr std::size_t initial_capacity{50'000};
+  constexpr std::size_t num_keys{20'000};  // Fill about 2/3 of first submap (load factor ~0.6)
+
+  cuco::dynamic_map<Key, T> map{initial_capacity,
+                                cuco::empty_key<Key>{-1},
+                                cuco::empty_value<T>{-1},
+                                cuco::erased_key<Key>{-2}};
+
+  // Create pairs for first submap (keys 0 to num_keys-1)
+  auto pairs_begin =
+    thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0),
+                                    cuda::proclaim_return_type<cuco::pair<Key, T>>(
+                                      [] __device__(auto i) { return cuco::pair<Key, T>(i, i); }));
+
+  // Create pairs for second submap (keys num_keys to 2*num_keys-1)
+  auto pairs_begin_2 =
+    thrust::make_transform_iterator(thrust::make_counting_iterator<int>(num_keys),
+                                    cuda::proclaim_return_type<cuco::pair<Key, T>>(
+                                      [] __device__(auto i) { return cuco::pair<Key, T>(i, i); }));
+
+  thrust::device_vector<Key> d_keys(num_keys);
+  thrust::device_vector<T> d_results(num_keys);
+  thrust::device_vector<bool> d_contained(num_keys);
+  thrust::sequence(thrust::device, d_keys.begin(), d_keys.end());
+
+  SECTION("insert does not insert duplicates across submaps")
+  {
+    // Fill first submap
+    map.insert(pairs_begin, pairs_begin + num_keys);
+    REQUIRE(map.size() == num_keys);
+
+    // Insert new keys to trigger second submap creation
+    map.insert(pairs_begin_2, pairs_begin_2 + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);
+
+    // Try to insert duplicates of keys from first submap - should not increase size
+    map.insert(pairs_begin, pairs_begin + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);
+
+    // Try to insert duplicates with DIFFERENT values - should still not insert and preserve
+    // originals
+    auto duplicate_pairs_diff_values = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<int>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, T>>(
+        [] __device__(auto i) { return cuco::pair<Key, T>(i, i + 999); }));
+    map.insert(duplicate_pairs_diff_values, duplicate_pairs_diff_values + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);
+
+    // Verify original values are preserved (not overwritten by duplicate insert attempts)
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    REQUIRE(cuco::test::equal(d_results.begin(),
+                              d_results.end(),
+                              thrust::counting_iterator<T>(0),
+                              cuda::std::equal_to<T>{}));
+  }
+
+  SECTION("contains finds keys in any submap")
+  {
+    // Fill first submap
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    // Insert new keys to trigger second submap
+    map.insert(pairs_begin_2, pairs_begin_2 + num_keys);
+
+    // Keys in FIRST submap should be found
+    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
+    REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+
+    // Keys in SECOND submap should be found
+    thrust::device_vector<Key> d_keys_2(num_keys);
+    thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys);
+    map.contains(d_keys_2.begin(), d_keys_2.end(), d_contained.begin());
+    REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+
+    // Non-existent keys should NOT be found
+    thrust::device_vector<Key> d_keys_nonexistent(num_keys);
+    thrust::sequence(
+      thrust::device, d_keys_nonexistent.begin(), d_keys_nonexistent.end(), 2 * num_keys);
+    map.contains(d_keys_nonexistent.begin(), d_keys_nonexistent.end(), d_contained.begin());
+    REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+  }
+
+  SECTION("find retrieves values from any submap")
+  {
+    // Fill first submap
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    // Insert new keys to trigger second submap
+    map.insert(pairs_begin_2, pairs_begin_2 + num_keys);
+
+    // Find keys from FIRST submap - should return correct values
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    REQUIRE(cuco::test::equal(d_results.begin(),
+                              d_results.end(),
+                              thrust::counting_iterator<T>(0),
+                              cuda::std::equal_to<T>{}));
+
+    // Find keys from SECOND submap - should return correct values
+    thrust::device_vector<Key> d_keys_2(num_keys);
+    thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys);
+    map.find(d_keys_2.begin(), d_keys_2.end(), d_results.begin());
+    REQUIRE(cuco::test::equal(d_results.begin(),
+                              d_results.end(),
+                              thrust::counting_iterator<T>(num_keys),
+                              cuda::std::equal_to<T>{}));
+
+    // Non-existent keys should return empty_value_sentinel (-1)
+    thrust::device_vector<Key> d_keys_nonexistent(num_keys);
+    thrust::sequence(
+      thrust::device, d_keys_nonexistent.begin(), d_keys_nonexistent.end(), 2 * num_keys);
+    map.find(d_keys_nonexistent.begin(), d_keys_nonexistent.end(), d_results.begin());
+    REQUIRE(cuco::test::all_of(
+      d_results.begin(), d_results.end(), [] __device__(T val) { return val == T{-1}; }));
+  }
+
+  SECTION("erase removes keys from any submap")
+  {
+    // Fill first submap
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    // Insert new keys to trigger second submap
+    map.insert(pairs_begin_2, pairs_begin_2 + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);
+
+    // Erase keys from FIRST submap
+    map.erase(d_keys.begin(), d_keys.end());
+    REQUIRE(map.size() == num_keys);
+
+    // Verify keys from first submap are no longer contained
+    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
+    REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+
+    // Verify find returns sentinel for erased keys
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    REQUIRE(cuco::test::all_of(
+      d_results.begin(), d_results.end(), [] __device__(T val) { return val == T{-1}; }));
+
+    // Verify keys from SECOND submap are still there
+    thrust::device_vector<Key> d_keys_2(num_keys);
+    thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys);
+    map.contains(d_keys_2.begin(), d_keys_2.end(), d_contained.begin());
+    REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+
+    // Erase same keys again (already erased) - should not change size
+    map.erase(d_keys.begin(), d_keys.end());
+    REQUIRE(map.size() == num_keys);
+
+    // Erase non-existent keys - should not change size
+    thrust::device_vector<Key> d_keys_nonexistent(num_keys);
+    thrust::sequence(
+      thrust::device, d_keys_nonexistent.begin(), d_keys_nonexistent.end(), 3 * num_keys);
+    map.erase(d_keys_nonexistent.begin(), d_keys_nonexistent.end());
+    REQUIRE(map.size() == num_keys);
+
+    // Now erase keys from SECOND submap
+    map.erase(d_keys_2.begin(), d_keys_2.end());
+    REQUIRE(map.size() == 0);
+
+    // Verify all keys are gone
+    map.contains(d_keys_2.begin(), d_keys_2.end(), d_contained.begin());
+    REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+  }
+
+  SECTION("insert_or_assign updates in correct submap without creating duplicates")
+  {
+    // Fill first submap with values = keys
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    // Insert new keys to trigger second submap
+    map.insert(pairs_begin_2, pairs_begin_2 + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);
+
+    // Create pairs with same keys as first submap but different values (value = key + 100)
+    auto updated_pairs = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<int>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, T>>(
+        [] __device__(auto i) { return cuco::pair<Key, T>(i, i + 100); }));
+
+    // insert_or_assign should UPDATE values in first submap, not insert into second
+    map.insert_or_assign(updated_pairs, updated_pairs + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);  // Size should not change
+
+    // Verify values were updated in first submap
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    REQUIRE(cuco::test::equal(d_results.begin(),
+                              d_results.end(),
+                              thrust::counting_iterator<T>(100),  // Values should now be key + 100
+                              cuda::std::equal_to<T>{}));
+
+    // Verify second submap values are unchanged
+    thrust::device_vector<Key> d_keys_2(num_keys);
+    thrust::sequence(thrust::device, d_keys_2.begin(), d_keys_2.end(), num_keys);
+    map.find(d_keys_2.begin(), d_keys_2.end(), d_results.begin());
+    REQUIRE(cuco::test::equal(d_results.begin(),
+                              d_results.end(),
+                              thrust::counting_iterator<T>(num_keys),
+                              cuda::std::equal_to<T>{}));
+
+    // Test INSERT behavior: insert_or_assign with completely NEW keys should increase size
+    auto new_pairs = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<int>(2 * num_keys),
+      cuda::proclaim_return_type<cuco::pair<Key, T>>(
+        [] __device__(auto i) { return cuco::pair<Key, T>(i, i); }));
+    map.insert_or_assign(new_pairs, new_pairs + num_keys);
+    REQUIRE(map.size() == 3 * num_keys);  // Size should increase by num_keys
+
+    // Verify newly inserted keys exist with correct values
+    thrust::device_vector<Key> d_keys_new(num_keys);
+    thrust::sequence(thrust::device, d_keys_new.begin(), d_keys_new.end(), 2 * num_keys);
+    map.find(d_keys_new.begin(), d_keys_new.end(), d_results.begin());
+    REQUIRE(cuco::test::equal(d_results.begin(),
+                              d_results.end(),
+                              thrust::counting_iterator<T>(2 * num_keys),
+                              cuda::std::equal_to<T>{}));
+
+    // Test MIXED behavior: some keys exist (update), some don't (insert)
+    // Use keys from 0 to num_keys/2 (exist in first submap) and
+    // keys from 3*num_keys to 3*num_keys + num_keys/2 (don't exist)
+    auto mixed_pairs = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<int>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, T>>([] __device__(auto i) {
+        Key key = (i < num_keys / 2) ? Key(i) : Key(3 * num_keys + i - num_keys / 2);
+        return cuco::pair<Key, T>(key, T(i + 500));
+      }));
+    std::size_t const size_before = map.size();
+    map.insert_or_assign(mixed_pairs, mixed_pairs + num_keys);
+    // Only num_keys/2 new keys should be inserted
+    REQUIRE(map.size() == size_before + num_keys / 2);
+  }
+
+  SECTION("retrieve_all retrieves from all submaps")
+  {
+    // Fill first submap
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    // Insert new keys to trigger second submap
+    map.insert(pairs_begin_2, pairs_begin_2 + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);
+
+    // Retrieve all key-value pairs
+    thrust::device_vector<Key> d_retrieved_keys(2 * num_keys);
+    thrust::device_vector<T> d_retrieved_values(2 * num_keys);
+    auto const end = map.retrieve_all(d_retrieved_keys.begin(), d_retrieved_values.begin());
+    auto const num_retrieved = std::distance(d_retrieved_keys.begin(), end.first);
+
+    // Should retrieve all keys from both submaps
+    REQUIRE(num_retrieved == 2 * num_keys);
+
+    // Sort by key to verify all expected keys are present
+    thrust::sort_by_key(
+      thrust::device, d_retrieved_keys.begin(), d_retrieved_keys.end(), d_retrieved_values.begin());
+
+    // Keys should be 0 to 2*num_keys-1
+    thrust::device_vector<Key> d_expected_keys(2 * num_keys);
+    thrust::sequence(thrust::device, d_expected_keys.begin(), d_expected_keys.end());
+    REQUIRE(cuco::test::equal(d_retrieved_keys.begin(),
+                              d_retrieved_keys.end(),
+                              d_expected_keys.begin(),
+                              cuda::std::equal_to<Key>{}));
+
+    // Values should match keys (since we inserted key=value pairs)
+    REQUIRE(cuco::test::equal(d_retrieved_values.begin(),
+                              d_retrieved_values.end(),
+                              thrust::counting_iterator<T>(0),
+                              cuda::std::equal_to<T>{}));
+  }
+}
diff --git a/tests/dynamic_map/retrieve_all_test.cu b/tests/dynamic_map/retrieve_all_test.cu
index 72f0df329..35f057e4b 100644
--- a/tests/dynamic_map/retrieve_all_test.cu
+++ b/tests/dynamic_map/retrieve_all_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cuco/dynamic_map.cuh>
 
+#include <cuda/functional>
 #include <cuda/std/functional>
 #include <thrust/device_vector.h>
 #include <thrust/equal.h>
@@ -59,7 +60,7 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests",
 
     map.insert(pairs, pairs + num_keys);
 
-    REQUIRE(map.get_size() == num_keys);
+    REQUIRE(map.size() == num_keys);
 
     thrust::device_vector<Key> retrieved_keys(num_keys);
     thrust::device_vector<Value> retrieved_values(num_keys);
@@ -70,7 +71,6 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests",
     REQUIRE(keys_out == retrieved_keys.end());
     REQUIRE(values_out == retrieved_values.end());
 
-    // d_keys and d_values are already sorted
     thrust::sort(retrieved_keys.begin(), retrieved_keys.end());
     thrust::sort(retrieved_values.begin(), retrieved_values.end());
 
@@ -96,7 +96,7 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests",
 
     map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2);
 
-    REQUIRE(map.get_size() == num_keys / 2);
+    REQUIRE(map.size() == num_keys / 2);
 
     thrust::device_vector<Key> retrieved_keys(num_keys / 2);
     thrust::device_vector<Value> retrieved_values(num_keys / 2);
@@ -107,7 +107,6 @@ TEMPLATE_TEST_CASE_SIG("dynamic_map retrieve_all tests",
     REQUIRE(std::distance(retrieved_keys.begin(), keys_out) == num_keys / 2);
     REQUIRE(std::distance(retrieved_values.begin(), values_out) == num_keys / 2);
 
-    // d_keys and d_values are already sorted
     thrust::sort(retrieved_keys.begin(), retrieved_keys.end());
     thrust::sort(retrieved_values.begin(), retrieved_values.end());
 
diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu
index afec6b82f..13cf08d0b 100644
--- a/tests/dynamic_map/unique_sequence_test.cu
+++ b/tests/dynamic_map/unique_sequence_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,75 +19,113 @@
 #include <cuco/dynamic_map.cuh>
 
 #include <cuda/functional>
-#include <cuda/std/tuple>
+#include <cuda/std/functional>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
 #include <thrust/sequence.h>
 
 #include <catch2/catch_template_test_macros.hpp>
 
-TEMPLATE_TEST_CASE_SIG("dynamic_map unique sequence tests",
+TEMPLATE_TEST_CASE_SIG("dynamic_map: unique sequence",
                        "",
-                       ((typename Key, typename Value), Key, Value),
+                       ((typename Key, typename T), Key, T),
                        (int32_t, int32_t),
                        (int32_t, int64_t),
                        (int64_t, int32_t),
                        (int64_t, int64_t))
 {
-  constexpr std::size_t num_keys{50'000'000};
+  constexpr std::size_t num_keys{1'000'000};
 
-  cuco::dynamic_map<Key, Value> map{
-    30'000'000, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+  cuco::dynamic_map<Key, T> map{30'000'000, cuco::empty_key<Key>{-1}, cuco::empty_value<T>{-1}};
 
   thrust::device_vector<Key> d_keys(num_keys);
-  thrust::device_vector<Value> d_values(num_keys);
+  thrust::device_vector<T> d_values(num_keys);
 
   thrust::sequence(thrust::device, d_keys.begin(), d_keys.end());
   thrust::sequence(thrust::device, d_values.begin(), d_values.end());
 
-  auto pairs_begin = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<int>(0),
-    cuda::proclaim_return_type<cuco::pair<Key, Value>>(
-      [] __device__(auto i) { return cuco::pair<Key, Value>(i, i); }));
+  auto pairs_begin =
+    thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0),
+                                    cuda::proclaim_return_type<cuco::pair<Key, T>>(
+                                      [] __device__(auto i) { return cuco::pair<Key, T>(i, i); }));
 
-  thrust::device_vector<Value> d_results(num_keys);
+  thrust::device_vector<T> d_results(num_keys);
   thrust::device_vector<bool> d_contained(num_keys);
 
-  // bulk function test cases
-  SECTION("All inserted keys-value pairs should be correctly recovered during find")
+  SECTION("All inserted keys-value pairs should be contained")
   {
     map.insert(pairs_begin, pairs_begin + num_keys);
-    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
-    auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), d_values.begin()});
+    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
 
-    REQUIRE(cuco::test::all_of(zip, zip + num_keys, [] __device__(auto const& p) {
-      return cuda::std::get<0>(p) == cuda::std::get<1>(p);
-    }));
+    REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
   }
 
-  SECTION("All non-inserted keys-value pairs should have the empty sentinel value recovered")
+  SECTION("Non-inserted keys-value pairs should not be contained")
   {
-    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
 
-    REQUIRE(cuco::test::all_of(
-      d_results.begin(), d_results.end(), [] __device__(auto const& p) { return p == -1; }));
+    REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
   }
 
-  SECTION("All inserted keys-value pairs should be contained")
+  SECTION("size() returns correct count after insertions")
+  {
+    REQUIRE(map.size() == 0);
+
+    map.insert(pairs_begin, pairs_begin + num_keys / 2);
+    REQUIRE(map.size() == num_keys / 2);
+
+    map.insert(pairs_begin + num_keys / 2, pairs_begin + num_keys);
+    REQUIRE(map.size() == num_keys);
+  }
+
+  SECTION("capacity() returns non-zero value")
   {
+    REQUIRE(map.capacity() > 0);
+    REQUIRE(map.capacity() >= 30'000'000);
+  }
+
+  SECTION("load_factor() is computed correctly")
+  {
+    REQUIRE(map.load_factor() == 0.0f);
+
     map.insert(pairs_begin, pairs_begin + num_keys);
-    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
 
-    REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+    float expected_load_factor = static_cast<float>(num_keys) / map.capacity();
+    REQUIRE(map.load_factor() == expected_load_factor);
   }
 
-  SECTION("Non-inserted keys-value pairs should not be contained")
+  SECTION("insert_or_assign inserts new keys and updates existing")
   {
-    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
+    // Insert initial keys
+    map.insert(pairs_begin, pairs_begin + num_keys);
+    REQUIRE(map.size() == num_keys);
 
-    REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
+    // Create pairs with same keys but different values (value = key + 1)
+    auto updated_pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<int>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, T>>(
+        [] __device__(auto i) { return cuco::pair<Key, T>(i, i + 1); }));
+
+    // insert_or_assign should update existing keys, size should stay the same
+    map.insert_or_assign(updated_pairs_begin, updated_pairs_begin + num_keys);
+    REQUIRE(map.size() == num_keys);
+
+    // Verify values were updated
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    REQUIRE(cuco::test::equal(d_results.begin(),
+                              d_results.end(),
+                              thrust::counting_iterator<T>(1),  // Values should now be key + 1
+                              cuda::std::equal_to<T>{}));
+
+    // Insert new keys with insert_or_assign (keys from num_keys to 2*num_keys)
+    auto new_pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<int>(num_keys),
+      cuda::proclaim_return_type<cuco::pair<Key, T>>(
+        [] __device__(auto i) { return cuco::pair<Key, T>(i, i); }));
+
+    map.insert_or_assign(new_pairs_begin, new_pairs_begin + num_keys);
+    REQUIRE(map.size() == 2 * num_keys);
   }
 }
diff --git a/tests/dynamic_map/unique_sequence_test_experimental.cu b/tests/dynamic_map/unique_sequence_test_experimental.cu
deleted file mode 100644
index b04f8c182..000000000
--- a/tests/dynamic_map/unique_sequence_test_experimental.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2020-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <test_utils.hpp>
-
-#include <cuco/dynamic_map.cuh>
-
-#include <cuda/functional>
-#include <thrust/device_vector.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/sequence.h>
-
-#include <catch2/catch_template_test_macros.hpp>
-
-TEMPLATE_TEST_CASE_SIG("experimental::dynamic_map: unique sequence",
-                       "",
-                       ((typename Key, typename T), Key, T),
-                       (int32_t, int32_t),
-                       (int32_t, int64_t),
-                       (int64_t, int32_t),
-                       (int64_t, int64_t))
-{
-  constexpr std::size_t num_keys{1'000'000};
-
-  cuco::experimental::dynamic_map<Key, T> map{
-    30'000'000, cuco::empty_key<Key>{-1}, cuco::empty_value<T>{-1}};
-
-  thrust::device_vector<Key> d_keys(num_keys);
-  thrust::device_vector<T> d_values(num_keys);
-
-  thrust::sequence(thrust::device, d_keys.begin(), d_keys.end());
-  thrust::sequence(thrust::device, d_values.begin(), d_values.end());
-
-  auto pairs_begin =
-    thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0),
-                                    cuda::proclaim_return_type<cuco::pair<Key, T>>(
-                                      [] __device__(auto i) { return cuco::pair<Key, T>(i, i); }));
-
-  thrust::device_vector<T> d_results(num_keys);
-  thrust::device_vector<bool> d_contained(num_keys);
-
-  // bulk function test cases
-
-  SECTION("All inserted keys-value pairs should be contained")
-  {
-    map.insert(pairs_begin, pairs_begin + num_keys);
-    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
-
-    REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
-  }
-
-  SECTION("Non-inserted keys-value pairs should not be contained")
-  {
-    // segfaults
-    map.contains(d_keys.begin(), d_keys.end(), d_contained.begin());
-
-    REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{}));
-  }
-}