Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
96af4dd
Set max-dynamic-shared-mem with thread-safety
mythrocks Feb 3, 2026
305cb55
Copyright date. Formatting.
mythrocks Feb 4, 2026
4463666
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 4, 2026
677f6fe
Moved other call to set cudaFuncAttributeMaxDynamicSharedMemorySize.
mythrocks Feb 4, 2026
5e9e410
Moved call-sites in other files.
mythrocks Feb 4, 2026
f55cbd3
Copyright date.
mythrocks Feb 5, 2026
b394b19
Resolved merge conflicts. Moved all call-sites to use the new `optio…
mythrocks Feb 5, 2026
f3efb37
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 6, 2026
8792c40
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 10, 2026
e55eb23
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 10, 2026
2a5a00f
Invoke kernel within critical section.
mythrocks Feb 10, 2026
b1b97cc
Removed old function.
mythrocks Feb 10, 2026
f50d4d4
Tie the kernel to its launcher.
mythrocks Feb 11, 2026
c2b1c43
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 11, 2026
8d66233
Better error reporting.
mythrocks Feb 11, 2026
1acb864
Merge branch 'main' into cuda-invalid-argument-kernel-error
achirkin Feb 12, 2026
1d41373
Remove the safety fix for persistent kernel (only one kernel must run…
achirkin Feb 12, 2026
f6cf7d3
Add a reproducer
achirkin Feb 12, 2026
9b13500
Fix style
achirkin Feb 12, 2026
8d6fb1a
Apply suggestion from @achirkin
mythrocks Feb 13, 2026
1cd8b2c
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 13, 2026
9ed3400
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 13, 2026
1955dce
Fixed formatting again.
mythrocks Feb 13, 2026
3ba77d5
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 13, 2026
5a8771c
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 16, 2026
45dff11
Apply suggestion from @achirkin
achirkin Feb 18, 2026
8c107e1
Merge branch 'main' into cuda-invalid-argument-kernel-error
achirkin Feb 18, 2026
a6f3071
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 19, 2026
d3f6ecd
Fixed format string.
mythrocks Feb 19, 2026
088fcc8
Fixed thrust header.
mythrocks Feb 19, 2026
856e695
Fix compile error for thrust make_counting_iterator
mythrocks Feb 19, 2026
eff47c8
Revert "Fix compile error for thrust make_counting_iterator"
mythrocks Feb 20, 2026
b59b8c3
Cherrypick fix from #1825.
mythrocks Feb 20, 2026
3581c01
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 20, 2026
80825ab
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 24 additions & 22 deletions cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/cudart_utils.hpp> // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
Expand Down Expand Up @@ -589,8 +590,6 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
}

RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
// Initialize hash table
const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
set_value_batch(traversed_hashmap_ptr,
Expand All @@ -608,26 +607,29 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
num_queries,
smem_size);

kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
max_elements,
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
visited_hash_bitlen,
traversed_hashmap_ptr,
traversed_hash_bitlen,
ps.itopk_size,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
sample_filter);
auto const& kernel_launcher = [&](auto const& kernel) -> void {
kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
max_elements,
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
visited_hash_bitlen,
traversed_hashmap_ptr,
traversed_hash_bitlen,
ps.itopk_size,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
sample_filter);
};
cuvs::neighbors::detail::safely_launch_kernel_with_smem_size(kernel, smem_size, kernel_launcher);
}

} // namespace multi_cta_search
Expand Down
55 changes: 29 additions & 26 deletions cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/integer_utils.hpp>
Expand Down Expand Up @@ -2312,36 +2313,38 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
auto kernel = search_kernel_config<false, descriptor_base_type, SourceIndexT, SampleFilterT>::
choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
dim3 thread_dims(block_size, 1, 1);
dim3 block_dims(1, num_queries, 1);
RAFT_LOG_DEBUG(
"Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size);
kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
topk,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
hashmap_ptr,
max_candidates,
max_itopk,
ps.itopk_size,
ps.search_width,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
hash_bitlen,
small_hash_bitlen,
small_hash_reset_interval,
sample_filter);
auto const& kernel_launcher = [&](auto const& kernel) -> void {
kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
topk,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
hashmap_ptr,
max_candidates,
max_itopk,
ps.itopk_size,
ps.search_width,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
hash_bitlen,
small_hash_bitlen,
small_hash_reset_interval,
sample_filter);
};
cuvs::neighbors::detail::safely_launch_kernel_with_smem_size(
kernel, smem_size, kernel_launcher);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}
}
Expand Down
62 changes: 62 additions & 0 deletions cpp/src/neighbors/detail/smem_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <raft/core/error.hpp>

#include <atomic>
#include <cstdint>
#include <mutex>

namespace cuvs::neighbors::detail {

/**
 * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
 * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
 * atomically.
 *
 * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
 * guarantees that the kernel is safe to launch.
 *
 * NOTE(review): the cached size below is a function-local static, so it is shared by every call
 * that instantiates this template with the same (KernelT, KernelLauncherT) pair. The launcher is
 * typically a distinct lambda type per call-site, which ties the cache to a single kernel —
 * confirm this holds for all callers (two call-sites passing the same kernel type AND the same
 * launcher type would share one cache).
 *
 * @tparam KernelT The type of the kernel.
 * @tparam KernelLauncherT The type of the kernel launch function/lambda.
 * @param kernel The kernel function address (for whom the smem-size is specified).
 * @param smem_size The size of the dynamic shared memory to be set.
 * @param launch The kernel launch function/lambda.
 */
template <typename KernelT, typename KernelLauncherT>
void safely_launch_kernel_with_smem_size(KernelT const& kernel,
                                         uint32_t smem_size,
                                         KernelLauncherT const& launch)
{
  // High-water mark of the attribute value already applied for this template instantiation.
  static std::atomic<uint32_t> current_smem_size{0};
  // Fast path: the attribute is already at least as large as we need; nothing to synchronize.
  if (smem_size > current_smem_size.load(std::memory_order_relaxed)) {
    // All writers serialize on this mutex, so a plain re-read under the lock is sufficient —
    // no compare-exchange loop is needed.
    static std::mutex mutex;
    std::lock_guard<std::mutex> guard{mutex};
    if (smem_size > current_smem_size.load(std::memory_order_relaxed)) {
      auto const set_attr_status =
        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
      RAFT_EXPECTS(set_attr_status == cudaSuccess,
                   "Failed to set max dynamic shared memory size to %u bytes",
                   smem_size);
      // Publish only after the attribute is actually set, so a failed attempt (which throws
      // above) does not advance the high-water mark and will be retried on the next call.
      current_smem_size.store(smem_size, std::memory_order_relaxed);
    }
  }
  // We don't need to guard the kernel launch because the smem_size can only grow: a concurrent
  // update can only raise the limit above what this launch requires.
  return launch(kernel);
}

} // namespace cuvs::neighbors::detail
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ ConfigureTest(
# Regression tests for reported CAGRA bugs; each .cu file contains one reproducer case.
ConfigureTest(
  NAME NEIGHBORS_ANN_CAGRA_TEST_BUGS
  PATH neighbors/ann_cagra/bug_extreme_inputs_oob.cu neighbors/ann_cagra/bug_multi_cta_crash.cu
       neighbors/ann_cagra/bug_issue_93_reproducer.cu
  GPUS 1
  PERCENT 100
)
Expand Down
126 changes: 126 additions & 0 deletions cpp/tests/neighbors/ann_cagra/bug_issue_93_reproducer.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*
* Reproducer for https://github.com/rapidsai/cuvs-lucene/issues/93
* cuvsCagraSearch returned 0 (Reason=cudaErrorInvalidValue:invalid argument)
*
* ROOT CAUSE:
* `cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)`
* is not thread-safe. It sets a CUDA-context-wide attribute. When two threads call it
* concurrently with different smem_size values, the following race occurs:
* 1. Thread A sets max-dynamic-shared-mem to SIZE_A (larger).
* 2. Thread B overwrites it with SIZE_B (smaller).
* 3. Thread A launches its kernel requesting SIZE_A of shared memory,
* but the CUDA context now only allows SIZE_B → cudaErrorInvalidValue.
*
* HOW IT MANIFESTS IN cuvs-lucene:
* Lucene's TaskExecutor dispatches per-segment CAGRA searches to a thread pool.
* Each segment has a different number of vectors (e.g. 25, 26, 47), leading to
* different graph degrees after reduction, and therefore different smem_size values
* in the single-CTA search kernel. The concurrent cudaFuncSetAttribute calls race.
*
* REPRODUCTION STRATEGY:
* Build CAGRA indices with different dataset sizes (different graph degrees),
* then search them concurrently from separate threads, each with its own raft::resources.
* This mirrors the cuvs-lucene setup where each thread gets a ThreadLocal CuVSResources.
*/

#include <gtest/gtest.h>

#include <cuvs/distance/distance.hpp>
#include <cuvs/neighbors/cagra.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/random/rng.cuh>

#include <cstdint>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

namespace cuvs::neighbors::cagra {

// NOLINTNEXTLINE(readability-identifier-naming)
TEST(Issue93Reproducer, ConcurrentSearchDifferentGraphDegrees)
{
  raft::resources handle;
  // Fixed seed so the random datasets (and hence the reduced graph degrees) are reproducible.
  raft::random::RngState rng(6181234567890123459ULL);

  // Dataset sizes from REPRODUCER.md warnings (different sizes → different graph degrees).
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  std::vector<int> dataset_sizes = {25, 26, 47, 25};
  constexpr int dim   = 64;
  constexpr int top_k = 10;

  // Build indices on the main thread.
  // With only 25-47 rows, the requested graph_degree of 64 cannot be satisfied, so each index
  // ends up with a different effective degree — and thus a different smem_size at search time.
  std::vector<cagra::index<float, uint32_t>> indices;
  for (int n_rows : dataset_sizes) {
    auto database = raft::make_device_matrix<float, int64_t>(handle, n_rows, dim);
    raft::random::uniform(
      handle, rng, database.data_handle(), n_rows * dim, -1.0F, 1.0F);  // NOLINT

    cagra::index_params ip;
    ip.metric                    = cuvs::distance::DistanceType::L2Expanded;
    ip.intermediate_graph_degree = 128;  // NOLINT
    ip.graph_degree              = 64;   // NOLINT
    ip.graph_build_params =
      graph_build_params::nn_descent_params(ip.intermediate_graph_degree, ip.metric);

    indices.push_back(cagra::build(handle, ip, raft::make_const_mdspan(database.view())));
  }
  // Make sure all builds have completed before any thread starts searching.
  raft::resource::sync_stream(handle);

  // Search concurrently from multiple threads until the first failure.
  const int num_threads = static_cast<int>(indices.size());
  std::mutex error_mutex;
  std::string first_error;

  // The race is probabilistic; repeat the concurrent searches, stopping at the first failure.
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  for (int iter = 0; iter < 50 && first_error.empty(); ++iter) {
    std::vector<std::thread> threads;
    for (int t = 0; t < num_threads; ++t) {
      threads.emplace_back([&, t, iter]() {
        // Per-thread resources (own CUDA stream), mirroring cuvs-lucene's ThreadLocal
        // CuVSResources setup described in the file header.
        raft::resources thread_handle;
        // Distinct seed per (thread, iteration) so queries differ across the run.
        raft::random::RngState thread_rng(42ULL + static_cast<uint64_t>(t) +
                                          static_cast<uint64_t>(iter) * 1000ULL);
        try {
          auto query = raft::make_device_matrix<float, int64_t>(thread_handle, 1, dim);
          raft::random::uniform(thread_handle, thread_rng, query.data_handle(), dim, -1.0F, 1.0F);

          // Match cuvs-lucene params: Java's Panama zero-initializes the struct,
          // and SINGLE_CTA = 0 in the enum, so algo is SINGLE_CTA.
          cagra::search_params sp;
          sp.itopk_size   = top_k;
          sp.search_width = 1;
          sp.algo         = search_algo::SINGLE_CTA;

          auto neighbors = raft::make_device_matrix<uint32_t, int64_t>(thread_handle, 1, top_k);
          auto distances = raft::make_device_matrix<float, int64_t>(thread_handle, 1, top_k);

          cagra::search(thread_handle,
                        sp,
                        indices[static_cast<size_t>(t)],
                        raft::make_const_mdspan(query.view()),
                        neighbors.view(),
                        distances.view());

          // Sync so an asynchronous launch failure surfaces here as a caught exception.
          raft::resource::sync_stream(thread_handle);
        } catch (const std::exception& e) {
          // Record only the first error seen across all threads/iterations.
          std::lock_guard<std::mutex> lock(error_mutex);
          if (first_error.empty()) { first_error = e.what(); }
        }
      });
    }
    for (auto& th : threads) {
      th.join();
    }
  }

  ASSERT_TRUE(first_error.empty()) << first_error;
}

} // namespace cuvs::neighbors::cagra
Loading