Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
96af4dd
Set max-dynamic-shared-mem with thread-safety
mythrocks Feb 3, 2026
305cb55
Copyright date. Formatting.
mythrocks Feb 4, 2026
4463666
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 4, 2026
677f6fe
Moved other call to set cudaFuncAttributeMaxDynamicSharedMemorySize.
mythrocks Feb 4, 2026
5e9e410
Moved call-sites in other files.
mythrocks Feb 4, 2026
f55cbd3
Copyright date.
mythrocks Feb 5, 2026
b394b19
Resolved merge conflicts. Moved all call-sites to use the new `optio…
mythrocks Feb 5, 2026
f3efb37
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 6, 2026
8792c40
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 10, 2026
e55eb23
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 10, 2026
2a5a00f
Invoke kernel within critical section.
mythrocks Feb 10, 2026
b1b97cc
Removed old function.
mythrocks Feb 10, 2026
f50d4d4
Tie the kernel to its launcher.
mythrocks Feb 11, 2026
c2b1c43
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 11, 2026
8d66233
Better error reporting.
mythrocks Feb 11, 2026
1acb864
Merge branch 'main' into cuda-invalid-argument-kernel-error
achirkin Feb 12, 2026
1d41373
Remove the safety fix for persistent kernel (only one kernel must run…
achirkin Feb 12, 2026
f6cf7d3
Add a reproducer
achirkin Feb 12, 2026
9b13500
Fix style
achirkin Feb 12, 2026
8d6fb1a
Apply suggestion from @achirkin
mythrocks Feb 13, 2026
1cd8b2c
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 13, 2026
9ed3400
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 13, 2026
1955dce
Fixed formatting again.
mythrocks Feb 13, 2026
3ba77d5
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 13, 2026
5a8771c
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 16, 2026
45dff11
Apply suggestion from @achirkin
achirkin Feb 18, 2026
8c107e1
Merge branch 'main' into cuda-invalid-argument-kernel-error
achirkin Feb 18, 2026
a6f3071
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 19, 2026
d3f6ecd
Fixed format string.
mythrocks Feb 19, 2026
088fcc8
Fixed thrust header.
mythrocks Feb 19, 2026
856e695
Fix compile error for thrust make_counting_iterator
mythrocks Feb 19, 2026
eff47c8
Revert "Fix compile error for thrust make_counting_iterator"
mythrocks Feb 20, 2026
b59b8c3
Cherrypick fix from #1825.
mythrocks Feb 20, 2026
3581c01
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 20, 2026
80825ab
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 24 additions & 22 deletions cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/cudart_utils.hpp> // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
Expand Down Expand Up @@ -589,8 +590,6 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
}

RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
// Initialize hash table
const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
set_value_batch(traversed_hashmap_ptr,
Expand All @@ -608,26 +607,29 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
num_queries,
smem_size);

kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
max_elements,
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
visited_hash_bitlen,
traversed_hashmap_ptr,
traversed_hash_bitlen,
ps.itopk_size,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
sample_filter);
auto const& kernel_launcher = [&](auto const& kernel) -> void {
kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
max_elements,
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
visited_hash_bitlen,
traversed_hashmap_ptr,
traversed_hash_bitlen,
ps.itopk_size,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
sample_filter);
};
cuvs::neighbors::detail::safely_launch_kernel_with_smem_size(kernel, smem_size, kernel_launcher);
}

} // namespace multi_cta_search
Expand Down
55 changes: 29 additions & 26 deletions cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/integer_utils.hpp>
Expand Down Expand Up @@ -2312,36 +2313,38 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
auto kernel = search_kernel_config<false, descriptor_base_type, SourceIndexT, SampleFilterT>::
choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
dim3 thread_dims(block_size, 1, 1);
dim3 block_dims(1, num_queries, 1);
RAFT_LOG_DEBUG(
"Launching kernel with %u threads, %u block %u smem", block_size, num_queries, smem_size);
kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
topk,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
hashmap_ptr,
max_candidates,
max_itopk,
ps.itopk_size,
ps.search_width,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
hash_bitlen,
small_hash_bitlen,
small_hash_reset_interval,
sample_filter);
auto const& kernel_launcher = [&](auto const& kernel) -> void {
kernel<<<block_dims, thread_dims, smem_size, stream>>>(topk_indices_ptr,
topk_distances_ptr,
topk,
dataset_desc.dev_ptr(stream),
queries_ptr,
graph.data_handle(),
graph.extent(1),
source_indices_ptr,
ps.num_random_samplings,
ps.rand_xor_mask,
dev_seed_ptr,
num_seeds,
hashmap_ptr,
max_candidates,
max_itopk,
ps.itopk_size,
ps.search_width,
ps.min_iterations,
ps.max_iterations,
num_executed_iterations,
hash_bitlen,
small_hash_bitlen,
small_hash_reset_interval,
sample_filter);
};
cuvs::neighbors::detail::safely_launch_kernel_with_smem_size(
kernel, smem_size, kernel_launcher);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}
}
Expand Down
62 changes: 62 additions & 0 deletions cpp/src/neighbors/detail/smem_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <raft/core/error.hpp>

#include <atomic>
#include <cstdint>
#include <mutex>

namespace cuvs::neighbors::detail {

/**
 * @brief (Thread-)Safely invoke a kernel with a maximum dynamic shared memory size.
 * This is required because the sequence `cudaFuncSetAttribute` + kernel launch is not executed
 * atomically.
 *
 * Used this way, the cudaFuncAttributeMaxDynamicSharedMemorySize can only grow and thus
 * guarantees that the kernel is safe to launch.
 *
 * NOTE(review): the cached size below is a function-local static, so it is shared by every call
 * that instantiates this template with the same (KernelT, KernelLauncherT) pair. The launcher is
 * typically a distinct lambda type per call-site, which ties the cache to a single kernel —
 * confirm this holds for all callers (two call-sites passing the same kernel type AND the same
 * launcher type would share one cache).
 *
 * @tparam KernelT The type of the kernel.
 * @tparam KernelLauncherT The type of the kernel launch function/lambda.
 * @param kernel The kernel function address (for whom the smem-size is specified).
 * @param smem_size The size of the dynamic shared memory to be set.
 * @param launch The kernel launch function/lambda.
 */
template <typename KernelT, typename KernelLauncherT>
void safely_launch_kernel_with_smem_size(KernelT const& kernel,
                                         uint32_t smem_size,
                                         KernelLauncherT const& launch)
{
  // High-water mark of the attribute value already applied for this template instantiation.
  static std::atomic<uint32_t> current_smem_size{0};
  // Fast path: the attribute is already at least as large as we need; nothing to synchronize.
  if (smem_size > current_smem_size.load(std::memory_order_relaxed)) {
    // All writers serialize on this mutex, so a plain re-read under the lock is sufficient —
    // no compare-exchange loop is needed.
    static std::mutex mutex;
    std::lock_guard<std::mutex> guard{mutex};
    if (smem_size > current_smem_size.load(std::memory_order_relaxed)) {
      auto const set_attr_status =
        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
      RAFT_EXPECTS(set_attr_status == cudaSuccess,
                   "Failed to set max dynamic shared memory size to %u bytes",
                   smem_size);
      // Publish only after the attribute is actually set, so a failed attempt (which throws
      // above) does not advance the high-water mark and will be retried on the next call.
      current_smem_size.store(smem_size, std::memory_order_relaxed);
    }
  }
  // We don't need to guard the kernel launch because the smem_size can only grow: a concurrent
  // update can only raise the limit above what this launch requires.
  return launch(kernel);
}

} // namespace cuvs::neighbors::detail
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ ConfigureTest(
# Regression tests for reported CAGRA bugs; each .cu file contains one reproducer case.
ConfigureTest(
  NAME NEIGHBORS_ANN_CAGRA_TEST_BUGS
  PATH neighbors/ann_cagra/bug_extreme_inputs_oob.cu neighbors/ann_cagra/bug_multi_cta_crash.cu
       neighbors/ann_cagra/bug_issue_93_reproducer.cu
  GPUS 1
  PERCENT 100
)
Expand Down
126 changes: 126 additions & 0 deletions cpp/tests/neighbors/ann_cagra/bug_issue_93_reproducer.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*
* Reproducer for https://github.com/rapidsai/cuvs-lucene/issues/93
* cuvsCagraSearch returned 0 (Reason=cudaErrorInvalidValue:invalid argument)
*
* ROOT CAUSE:
* `cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)`
* is not thread-safe. It sets a CUDA-context-wide attribute. When two threads call it
* concurrently with different smem_size values, the following race occurs:
* 1. Thread A sets max-dynamic-shared-mem to SIZE_A (larger).
* 2. Thread B overwrites it with SIZE_B (smaller).
* 3. Thread A launches its kernel requesting SIZE_A of shared memory,
* but the CUDA context now only allows SIZE_B → cudaErrorInvalidValue.
*
* HOW IT MANIFESTS IN cuvs-lucene:
* Lucene's TaskExecutor dispatches per-segment CAGRA searches to a thread pool.
* Each segment has a different number of vectors (e.g. 25, 26, 47), leading to
* different graph degrees after reduction, and therefore different smem_size values
* in the single-CTA search kernel. The concurrent cudaFuncSetAttribute calls race.
*
* REPRODUCTION STRATEGY:
* Build CAGRA indices with different dataset sizes (different graph degrees),
* then search them concurrently from separate threads, each with its own raft::resources.
* This mirrors the cuvs-lucene setup where each thread gets a ThreadLocal CuVSResources.
*/

#include <gtest/gtest.h>

#include <cuvs/distance/distance.hpp>
#include <cuvs/neighbors/cagra.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/random/rng.cuh>

#include <cstdint>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

namespace cuvs::neighbors::cagra {

// NOLINTNEXTLINE(readability-identifier-naming)
TEST(Issue93Reproducer, ConcurrentSearchDifferentGraphDegrees)
{
  raft::resources handle;
  // Fixed seed so the random datasets (and hence the reduced graph degrees) are reproducible.
  raft::random::RngState rng(6181234567890123459ULL);

  // Dataset sizes from REPRODUCER.md warnings (different sizes → different graph degrees).
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  std::vector<int> dataset_sizes = {25, 26, 47, 25};
  constexpr int dim   = 64;
  constexpr int top_k = 10;

  // Build indices on the main thread.
  // With only 25-47 rows, the requested graph_degree of 64 cannot be satisfied, so each index
  // ends up with a different effective degree — and thus a different smem_size at search time.
  std::vector<cagra::index<float, uint32_t>> indices;
  for (int n_rows : dataset_sizes) {
    auto database = raft::make_device_matrix<float, int64_t>(handle, n_rows, dim);
    raft::random::uniform(
      handle, rng, database.data_handle(), n_rows * dim, -1.0F, 1.0F);  // NOLINT

    cagra::index_params ip;
    ip.metric                    = cuvs::distance::DistanceType::L2Expanded;
    ip.intermediate_graph_degree = 128;  // NOLINT
    ip.graph_degree              = 64;   // NOLINT
    ip.graph_build_params =
      graph_build_params::nn_descent_params(ip.intermediate_graph_degree, ip.metric);

    indices.push_back(cagra::build(handle, ip, raft::make_const_mdspan(database.view())));
  }
  // Make sure all builds have completed before any thread starts searching.
  raft::resource::sync_stream(handle);

  // Search concurrently from multiple threads until the first failure.
  const int num_threads = static_cast<int>(indices.size());
  std::mutex error_mutex;
  std::string first_error;

  // The race is probabilistic; repeat the concurrent searches, stopping at the first failure.
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  for (int iter = 0; iter < 50 && first_error.empty(); ++iter) {
    std::vector<std::thread> threads;
    for (int t = 0; t < num_threads; ++t) {
      threads.emplace_back([&, t, iter]() {
        // Per-thread resources (own CUDA stream), mirroring cuvs-lucene's ThreadLocal
        // CuVSResources setup described in the file header.
        raft::resources thread_handle;
        // Distinct seed per (thread, iteration) so queries differ across the run.
        raft::random::RngState thread_rng(42ULL + static_cast<uint64_t>(t) +
                                          static_cast<uint64_t>(iter) * 1000ULL);
        try {
          auto query = raft::make_device_matrix<float, int64_t>(thread_handle, 1, dim);
          raft::random::uniform(thread_handle, thread_rng, query.data_handle(), dim, -1.0F, 1.0F);

          // Match cuvs-lucene params: Java's Panama zero-initializes the struct,
          // and SINGLE_CTA = 0 in the enum, so algo is SINGLE_CTA.
          cagra::search_params sp;
          sp.itopk_size   = top_k;
          sp.search_width = 1;
          sp.algo         = search_algo::SINGLE_CTA;

          auto neighbors = raft::make_device_matrix<uint32_t, int64_t>(thread_handle, 1, top_k);
          auto distances = raft::make_device_matrix<float, int64_t>(thread_handle, 1, top_k);

          cagra::search(thread_handle,
                        sp,
                        indices[static_cast<size_t>(t)],
                        raft::make_const_mdspan(query.view()),
                        neighbors.view(),
                        distances.view());

          // Sync so an asynchronous launch failure surfaces here as a caught exception.
          raft::resource::sync_stream(thread_handle);
        } catch (const std::exception& e) {
          // Record only the first error seen across all threads/iterations.
          std::lock_guard<std::mutex> lock(error_mutex);
          if (first_error.empty()) { first_error = e.what(); }
        }
      });
    }
    for (auto& th : threads) {
      th.join();
    }
  }

  ASSERT_TRUE(first_error.empty()) << first_error;
}

} // namespace cuvs::neighbors::cagra
Loading