-
Notifications
You must be signed in to change notification settings - Fork 160
[Java] New off-heap Dataset support for CAGRA and Bruteforce #902
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
297fe51
fa1e2e4
9241eed
27a15f6
f4ac8b5
d6193a0
7a75d9a
9ee1e4f
208bbaf
40c65df
fb01145
ff0e08f
25d7a7b
f29ebc1
5373e5d
06e3f67
30e7546
70177ef
6148cff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| /* | ||
| * Copyright (c) 2025, NVIDIA CORPORATION. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package com.nvidia.cuvs; | ||
|
|
||
| import com.nvidia.cuvs.spi.CuVSProvider; | ||
|
|
||
| /** | ||
| * This represents a wrapper for a dataset to be used for index construction. | ||
| * The purpose is to allow a caller to place the vectors into native memory | ||
| * directly, instead of requiring the caller to load all the vectors into the heap | ||
| * (e.g. with a float[][]). | ||
| * | ||
| * <p>Instances are obtained through {@link #create(int, int)} and, because this | ||
| * interface extends {@link AutoCloseable}, are expected to be closed when no | ||
| * longer needed. | ||
| * | ||
| * <p>NOTE(review): BruteForceIndexImpl#destroyIndex also closes the dataset it | ||
| * was built from; confirm who owns the dataset and whether closing it twice | ||
| * is safe. | ||
| * | ||
| * @since 25.06 | ||
| */ | ||
| public interface Dataset extends AutoCloseable { | ||
mythrocks marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| /** | ||
| * Add a single vector to the dataset. | ||
| * | ||
| * <p>NOTE(review): what happens when more vectors are added than the size given | ||
| * to {@link #create(int, int)}, or when the array length differs from | ||
| * {@link #dimensions()}, is not specified here; confirm against the | ||
| * implementation. | ||
| * | ||
| * @param vector A float array of as many elements as the dimensions | ||
| */ | ||
| public void addVector(float[] vector); | ||
|
|
||
| /** | ||
| * Create a new instance of a dataset | ||
| * | ||
| * <p>Delegates to {@code CuVSProvider.provider().newDataset(size, dimensions)}, | ||
| * so the concrete implementation is chosen by the active provider. | ||
| * | ||
| * @param size Number of vectors in the dataset | ||
| * @param dimensions Size of each vector in the dataset | ||
| * @return new instance of {@link Dataset} | ||
| */ | ||
| static Dataset create(int size, int dimensions) { | ||
| return CuVSProvider.provider().newDataset(size, dimensions); | ||
| } | ||
|
|
||
| /** | ||
| * Gets the size of the dataset | ||
| * | ||
| * <p>NOTE(review): presumably the number of vectors, matching the {@code size} | ||
| * passed to {@link #create(int, int)}; confirm whether this is the capacity | ||
| * or the number of vectors added so far. | ||
| * | ||
| * @return Size of the dataset | ||
| */ | ||
| public int size(); | ||
|
|
||
| /** | ||
| * Gets the dimensions of the vectors in this dataset | ||
| * | ||
| * @return Dimensions of the vectors in the dataset | ||
| */ | ||
| public int dimensions(); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,6 +43,7 @@ | |
| import com.nvidia.cuvs.BruteForceIndexParams; | ||
| import com.nvidia.cuvs.BruteForceQuery; | ||
| import com.nvidia.cuvs.CuVSResources; | ||
| import com.nvidia.cuvs.Dataset; | ||
| import com.nvidia.cuvs.SearchResults; | ||
| import com.nvidia.cuvs.internal.common.Util; | ||
| import com.nvidia.cuvs.internal.panama.cuvsBruteForceIndex; | ||
|
|
@@ -71,7 +72,8 @@ public class BruteForceIndexImpl implements BruteForceIndex{ | |
| private static final MethodHandle deserializeMethodHandle = downcallHandle("deserialize_brute_force_index", | ||
| FunctionDescriptor.ofVoid(ADDRESS, ADDRESS, ADDRESS, ADDRESS)); | ||
|
|
||
| private final float[][] dataset; | ||
| private final float[][] vectors; | ||
| private final Dataset dataset; | ||
| private final CuVSResourcesImpl resources; | ||
| private final IndexReference bruteForceIndexReference; | ||
| private final BruteForceIndexParams bruteForceIndexParams; | ||
|
|
@@ -86,8 +88,10 @@ public class BruteForceIndexImpl implements BruteForceIndex{ | |
| * @param bruteForceIndexParams an instance of {@link BruteForceIndexParams} | ||
| * holding the index parameters | ||
| */ | ||
| private BruteForceIndexImpl(float[][] dataset, CuVSResourcesImpl resources, BruteForceIndexParams bruteForceIndexParams) | ||
| private BruteForceIndexImpl(float[][] vectors, Dataset dataset, CuVSResourcesImpl resources, | ||
| BruteForceIndexParams bruteForceIndexParams) | ||
| throws Throwable { | ||
| this.vectors = vectors; | ||
| this.dataset = dataset; | ||
| this.resources = resources; | ||
| this.bruteForceIndexParams = bruteForceIndexParams; | ||
|
|
@@ -102,6 +106,7 @@ private BruteForceIndexImpl(float[][] dataset, CuVSResourcesImpl resources, Brut | |
| */ | ||
| private BruteForceIndexImpl(InputStream inputStream, CuVSResourcesImpl resources) throws Throwable { | ||
| this.bruteForceIndexParams = null; | ||
| this.vectors = null; | ||
| this.dataset = null; | ||
| this.resources = resources; | ||
| this.bruteForceIndexReference = deserialize(inputStream); | ||
|
|
@@ -127,6 +132,7 @@ public void destroyIndex() throws Throwable { | |
| } finally { | ||
| destroyed = true; | ||
| } | ||
| if (dataset != null) dataset.close(); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -137,10 +143,11 @@ public void destroyIndex() throws Throwable { | |
| * index | ||
| */ | ||
| private IndexReference build() throws Throwable { | ||
| long rows = dataset.length; | ||
| long cols = rows > 0 ? dataset[0].length : 0; | ||
| long rows = dataset != null? dataset.size(): vectors.length; | ||
| long cols = dataset != null? dataset.dimensions(): (rows > 0 ? vectors[0].length : 0); | ||
|
|
||
| MemorySegment dataSeg = Util.buildMemorySegment(resources.getArena(), dataset); | ||
| MemorySegment dataSeg = dataset != null? ((DatasetImpl) dataset).seg: | ||
| Util.buildMemorySegment(resources.getArena(), vectors); | ||
| try (var localArena = Arena.ofConfined()) { | ||
| MemorySegment returnValue = localArena.allocate(C_INT); | ||
| MemorySegment indexSeg = (MemorySegment) indexMethodHandle.invokeExact( | ||
|
|
@@ -284,7 +291,8 @@ public static BruteForceIndex.Builder newBuilder(CuVSResources cuvsResources) { | |
| */ | ||
| public static class Builder implements BruteForceIndex.Builder { | ||
|
|
||
| private float[][] dataset; | ||
| private float[][] vectors; | ||
| private Dataset dataset; | ||
|
Comment on lines
+294
to
+295
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we foresee deprecating the float[][] variant?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Didn't plan for it. I think many users might find the float[][] easier to use, but those who are conscious of performance might want to try the other. Do you suggest we deprecate the float[][]?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Very late here, but did you consider writing a second DatasetImpl to hold the float[][]? This way you would not need to keep two fields around, one of which is always null, and branch between them with an `if` every time in the various Index implementations. It would be easier to maintain. I can draft a quick PR to show what I mean if you are interested.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, externally you can provide data either as a float[][] or a Dataset, but internally it will just be a Dataset. |
||
| private final CuVSResourcesImpl cuvsResources; | ||
| private BruteForceIndexParams bruteForceIndexParams; | ||
| private InputStream inputStream; | ||
|
|
@@ -327,11 +335,23 @@ public Builder from(InputStream inputStream) { | |
| /** | ||
| * Sets the dataset for building the {@link BruteForceIndex}. | ||
| * | ||
| * @param dataset a two-dimensional float array | ||
| * @param vectors a two-dimensional float array | ||
| * @return an instance of this Builder | ||
| */ | ||
| @Override | ||
| public Builder withDataset(float[][] dataset) { | ||
| public Builder withDataset(float[][] vectors) { | ||
| this.vectors = vectors; | ||
| return this; | ||
| } | ||
|
|
||
| /** | ||
| * Sets the dataset for building the {@link BruteForceIndex}. | ||
| * | ||
| * <p>The reference is stored as-is and handed to the index on {@code build()}. | ||
| * When a {@link Dataset} is set, the index build reads rows, columns and the | ||
| * data segment from it in preference to any {@code float[][]} that was also | ||
| * supplied. The resulting index closes this dataset in {@code destroyIndex()}. | ||
| * | ||
| * @param dataset a {@link Dataset} object containing the vectors | ||
| * @return an instance of this Builder | ||
| */ | ||
| @Override | ||
| public Builder withDataset(Dataset dataset) { | ||
| this.dataset = dataset; | ||
| return this; | ||
| } | ||
|
|
@@ -346,7 +366,7 @@ public BruteForceIndexImpl build() throws Throwable { | |
| if (inputStream != null) { | ||
| return new BruteForceIndexImpl(inputStream, cuvsResources); | ||
| } else { | ||
| return new BruteForceIndexImpl(dataset, cuvsResources, bruteForceIndexParams); | ||
| return new BruteForceIndexImpl(vectors, dataset, cuvsResources, bruteForceIndexParams); | ||
| } | ||
| } | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.