Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion java/settings.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ toolchainManagement {

rootProject.name = "vortex-root"

// API bindings
// API bindings (JMH benchmarks live in vortex-jni's `jmh` source set; see vortex-jni/BENCHMARKS.md)
include("vortex-jni")

// Spark integration
include("vortex-spark_2.12")
project(":vortex-spark_2.12").projectDir = file("vortex-spark")

Expand Down
143 changes: 141 additions & 2 deletions java/vortex-jni/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
// SPDX-FileCopyrightText: Copyright the Vortex contributors

import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
import net.ltgt.gradle.errorprone.errorprone
import org.gradle.kotlin.dsl.support.serviceOf

plugins {
`java-library`
`jvm-test-suite`
id("com.gradleup.shadow") version "9.4.2"
id("me.champeau.jmh") version "0.7.3"
}

dependencies {
Expand Down Expand Up @@ -92,8 +94,9 @@ tasks.withType<Test>().all {
)
}

// shade guava and arrow dependencies
tasks.withType<ShadowJar> {
// shade guava and arrow dependencies in the published jar only. The JMH benchmark links the real
// (unrelocated) Arrow classes, so its jar must not be relocated — scope this to the `shadowJar` task.
tasks.named<ShadowJar>("shadowJar") {
relocate("com.google.common", "dev.vortex.relocated.com.google.common")
relocate("org.apache.arrow", "dev.vortex.relocated.org.apache.arrow") {
// exclude C Data Interface since JNI cannot be relocated
Expand Down Expand Up @@ -214,4 +217,140 @@ tasks.register("generateJniHeaders") {
dependsOn("compileJava")
}

// ---------------------------------------------------------------------------
// JMH benchmarks (src/jmh). See BENCHMARKS.md.
//
// The read-boundary benchmark is meaningless against a debug native lib, so the `jmh` task builds
// and stages the release_debug cdylib itself (buildJmhNativeLib) rather than reusing the dev
// `makeTestFiles` debug build. The benchmark links the real Arrow classes off the runtime classpath
// (it is not run from the relocated shadowJar), so no relocation applies to it.
// ---------------------------------------------------------------------------
// Shared canonical benchmark file, generated by the Rust side and read by BOTH the JMH benchmark and
// the Rust `read_boundary` Divan bench so the two measure reads of the exact same bytes.
val workspaceRoot = rootProject.projectDir.absoluteFile.parentFile
val benchFile = workspaceRoot.resolve("target/vortex-jni-bench/data.vortex")

jmh {
    // Arguments appended to the forked benchmark JVM: the two --add-opens are needed by the Arrow
    // C Data Interface, and the system property hands the benchmark the shared canonical file path.
    val forkedJvmArgs =
        listOf(
            "--add-opens=java.base/java.nio=ALL-UNNAMED",
            "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
            "-Dvortex.jni.bench.file=${benchFile.absolutePath}",
        )

    jmhVersion.set("1.37")
    jvmArgsAppend.addAll(forkedJvmArgs)
}

// Produces the shared canonical .vortex file by running the Rust `gen_bench_data` example through
// cargo. The file is the task's only declared output, so the task is skipped while the file exists
// (delete it to regenerate). Both `jmh` and the Rust bench read this file.
val generateBenchFile =
    tasks.register("generateBenchFile") {
        group = "verification"
        description = "Generate the shared canonical .vortex file read by the JMH and Rust read benchmarks"

        outputs.file(benchFile)

        doLast {
            // Everything after "--" is passed to the example itself: the destination path.
            val generatorArgs =
                listOf(
                    "run",
                    "--profile",
                    "release_debug",
                    "--quiet",
                    "--package",
                    "vortex-jni",
                    "--example",
                    "gen_bench_data",
                    "--",
                    benchFile.absolutePath,
                )

            benchFile.parentFile.mkdirs()
            serviceOf<ExecOperations>().exec {
                workingDir = workspaceRoot
                executable = "cargo"
                args(generatorArgs)
            }
        }
    }

// JMH needs public, non-final benchmark classes/methods (rejected by the nopen checker), and the
// JMH-generated glue code trips error-prone under -Werror. Only compile tasks whose name contains
// "jmh" are relaxed; main and test keep full strictness.
tasks.withType<JavaCompile>().configureEach {
    val isJmhCompile = name.lowercase().contains("jmh")
    if (!isJmhCompile) {
        return@configureEach
    }
    options.errorprone.enabled.set(false)
    options.compilerArgs.remove("-Werror")
}

// When this invocation schedules `jmh` or `jmhJar` for this project, the debug `makeTestFiles`
// build is redundant: the benchmark uses the release_debug lib staged by buildJmhNativeLib.
// The flag defaults to false and is resolved once the task graph is known.
val benchmarkRequested = objects.property(Boolean::class.java).convention(false)
gradle.taskGraph.whenReady {
    val benchmarkTaskNames = setOf("jmh", "jmhJar")
    val scheduled = allTasks.any { task -> task.project == project && task.name in benchmarkTaskNames }
    benchmarkRequested.set(scheduled)
}
tasks.named("makeTestFiles") {
    onlyIf { !benchmarkRequested.get() }
}

// Builds the vortex-jni cdylib with the `release_debug` cargo profile and copies it into
// build/resources/main/native/<os>-<arch>/ so the benchmark loads it from the runtime classpath.
val buildJmhNativeLib =
    tasks.register("buildJmhNativeLib") {
        description = "Build the release_debug vortex-jni cdylib and stage it for the JMH benchmark"
        group = "verification"

        // Stage on top of the processed resources so the benchmark loads it from the runtime classpath.
        dependsOn("processResources")

        doLast {
            serviceOf<ExecOperations>().exec {
                workingDir = workspaceRoot
                executable = "cargo"
                args("build", "--profile", "release_debug", "--package", "vortex-jni")
            }

            val osName = System.getProperty("os.name").lowercase()
            val osArch = System.getProperty("os.arch").lowercase()
            val osShortName =
                when {
                    osName.contains("mac") -> "darwin"
                    osName.contains("nix") || osName.contains("nux") -> "linux"
                    osName.contains("win") -> "win"
                    else -> throw GradleException("Unsupported OS for buildJmhNativeLib: $osName")
                }
            val libExt =
                when (osShortName) {
                    "darwin" -> ".dylib"
                    "linux" -> ".so"
                    "win" -> ".dll"
                    else -> throw GradleException("Unsupported OS short name: $osShortName")
                }

            // Cargo emits cdylibs as `lib<name><ext>` on macOS/Linux but as `<name>.dll` on Windows,
            // so the file to pick up from target/ has no `lib` prefix there.
            val builtLibName = if (osShortName == "win") "vortex_jni$libExt" else "libvortex_jni$libExt"
            // The staged name keeps the `lib` prefix on every OS, as this task always did.
            // NOTE(review): confirm the native loader looks up `libvortex_jni.dll` on Windows.
            val stagedLibName = "libvortex_jni$libExt"

            // `copy` silently does nothing when its source is missing; fail loudly instead so a
            // benchmark run never proceeds without the freshly built library.
            val builtLib = workspaceRoot.resolve("target/release_debug/$builtLibName")
            if (!builtLib.isFile) {
                throw GradleException("buildJmhNativeLib: expected native library not found at $builtLib")
            }

            copy {
                from(builtLib)
                into(layout.buildDirectory.dir("resources/main/native/$osShortName-$osArch"))
                rename { stagedLibName }
            }
        }
    }

// `jmh` needs both the staged native lib and the shared data file; `jmhJar` only needs the lib.
tasks.named("jmh") {
    dependsOn(buildJmhNativeLib, generateBenchFile)
}
tasks.named("jmhJar") {
    dependsOn(buildJmhNativeLib)
}

// Runs VortexJniBatchDiagnostic, a plain `main` program rather than a JMH benchmark. It executes on
// the jmh source set's runtime classpath, which carries the real Arrow classes and the staged
// release_debug lib (me.champeau.jmh's fat `jmhJar` does not bundle deps under com.gradleup.shadow).
tasks.register<JavaExec>("batchDiagnostic") {
    group = "verification"
    description = "Run the standalone read-batch-granularity diagnostic (VortexJniBatchDiagnostic)"

    dependsOn(buildJmhNativeLib)

    mainClass.set("dev.vortex.bench.VortexJniBatchDiagnostic")
    classpath = sourceSets["jmh"].runtimeClasspath

    val addOpens =
        listOf(
            "--add-opens=java.base/java.nio=ALL-UNNAMED",
            "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
        )
    jvmArgs(addOpens)
}

description = "JNI bindings for the Vortex format"
99 changes: 99 additions & 0 deletions java/vortex-jni/src/jmh/java/dev/vortex/bench/BenchData.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

package dev.vortex.bench;

import static java.nio.charset.StandardCharsets.UTF_8;

import dev.vortex.api.Session;
import dev.vortex.api.VortexWriter;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import org.apache.arrow.c.ArrowArray;
import org.apache.arrow.c.ArrowSchema;
import org.apache.arrow.c.Data;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ViewVarCharVector;
import org.apache.arrow.vector.types.FloatingPointPrecision;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

/**
* Shared synthetic table used by both {@link VortexJniReadBenchmark} and {@link VortexJniBatchDiagnostic} so they
* measure and inspect the exact same data shape: six columns (2× int64, 2× float64, 2× Utf8View) over {@link #ROWS}
* rows, with a deterministic fixed seed.
*
* <p>{@code id} is sequential. {@code cat} is a periodic low-cardinality column: its field is declared nullable, but
* every row is written with a value, so a {@code cat='alpha'} filter has selectivity exactly {@code 1/|CATS|}.
* {@code tag} is high-cardinality with a 10% null rate to exercise a validity buffer.
*/
final class BenchData {

    /** Total number of rows written by {@link #writeTable}. */
    static final int ROWS = 2_000_000;

    /** Values cycled through, in order, for the {@code cat} column. */
    static final String[] CATS = {
        "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel",
        "india", "juliet", "kilo", "lima", "mike", "november", "oscar", "papa"
    };

    private BenchData() {}

    /**
     * Builds the six-column Arrow schema of the table: {@code id} and {@code x} (signed 64-bit ints), {@code y} and
     * {@code z} (doubles), all non-nullable, plus {@code cat} and {@code tag} (Utf8View), both declared nullable.
     *
     * @return a new schema instance
     */
    static Schema schema() {
        return new Schema(List.of(
                Field.notNullable("id", new ArrowType.Int(64, true)),
                Field.notNullable("x", new ArrowType.Int(64, true)),
                Field.notNullable("y", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
                Field.notNullable("z", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)),
                Field.nullable("cat", ArrowType.Utf8View.INSTANCE),
                Field.nullable("tag", ArrowType.Utf8View.INSTANCE)));
    }

    /**
     * Writes {@link #ROWS} rows to {@code uri} in batches of at most {@code chunk} rows, exporting each batch
     * through the Arrow C Data Interface and handing its addresses to the writer.
     *
     * <p>Random columns are drawn from a {@link Random} seeded with 42, so the values do not vary between runs for a
     * given sequence of draws.
     *
     * @param session session used to create the writer
     * @param allocator allocator for the vectors and the exported C structures
     * @param uri destination passed to {@link VortexWriter#create}
     * @param chunk maximum rows per written batch; must be positive
     * @throws IllegalArgumentException if {@code chunk} is not positive
     * @throws Exception if writing or closing a resource fails
     */
    static void writeTable(Session session, BufferAllocator allocator, String uri, int chunk) throws Exception {
        // A non-positive chunk would never advance `written`, so the loop below would not terminate.
        if (chunk <= 0) {
            throw new IllegalArgumentException("chunk must be positive, got " + chunk);
        }
        Schema schema = schema();
        Random rnd = new Random(42);
        try (VortexWriter writer = VortexWriter.create(session, uri, schema, new HashMap<>(), allocator);
                VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
            BigIntVector id = (BigIntVector) root.getVector("id");
            BigIntVector x = (BigIntVector) root.getVector("x");
            Float8Vector y = (Float8Vector) root.getVector("y");
            Float8Vector z = (Float8Vector) root.getVector("z");
            ViewVarCharVector cat = (ViewVarCharVector) root.getVector("cat");
            ViewVarCharVector tag = (ViewVarCharVector) root.getVector("tag");

            long written = 0;
            while (written < ROWS) {
                // The final batch may be shorter than `chunk`.
                int batch = (int) Math.min(chunk, ROWS - written);
                // The same vectors are reused for every batch, so reset them before refilling.
                for (FieldVector v : root.getFieldVectors()) {
                    v.reset();
                }
                for (int i = 0; i < batch; i++) {
                    long r = written + i;
                    id.setSafe(i, r);
                    x.setSafe(i, rnd.nextInt(1_000_000));
                    y.setSafe(i, rnd.nextDouble());
                    z.setSafe(i, rnd.nextDouble());
                    // cat stays non-null and deterministic so filter selectivity is exactly 1/|CATS|.
                    cat.setSafe(i, CATS[(int) (r % CATS.length)].getBytes(UTF_8));
                    // tag carries nulls (every 10th row) and high-cardinality values to exercise a validity buffer.
                    if (r % 10 == 0) {
                        tag.setNull(i);
                    } else {
                        tag.setSafe(i, Long.toString(r).getBytes(UTF_8));
                    }
                }
                root.setRowCount(batch);
                try (ArrowArray arr = ArrowArray.allocateNew(allocator);
                        ArrowSchema sch = ArrowSchema.allocateNew(allocator)) {
                    Data.exportVectorSchemaRoot(allocator, root, null, arr, sch);
                    writer.writeBatch(arr.memoryAddress(), sch.memoryAddress());
                }
                written += batch;
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

package dev.vortex.bench;

import dev.vortex.api.DataSource;
import dev.vortex.api.Partition;
import dev.vortex.api.Scan;
import dev.vortex.api.ScanOptions;
import dev.vortex.api.Session;
import dev.vortex.jni.NativeLoader;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ipc.ArrowReader;

/**
* Standalone diagnostic (not a JMH benchmark): writes the shared {@link BenchData} table at several writer chunk sizes
* and prints the resulting read-batch row-count distribution, showing that Vortex coalesces to a stable read-batch
* granularity (~64K rows) independent of how the file was written.
*
* <p>Run it with {@code ./gradlew :vortex-jni:batchDiagnostic}.
*/
public final class VortexJniBatchDiagnostic {

    /** Writer chunk sizes (rows per written batch) exercised by the diagnostic, in order. */
    private static final int[] WRITE_CHUNK_ROWS = {8192, 65536, 131072};

    private VortexJniBatchDiagnostic() {}

    /**
     * Loads the native library, then runs the diagnostic once per writer chunk size.
     *
     * @param args ignored
     * @throws Exception if writing or reading a table fails
     */
    public static void main(String[] args) throws Exception {
        NativeLoader.loadJni();
        for (int chunk : WRITE_CHUNK_ROWS) {
            reportReadBatches(chunk);
        }
    }

    /**
     * Writes the {@link BenchData} table to a temp file using {@code chunk} rows per written batch, scans it back,
     * and prints the number of read batches with their min/max/average row counts. The temp file is removed even
     * when the write or the scan throws.
     */
    private static void reportReadBatches(int chunk) throws Exception {
        // NOTE(review): alloc, sess, ds and scan are never closed here — confirm whether they need explicit release.
        BufferAllocator alloc = new RootAllocator(Long.MAX_VALUE);
        Session sess = Session.create();
        // createTempFile is used only to reserve a unique path; the empty file is removed so the writer creates it.
        Path f = Files.createTempFile("vortex-jni-diag-" + chunk + "-", ".vortex");
        Files.deleteIfExists(f);
        try {
            String uri = f.toAbsolutePath().toUri().toString();
            BenchData.writeTable(sess, alloc, uri, chunk);
            DataSource ds = DataSource.open(sess, uri);
            long batches = 0;
            long rowsSeen = 0;
            long minRows = Long.MAX_VALUE;
            long maxRows = 0;
            Scan scan = ds.scan(ScanOptions.of());
            while (scan.hasNext()) {
                Partition partition = scan.next();
                try (ArrowReader reader = partition.scanArrow(alloc)) {
                    while (reader.loadNextBatch()) {
                        int rows = reader.getVectorSchemaRoot().getRowCount();
                        batches++;
                        rowsSeen += rows;
                        minRows = Math.min(minRows, rows);
                        maxRows = Math.max(maxRows, rows);
                    }
                }
            }
            // With no batches, minRows is still its Long.MAX_VALUE sentinel; report 0 like the average does.
            System.out.printf(
                    "writeChunkRows=%d -> %d read batches over %d rows (min=%d, max=%d, avg=%d)%n",
                    chunk,
                    batches,
                    rowsSeen,
                    batches == 0 ? 0 : minRows,
                    maxRows,
                    batches == 0 ? 0 : rowsSeen / batches);
        } finally {
            // The table is 2M rows; do not leave it behind in the temp directory on failure.
            Files.deleteIfExists(f);
        }
    }
}
Loading
Loading