Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
912 changes: 617 additions & 295 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,14 @@ similar = "3.0.0"
sketches-ddsketch = "0.4.0"
smallvec = "1.15.1"
smol = "2.0.2"
spatialbench = "0.2"
spatialbench-arrow = "0.2"
# spatialbench still pins arrow 56, two majors behind the workspace arrow. Until upstream
# catches up, write its generated batches with a matching parquet instead of converting
# arrow versions at the boundary.
spatialbench-parquet = { package = "parquet", version = "56", features = [
"async",
] }
static_assertions = "1.1"
strum = "0.28"
syn = { version = "2.0.117", features = ["full"] }
Expand Down
21 changes: 21 additions & 0 deletions benchmarks/duckdb-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ pub struct DuckClient {
connection: Option<Connection>,
pub db_path: PathBuf,
pub threads: Option<usize>,
/// `INSTALL spatial; LOAD spatial;` for SpatialBench.
init_sql: Vec<String>,
}

impl DuckClient {
Expand Down Expand Up @@ -67,9 +69,19 @@ impl DuckClient {
connection: Some(connection),
db_path,
threads,
init_sql: Vec::new(),
})
}

/// Run `statements` now and after every subsequent [`DuckClient::reopen`].
pub fn set_init_sql(&mut self, statements: Vec<String>) -> Result<()> {
for stmt in &statements {
self.connection().query(stmt)?;
}
self.init_sql = statements;
Ok(())
}

pub fn open_and_setup_database(
path: Option<PathBuf>,
threads: Option<usize>,
Expand Down Expand Up @@ -108,6 +120,14 @@ impl DuckClient {
self.db = Some(db);
self.connection = Some(connection);

// Replay init SQL (e.g. LOAD spatial).
for stmt in &self.init_sql {
self.connection
.as_ref()
.vortex_expect("connection just opened")
.query(stmt)?;
}

Ok(())
}

Expand All @@ -123,6 +143,7 @@ impl DuckClient {
connection: Some(connection),
db_path,
threads: None,
init_sql: Vec::new(),
})
}

Expand Down
3 changes: 2 additions & 1 deletion benchmarks/duckdb-bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,13 @@ fn main() -> anyhow::Result<()> {
&filtered_queries,
mode,
|format| {
let ctx = DuckClient::new(
let mut ctx = DuckClient::new(
&*benchmark,
format,
args.delete_duckdb_database,
args.threads,
)?;
ctx.set_init_sql(benchmark.engine_init_sql(Engine::DuckDB))?;
ctx.register_tables(&*benchmark, format)?;

// Duckdb doesn't support octet_length for strings but we need this
Expand Down
3 changes: 3 additions & 0 deletions vortex-bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ regex = { workspace = true }
reqwest = { workspace = true, features = ["stream"] }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
spatialbench = { workspace = true }
spatialbench-arrow = { workspace = true }
spatialbench-parquet = { workspace = true }
sysinfo = { workspace = true }
tabled = { workspace = true, features = ["std"] }
target-lexicon = { workspace = true }
Expand Down
172 changes: 172 additions & 0 deletions vortex-bench/spatialbench.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
-- SpatialBench queries (DuckDB dialect), from sedona-spatialbench DuckDBSpatialBenchBenchmark
-- (spatialbench-queries/print_queries.py). Query logic is unchanged, only reformatted for readability
-- and numbered Q1..Q12 (canonical order). The harness splits the file on semicolons, so a comment
-- must never contain one.

-- Q1: trips starting within 50km of Sedona city center, ordered by distance.
SELECT
t.t_tripkey,
ST_X(ST_GeomFromWKB(t.t_pickuploc)) AS pickup_lon,
ST_Y(ST_GeomFromWKB(t.t_pickuploc)) AS pickup_lat,
t.t_pickuptime,
ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromText('POINT (-111.7610 34.8697)')) AS distance_to_center
FROM trip t
WHERE ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromText('POINT (-111.7610 34.8697)'), 0.45)
ORDER BY distance_to_center ASC, t.t_tripkey ASC;

-- Q2: count trips starting within the Coconino County (Arizona) zone.
SELECT COUNT(*) AS trip_count_in_coconino_county
FROM trip t
WHERE ST_Intersects(
ST_GeomFromWKB(t.t_pickuploc),
(SELECT ST_GeomFromWKB(z.z_boundary) FROM zone z WHERE z.z_name = 'Coconino County' LIMIT 1)
);

-- Q3: monthly trip statistics within 15km of Sedona city center (10km bbox + 5km buffer).
SELECT
DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
COUNT(t.t_tripkey) AS total_trips,
AVG(t.t_distance) AS avg_distance,
AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration,
AVG(t.t_fare) AS avg_fare
FROM trip t
WHERE ST_DWithin(
ST_GeomFromWKB(t.t_pickuploc),
ST_GeomFromText('POLYGON((-111.9060 34.7347, -111.6160 34.7347, -111.6160 35.0047, -111.9060 35.0047, -111.9060 34.7347))'),
0.045
)
GROUP BY pickup_month
ORDER BY pickup_month;

-- Q4: zone distribution of the top 1000 trips by tip amount.
SELECT z.z_zonekey, z.z_name, COUNT(*) AS trip_count
FROM zone z
JOIN (
SELECT t.t_pickuploc
FROM trip t
ORDER BY t.t_tip DESC, t.t_tripkey ASC
LIMIT 1000
) top_trips ON ST_Within(ST_GeomFromWKB(top_trips.t_pickuploc), ST_GeomFromWKB(z.z_boundary))
GROUP BY z.z_zonekey, z.z_name
ORDER BY trip_count DESC, z.z_zonekey ASC;

-- Q5: monthly travel patterns for repeat customers (convex hull of dropoff locations).
SELECT
c.c_custkey,
c.c_name AS customer_name,
DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
ST_Area(ST_ConvexHull(ST_Collect(ARRAY_AGG(ST_GeomFromWKB(t.t_dropoffloc))))) AS monthly_travel_hull_area,
COUNT(*) AS dropoff_count
FROM trip t
JOIN customer c ON t.t_custkey = c.c_custkey
GROUP BY c.c_custkey, c.c_name, pickup_month
HAVING dropoff_count > 5
ORDER BY dropoff_count DESC, c.c_custkey ASC;

-- Q6: zone statistics for trips intersecting a bounding box.
SELECT
z.z_zonekey,
z.z_name,
COUNT(t.t_tripkey) AS total_pickups,
AVG(t.t_totalamount) AS avg_distance,
AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration
FROM trip t, zone z
WHERE ST_Intersects(
ST_GeomFromText('POLYGON((-112.2110 34.4197, -111.3110 34.4197, -111.3110 35.3197, -112.2110 35.3197, -112.2110 34.4197))'),
ST_GeomFromWKB(z.z_boundary)
)
AND ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary))
GROUP BY z.z_zonekey, z.z_name
ORDER BY total_pickups DESC, z.z_zonekey ASC;

-- Q7: detect potential route detours by comparing reported vs. geometric distances.
WITH trip_lengths AS (
SELECT
t.t_tripkey,
t.t_distance AS reported_distance_m,
ST_Length(ST_MakeLine(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(t.t_dropoffloc))) / 0.000009 AS line_distance_m
FROM trip t
)
SELECT
t.t_tripkey,
t.reported_distance_m,
t.line_distance_m,
t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio
FROM trip_lengths t
ORDER BY detour_ratio DESC NULLS LAST, reported_distance_m DESC, t_tripkey ASC;

-- Q8: count nearby pickups for each building within ~500m.
SELECT b.b_buildingkey, b.b_name, COUNT(*) AS nearby_pickup_count
FROM trip t
JOIN building b ON ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary), 0.0045)
GROUP BY b.b_buildingkey, b.b_name
ORDER BY nearby_pickup_count DESC, b.b_buildingkey ASC;

-- Q9: building conflation (duplicate/overlap detection via IoU).
WITH b1 AS (
SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom FROM building
),
b2 AS (
SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom FROM building
),
pairs AS (
SELECT
b1.id AS building_1,
b2.id AS building_2,
ST_Area(b1.geom) AS area1,
ST_Area(b2.geom) AS area2,
ST_Area(ST_Intersection(b1.geom, b2.geom)) AS overlap_area
FROM b1
JOIN b2 ON b1.id < b2.id AND ST_Intersects(b1.geom, b2.geom)
)
SELECT
building_1,
building_2,
area1,
area2,
overlap_area,
CASE
WHEN overlap_area = 0 THEN 0.0
WHEN (area1 + area2 - overlap_area) = 0 THEN 1.0
ELSE overlap_area / (area1 + area2 - overlap_area)
END AS iou
FROM pairs
ORDER BY iou DESC, building_1 ASC, building_2 ASC;

-- Q10: zone statistics for trips starting within each zone.
SELECT
z.z_zonekey,
z.z_name AS pickup_zone,
AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration,
AVG(t.t_distance) AS avg_distance,
COUNT(t.t_tripkey) AS num_trips
FROM zone z
LEFT JOIN trip t ON ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary))
GROUP BY z.z_zonekey, z.z_name
ORDER BY avg_duration DESC NULLS LAST, z.z_zonekey ASC;

-- Q11: count trips that cross between different zones.
SELECT COUNT(*) AS cross_zone_trip_count
FROM trip t
JOIN zone pickup_zone ON ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(pickup_zone.z_boundary))
JOIN zone dropoff_zone ON ST_Within(ST_GeomFromWKB(t.t_dropoffloc), ST_GeomFromWKB(dropoff_zone.z_boundary))
WHERE pickup_zone.z_zonekey != dropoff_zone.z_zonekey;

-- Q12: five nearest buildings per trip pickup (CROSS JOIN LATERAL, since DuckDB spatial has no ST_KNN).
SELECT
t.t_tripkey,
t.t_pickuploc,
nb.b_buildingkey,
nb.building_name,
nb.distance_to_building
FROM trip t
CROSS JOIN LATERAL (
SELECT
b.b_buildingkey,
b.b_name AS building_name,
ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary)) AS distance_to_building
FROM building b
ORDER BY distance_to_building
LIMIT 5
) AS nb
ORDER BY nb.distance_to_building, nb.b_buildingkey;
7 changes: 7 additions & 0 deletions vortex-bench/src/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use glob::Pattern;
use url::Url;

use crate::BenchmarkDataset;
use crate::Engine;
use crate::Format;

/// Specification for a table in a benchmark dataset.
Expand All @@ -32,6 +33,12 @@ pub trait Benchmark: Send + Sync {
/// Get all available queries for this benchmark
fn queries(&self) -> anyhow::Result<Vec<(usize, String)>>;

/// SQL an `engine` must run before this benchmark's queries (e.g. loading engine
/// extensions). Runners replay these after every (re)open. Default: none.
fn engine_init_sql(&self, _engine: Engine) -> Vec<String> {
Vec::new()
}

/// Generate or prepare base data for the benchmark (typically Parquet format).
/// This is the canonical source data that can be converted to other formats.
/// This should be idempotent - safe to call multiple times.
Expand Down
7 changes: 7 additions & 0 deletions vortex-bench/src/datasets/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ pub enum BenchmarkDataset {
ClickBenchSorted,
#[serde(rename = "public-bi")]
PublicBi { name: String },
#[serde(rename = "spatialbench")]
SpatialBench { scale_factor: String },
#[serde(rename = "statpopgen")]
StatPopGen { n_rows: u64 },
#[serde(rename = "polarsignals")]
Expand All @@ -90,6 +92,7 @@ impl BenchmarkDataset {
BenchmarkDataset::ClickBench { .. } => "clickbench",
BenchmarkDataset::ClickBenchSorted => "clickbench-sorted",
BenchmarkDataset::PublicBi { .. } => "public-bi",
BenchmarkDataset::SpatialBench { .. } => "spatialbench",
BenchmarkDataset::StatPopGen { .. } => "statpopgen",
BenchmarkDataset::PolarSignals { .. } => "polarsignals",
BenchmarkDataset::Fineweb => "fineweb",
Expand All @@ -110,6 +113,9 @@ impl Display for BenchmarkDataset {
},
BenchmarkDataset::ClickBenchSorted => write!(f, "clickbench-sorted"),
BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"),
BenchmarkDataset::SpatialBench { scale_factor } => {
write!(f, "spatialbench(sf={scale_factor})")
}
BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"),
BenchmarkDataset::PolarSignals { n_rows } => {
write!(f, "polarsignals(n_rows={n_rows})")
Expand Down Expand Up @@ -168,6 +174,7 @@ impl BenchmarkDataset {
],
BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::ClickBenchSorted => &["hits"],
BenchmarkDataset::PublicBi { .. } => todo!(),
BenchmarkDataset::SpatialBench { .. } => &["trip", "building", "zone"],
BenchmarkDataset::StatPopGen { .. } => &["statpopgen"],
BenchmarkDataset::PolarSignals { .. } => &["stacktraces"],
BenchmarkDataset::Fineweb => &["fineweb"],
Expand Down
11 changes: 11 additions & 0 deletions vortex-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ use vortex::file::VortexWriteOptions;
use vortex::file::WriteStrategyBuilder;
use vortex::utils::aliases::hash_map::HashMap;

use crate::spatialbench::SpatialBenchBenchmark;

pub mod appian;
pub mod benchmark;
pub mod clickbench;
Expand All @@ -52,6 +54,7 @@ pub mod public_bi;
pub mod random_access;
pub mod realnest;
pub mod runner;
pub mod spatialbench;
pub mod statpopgen;
pub mod tpcds;
pub mod tpch;
Expand Down Expand Up @@ -268,6 +271,8 @@ pub enum BenchmarkArg {
PolarSignals,
#[clap(name = "public-bi")]
PublicBi,
#[clap(name = "spatialbench")]
SpatialBench,
}

/// Default scale factor for TPC-related benchmarks
Expand Down Expand Up @@ -334,6 +339,12 @@ pub fn create_benchmark(b: BenchmarkArg, opts: &Opts) -> anyhow::Result<Box<dyn
let benchmark = PublicBiBenchmark::new(dataset)?;
Ok(Box::new(benchmark) as _)
}
BenchmarkArg::SpatialBench => {
let scale_factor = opts.get(SCALE_FACTOR_KEY).unwrap_or(DEFAULT_SCALE_FACTOR);
let remote_data_dir = opts.get_as::<String>(REMOTE_DATA_KEY);
let benchmark = SpatialBenchBenchmark::new(scale_factor.to_string(), remote_data_dir)?;
Ok(Box::new(benchmark) as _)
}
}
}

Expand Down
Loading
Loading