From fb760372566c1de45ec98bc626ab294e4507aed4 Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Mon, 22 Jun 2026 13:30:52 -0400 Subject: [PATCH] feat: duckdb-vortex over spatialbench infra Signed-off-by: Nemo Yu --- Cargo.lock | 929 ++++++++++++------ Cargo.toml | 9 + benchmarks/duckdb-bench/src/lib.rs | 22 + benchmarks/duckdb-bench/src/main.rs | 3 +- vortex-bench/Cargo.toml | 6 + vortex-bench/spatialbench.sql | 228 +++++ vortex-bench/src/benchmark.rs | 8 + vortex-bench/src/datasets/mod.rs | 18 + vortex-bench/src/lib.rs | 27 +- vortex-bench/src/spatialbench/benchmark.rs | 200 ++++ vortex-bench/src/spatialbench/datagen/mod.rs | 15 + .../src/spatialbench/datagen/native.rs | 179 ++++ .../src/spatialbench/datagen/table.rs | 63 ++ vortex-bench/src/spatialbench/datagen/wkb.rs | 99 ++ vortex-bench/src/spatialbench/mod.rs | 11 + vortex-bench/src/v3.rs | 9 + vortex-duckdb/src/convert/dtype.rs | 13 +- vortex-duckdb/src/convert/expr.rs | 151 ++- vortex-duckdb/src/exporter/extension.rs | 12 + vortex-duckdb/src/exporter/geo.rs | 23 + vortex-geo/Cargo.toml | 1 + vortex-geo/src/extension/point.rs | 65 +- vortex-geo/src/extension/polygon.rs | 51 +- vortex-geo/src/scalar_fn/distance.rs | 85 +- 24 files changed, 1894 insertions(+), 333 deletions(-) create mode 100644 vortex-bench/spatialbench.sql create mode 100644 vortex-bench/src/spatialbench/benchmark.rs create mode 100644 vortex-bench/src/spatialbench/datagen/mod.rs create mode 100644 vortex-bench/src/spatialbench/datagen/native.rs create mode 100644 vortex-bench/src/spatialbench/datagen/table.rs create mode 100644 vortex-bench/src/spatialbench/datagen/wkb.rs create mode 100644 vortex-bench/src/spatialbench/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 21e54dc1a3a..baa07f020a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -196,25 +196,57 @@ version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe" +[[package]] +name = "arrow" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb98341a7e051bb79731ecb33ec00cbd6e0e315a542d6732b46d462c9215ea2" +dependencies = [ + "arrow-arith 56.2.1", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ord 56.2.1", + "arrow-row 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "arrow-string 56.2.1", +] + [[package]] name = "arrow" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", "arrow-csv", - "arrow-data", - "arrow-ipc", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", +] + +[[package]] +name = "arrow-arith" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce4751cbc4bcccfeeea79df9571ff1dc066d61e44723c7604d11c7937f5b560" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "num", ] [[package]] @@ -223,14 +255,30 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "num-traits", ] +[[package]] +name = "arrow-array" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b02ccba2e977a3aabb4384036109ca32f552399a2bc0588f925f91ed073ce70c" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "chrono", + "half", + "hashbrown 0.16.1", + "num", +] + [[package]] name = "arrow-array" version = "58.3.0" @@ -238,9 +286,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash 0.8.12", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "chrono", "chrono-tz", "half", @@ -256,9 +304,9 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "049230728cd6e093088c8d231b4beede184e35cad7777c1505c0d5a8571f4376" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "bytes", "bzip2", "crc", @@ -274,6 +322,17 @@ dependencies = [ "zstd", ] +[[package]] +name = "arrow-buffer" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90f8bece6a9ee316a699fbbfde368a206676a1206ce89b50f07937648e76c3c" +dependencies = [ + "bytes", + "half", + "num", +] + [[package]] name = "arrow-buffer" version = "58.3.0" @@ -286,18 +345,39 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-cast" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ffe645cfb4e80b1ca37a3a106ce7b4af66ccdd60c655a57e6b9aab096164a7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + [[package]] name = "arrow-cast" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "atoi", "base64", "chrono", @@ -314,41 +394,67 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "csv", "csv-core", "regex", ] +[[package]] +name = "arrow-data" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78468c813909465dd0f858950c8a0614eb63608134acf95c602ec21381258b28" +dependencies = [ + "arrow-buffer 56.2.1", + "arrow-schema 56.2.1", + "half", + "num", +] + [[package]] name = "arrow-data" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "half", "num-integer", "num-traits", ] +[[package]] +name = "arrow-ipc" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f88b0fbb33af28089ccd3e4dcd0ff09de46842168d00220b920f7231feddf5" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "flatbuffers", +] + [[package]] name = "arrow-ipc" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "flatbuffers", - "lz4_flex", + "lz4_flex 0.13.1", "zstd", ] @@ -358,12 +464,12 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -377,17 +483,43 @@ dependencies = [ "simdutf8", ] +[[package]] +name = "arrow-ord" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed58a38c3db0a2cf75ef70e3cb6bc4bd0da0a3d390de37c36139b31fae826e8" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", +] + [[package]] name = "arrow-ord" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", +] + +[[package]] +name = "arrow-row" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "079ced0517daf4f09b070d09ff641cee7cc331aa216bebcb25d1a6474ad53086" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "half", ] [[package]] @@ -396,13 +528,19 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "half", ] +[[package]] +name = "arrow-schema" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a0d5eb3fe25337ff83e8333a08379bdd1540b0961b1c888f6e505d971c198e1" + [[package]] name = "arrow-schema" version = "58.3.0" @@ -414,6 +552,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow-select" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2368a78bd32902dba39d52519d70f63799c8b5dc8a9477129a30c2fd3dc70c19" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "num", +] + [[package]] name = "arrow-select" version = "58.3.0" @@ -421,24 +573,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "num-traits", ] +[[package]] +name = "arrow-string" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dece58a130b9187756ded8bc071bd8ee9dd7a146566af244b297c7e632fd1ef7" +dependencies = [ + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-data 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name = "arrow-string" version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "memchr", "num-traits", "regex", @@ -1392,11 +1561,12 @@ dependencies = [ [[package]] name = "comfy-table" -version = "7.2.2" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "unicode-segmentation", + "strum 0.26.3", + "strum_macros 0.26.4", "unicode-width 0.2.2", ] @@ -1419,8 +1589,8 @@ name = "compress-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "clap", @@ -1428,7 +1598,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "lance-bench", - "parquet", + "parquet 58.3.0", "regex", "tokio", "tracing", @@ -1944,8 +2114,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bytes", "chrono", @@ -1993,8 +2163,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "997a31e15872606a49478e670c58302094c97cb96abb0a7d60720f8e92170040" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "bzip2", "chrono", @@ -2032,7 +2202,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", + "parquet 58.3.0", "sqlparser 0.62.0", "tempfile", "tokio", @@ -2073,7 +2243,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 53.1.0", @@ -2098,7 +2268,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7dd61161508f8f5fa1107774ea687bd753c22d83a32eebf963549f89de14139" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2123,7 +2293,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2146,7 +2316,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897c70f871277f9ce99aa38347be0d679bbe3e617156c4d2a8378cec8a2a0891" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 54.0.0", @@ -2170,8 +2340,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "chrono", "half", "hashbrown 0.16.1", @@ -2192,9 +2362,9 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c9ded5d87d9172319e006f2afdb9928d72dbacd6a90a458d8acb1e3b43a65" dependencies = [ - "arrow", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "chrono", "foldhash 0.2.0", "half", @@ -2204,7 +2374,7 @@ dependencies = [ "libc", "log", "object_store", - "parquet", + "parquet 58.3.0", "recursive", "sqlparser 0.62.0", "tokio", @@ -2240,7 +2410,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "chrono", @@ -2269,7 +2439,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd7d295b2ec7c00d8a56562f41ed41062cf0af75549ed891c12a0a09eddfefe" dependencies = [ - "arrow", + "arrow 58.3.0", "async-compression", "async-trait", "bytes", @@ -2305,8 +2475,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2329,8 +2499,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552b0b3f342f7ec41b3fbd70f6339dc82a30cfd0349e7f280e7852528085349f" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.3.0", + "arrow-ipc 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2353,7 +2523,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb517d08967d536284ce70afb5fe8583133779249f2d7b90587d339741a7f195" dependencies = [ - "arrow", + "arrow 58.3.0", "arrow-avro", "async-trait", "bytes", @@ -2372,7 +2542,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2395,7 +2565,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68850aa426b897e879c8b87e512ea8124f1d0a2869a4e51808ddaaddf1bc0ada" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2418,7 +2588,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 53.1.0", @@ -2442,7 +2612,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "402f93242ae08ef99139ee2c528a49d087efe88d5c7b2c3ff5480855a40ce54f" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2465,7 +2635,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffd2499c1bee0eeccf6a57156105700eeeb17bc701899ac719183c4e74231450" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "datafusion-common 54.0.0", @@ -2486,7 +2656,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", + "parquet 58.3.0", "tokio", ] @@ -2508,8 +2678,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "chrono", "dashmap", @@ -2531,8 +2701,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37a8643ab852eb68864e1b72ae789e8066282dce48eea6347ffb0aee33d1ccc0" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "async-trait", "dashmap", "datafusion-common 54.0.0", @@ -2553,7 +2723,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "chrono", "datafusion-common 53.1.0", @@ -2575,8 +2745,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6932f4d71eed9c8d9341476a2b845aadfabde5495d08dbcd8fc23881f49fa7a0" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "async-trait", "chrono", "datafusion-common 54.0.0", @@ -2598,7 +2768,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2611,7 +2781,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0225491839a31b1f7d2cb8092c2d50792e2fe1c1724e4e6d08e011f5feaf4ed2" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "indexmap 2.14.0", "itertools 0.14.0", @@ -2623,8 +2793,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2655,8 +2825,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14872c47bfc3d21e53ec82f57074e6987a15941c1e2f43cde4ac6ae2746634e3" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.3.0", + "arrow-buffer 58.3.0", "base64", "blake2", "blake3", @@ -2688,7 +2858,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2709,7 +2879,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75a2ca14e1b609be21e657e2d3130b2f446456b08393b377bb721a33952d2e09" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-execution 54.0.0", @@ -2731,7 +2901,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", "datafusion-physical-expr-common 53.1.0", @@ -2743,7 +2913,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ece74ba09092d2ef9c9b54a38445450aea292a1f8b04faf531936b723a24b3c" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", "datafusion-physical-expr-common 54.0.0", @@ -2755,8 +2925,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-execution 53.1.0", @@ -2780,8 +2950,8 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f3e3f9ee8ca59bf70518802107de6f1b88a9509efdc629fadc5de9d6b2d5ef5" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.3.0", + "arrow-ord 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-execution 54.0.0", @@ -2805,7 +2975,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 53.1.0", "datafusion-common 53.1.0", @@ -2821,7 +2991,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89161dffc22cf2b50f9f4b1bee83b5221d3b4ed7c2e37fd7aa2b22a5297b3a26" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "datafusion-catalog 54.0.0", "datafusion-common 54.0.0", @@ -2837,7 +3007,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-doc 53.1.0", "datafusion-expr 53.1.0", @@ -2855,7 +3025,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7339345b226b3874037708bf5023ba1c2de705128f8457a095aae5ae9cb9c78" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-doc 54.0.0", "datafusion-expr 54.0.0", @@ -2914,7 +3084,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", @@ -2933,7 +3103,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77f20e8cf9e8654d92f4c16b24c487353ee5bf153ffc12d5772cd399ab8cd281" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", @@ -2954,7 +3124,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-expr-common 53.1.0", @@ -2976,7 +3146,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f015a4a82f6f7ff7e1d8d4bf3870a936752fa38b17705dfcc14adef95aa8922c" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-expr-common 54.0.0", @@ -2998,7 +3168,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-expr 53.1.0", "datafusion-functions 53.1.0", @@ -3013,7 +3183,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51e6ffff8acdfe54e0ea15ccf38115c4a9184433b0439f42907637928d00a235" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-expr 54.0.0", "datafusion-functions 54.0.0", @@ -3029,7 +3199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash 0.8.12", - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 53.1.0", "datafusion-expr-common 53.1.0", @@ -3045,7 +3215,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7967a3e171c6a4bf09474b3f7a14f1a3db13ed1714ba12156f33fcce2bba54e8" dependencies = [ - "arrow", + "arrow 58.3.0", "chrono", "datafusion-common 54.0.0", "datafusion-expr-common 54.0.0", @@ -3062,7 +3232,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-execution 53.1.0", "datafusion-expr 53.1.0", @@ -3080,7 +3250,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59ff803e2a96054cb6d83f35f9e60fd4f42eac515e1932bd1b2dbc91d5fcbf36" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-execution 54.0.0", "datafusion-expr 54.0.0", @@ -3100,9 +3270,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash 0.8.12", - "arrow", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion-common 53.1.0", "datafusion-common-runtime 53.1.0", @@ -3131,11 +3301,11 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "776ee54d47d15bdb126452f9ca17b03761e3b004682914beaedd3f86eb507fbc" dependencies = [ - "arrow", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", + "arrow 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion-common 54.0.0", "datafusion-common-runtime 54.0.0", @@ -3164,7 +3334,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 53.1.0", "datafusion-datasource 53.1.0", "datafusion-expr-common 53.1.0", @@ -3181,7 +3351,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fb9e5774660aa69c3ba93c610f175f75b65cb8c3776edb3626de8f3a4f4ee3" dependencies = [ - "arrow", + "arrow 58.3.0", "datafusion-common 54.0.0", "datafusion-datasource 54.0.0", "datafusion-expr-common 54.0.0", @@ -3225,7 +3395,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "390bb0bf37cb2b95ffd65eacd66f60df50793d1f94097799e416f39477a51957" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "crc32fast", @@ -3255,7 +3425,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 53.1.0", @@ -3273,7 +3443,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6094ad36a3ed6d7ac87b20b479b2d0b118250f66cf997603829fdc65b44a7099" dependencies = [ - "arrow", + "arrow 58.3.0", "bigdecimal", "chrono", "datafusion-common 54.0.0", @@ -3292,7 +3462,7 @@ version = "54.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0c08025966108056d3547d879c4d39e246277494f59ca12920f78187d95eea1" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bigdecimal", "clap", @@ -3834,7 +4004,7 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcd0ce0249ac12fd44fcde62d435c36d881952c2f0df4d1de24b45e1dbba5ddb" dependencies = [ - "arrow-array", + "arrow-array 58.3.0", "rand 0.9.4", ] @@ -4059,14 +4229,27 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dafe7b7de3fab1a8b7099fd6a6434ca955fa65065f9c19f0f8a133693f3c2b0e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "geo-traits", "geoarrow-schema", "num-traits", "wkb", - "wkt", + "wkt 0.14.0", +] + +[[package]] +name = "geoarrow-cast" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41c308d653690a4e8ef3cbba69696056bd819e624766ece66d64cc26a638acc1" +dependencies = [ + "arrow-schema 58.3.0", + "geo-traits", + "geoarrow-array", + "geoarrow-schema", + "wkt 0.14.0", ] [[package]] @@ -4075,7 +4258,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d4a7edb2a1d87024a93805332a9c8184a0354836271d42c0d18cf628a5e3cd0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "geo-traits", "serde", "serde_json", @@ -4091,6 +4274,33 @@ dependencies = [ "libm", ] +[[package]] +name = "geojson" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e26f3c45b36fccc9cf2805e61d4da6bc4bbd5a3a9589b01afa3a40eff703bd79" +dependencies = [ + "log", + "serde", + "serde_json", + "thiserror 2.0.18", +] + +[[package]] +name = "geozero" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f28f34864745eb2f123c990c6ffd92c1584bd39439b3f27ff2a0f4ea5b309b" +dependencies = [ + "geo-types", + "geojson", + "log", + "scroll", + "serde_json", + "thiserror 1.0.69", + "wkt 0.11.1", +] + [[package]] name = "get_dir" version = "0.5.0" @@ -5015,16 +5225,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3944aca86f4c78f4da04af1c2bf33e664a2826b7af72972ad200d6b9de59019f" dependencies = [ "arc-swap", - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ipc", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "async_cell", @@ -5086,13 +5296,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "253f4a0a70580c985b91e65e9ca6cad644825a4078de28d8efbacf3ffbd7ecdc" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytes", "futures", "getrandom 0.2.17", @@ -5107,13 +5317,13 @@ name = "lance-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-cast", + "arrow-cast 58.3.0", "async-trait", "clap", "futures", "lance", "lance-encoding", - "parquet", + "parquet 58.3.0", "tempfile", "tokio", "tracing", @@ -5137,9 +5347,9 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13f84020da5a484e2f07dd1796e09785ed7cd889857ebc4cb77e32ef214ee594" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5174,13 +5384,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7460597a66534a75987993d4dac5bc330586d99c5b79ae73367dbcbd4e29e576" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "chrono", "datafusion 53.1.0", @@ -5206,10 +5416,10 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046f5506ed2271cd941a050de7bf535dd3aedc291aadec836a63fa56c5926e3b" dependencies = [ - "arrow", - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "chrono", "futures", "half", @@ -5226,13 +5436,13 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7af54edf43dcf9d6a56cc636eb35d457e68373c6448dca3f0891b3325b4a24e6" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "bytemuck", "byteorder", "bytes", @@ -5263,12 +5473,12 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0772ae2d6207995dc1eb28aff9507f78e90b3362b58f311da001e9dc25f3d736" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -5297,12 +5507,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e71fbfb51096a903cb524fe0da716f5f15fbc4a6b6f84cd6dec21abf319c5e84" dependencies = [ "arc-swap", - "arrow", - "arrow-arith", - "arrow-array", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-channel", "async-recursion", "async-trait", @@ -5361,14 +5571,14 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bab8c98ef1b870b20541d27f3ca4efdf7c9f5c25214233be07d231ba88900219" dependencies = [ - "arrow", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow 58.3.0", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-recursion", "async-trait", "byteorder", @@ -5401,9 +5611,9 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b4c51cad0ac780b02dc4da48528479e7693c03e8d05390510bbc69ca2a9a1f1" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "cc", "deepsize", "half", @@ -5419,7 +5629,7 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "014e8332ca0615506342e0d3af608639864b68396973be14239f09c9f21f1fc2" dependencies = [ - "arrow", + "arrow 58.3.0", "async-trait", "bytes", "lance-core", @@ -5447,11 +5657,11 @@ version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16f1355904aea4ebb04ffc70c58c97901e10bde44452b4b021de4a1f329250d" dependencies = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-ipc", - "arrow-schema", + "arrow 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "async-trait", "byteorder", "bytes", @@ -5794,6 +6004,15 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + [[package]] name = "lz4_flex" version = "0.13.1" @@ -6136,6 +6355,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -6170,6 +6403,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -6524,6 +6779,41 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "56.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3abbfef8a25900f4925c86e4cb881ea24672ca3c31ee4fb50a8083c4c56d313" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.1", + "arrow-buffer 56.2.1", + "arrow-cast 56.2.1", + "arrow-data 56.2.1", + "arrow-ipc 56.2.1", + "arrow-schema 56.2.1", + "arrow-select 56.2.1", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.11.6", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + [[package]] name = "parquet" version = "58.3.0" @@ -6531,12 +6821,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-data 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "base64", "brotli", "bytes", @@ -6545,7 +6835,7 @@ dependencies = [ "futures", "half", "hashbrown 0.17.1", - "lz4_flex", + "lz4_flex 0.13.1", "num-bigint", "num-integer", "num-traits", @@ -6566,8 +6856,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74c8db065291f088a2aad8ab831853eae1871c0d311c8d0b83bbc3b7e735d0fc" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6582,8 +6872,8 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a530e8d5b5e14efcb39c9a6ec55432ad11f6afb7dc4455a79be0dc615fe3cc31" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.3.0", + "arrow-schema 58.3.0", "chrono", "half", "indexmap 2.14.0", @@ -6599,7 +6889,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00ed89908289f67caa2ca078f9ff9aacd6229a313ec92b12bf4f48f613dc2b97" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "base64", "chrono", "parquet-variant", @@ -7990,6 +8280,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" +[[package]] +name = "scroll" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04c565b551bafbef4157586fa379538366e4385d42082f255bfd96e4fe8519da" + [[package]] name = "seahash" version = "4.1.0" @@ -8411,6 +8707,30 @@ dependencies = [ "smallvec", ] +[[package]] +name = "spatialbench" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07f3f4b67ccf571f183d3695aa6b9d6f996864c31782a480e97a23ed0f2f6f18" +dependencies = [ + "geo", + "once_cell", + "rand 0.8.6", + "serde", +] + +[[package]] +name = "spatialbench-arrow" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad89c32ed9e258bcc89713c296c7437963ce31f511eb8a408d2046e853294206" +dependencies = [ + "arrow 56.2.1", + "geo", + "geozero", + "spatialbench", +] + [[package]] name = "sqllogictest" version = "0.29.1" @@ -9146,7 +9466,7 @@ name = "tpchgen-arrow" version = "2.0.2" source = "git+https://github.com/clflushopt/tpchgen-rs.git?rev=438e9c2dbc25b2fff82c0efc08b3f13b5707874f#438e9c2dbc25b2fff82c0efc08b3f13b5707874f" dependencies = [ - "arrow", + "arrow 58.3.0", "tpchgen", ] @@ -9427,12 +9747,12 @@ name = "vortex" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", + "arrow-array 58.3.0", "codspeed-divan-compat", "fastlanes", "futures", "mimalloc", - "parquet", + "parquet 58.3.0", "paste", "rand 0.10.1", "rand_distr 0.6.0", @@ -9498,15 +9818,15 @@ dependencies = [ "arbitrary", "arc-swap", "arcref", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-data 58.3.0", + "arrow-ord 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", + "arrow-string 58.3.0", "async-lock", "bytes", "cfg-if", @@ -9571,13 +9891,15 @@ name = "vortex-bench" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", - "arrow-select", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", + "arrow-select 58.3.0", "async-trait", "bzip2", "clap", "futures", + "geoarrow", + "geoarrow-cast", "get_dir", "glob", "humansize", @@ -9588,12 +9910,15 @@ dependencies = [ "noodles-bgzf", "noodles-vcf", "parking_lot", - "parquet", + "parquet 56.2.1", + "parquet 58.3.0", "rand 0.10.1", "regex", "reqwest 0.13.4", "serde", "serde_json", + "spatialbench", + "spatialbench-arrow", "sysinfo", "tabled", "target-lexicon", @@ -9608,6 +9933,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-geo", "vortex-tensor", ] @@ -9646,7 +9972,7 @@ dependencies = [ name = "vortex-buffer" version = "0.1.0" dependencies = [ - "arrow-buffer", + "arrow-buffer 58.3.0", "bitvec", "bytes", "codspeed-divan-compat", @@ -9678,13 +10004,13 @@ dependencies = [ name = "vortex-compat" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-select", + "arrow-array 58.3.0", + "arrow-select 58.3.0", "base16ct", "bytes", "clap", "futures", - "parquet", + "parquet 58.3.0", "reqwest 0.13.4", "serde", "serde_json", @@ -9725,11 +10051,11 @@ dependencies = [ name = "vortex-compute" version = "0.1.0" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-schema", + "arrow-arith 58.3.0", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-cast 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "num-traits", "rand 0.10.1", @@ -9752,7 +10078,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-schema", + "arrow-schema 58.3.0", "async-trait", "bindgen", "bytes", @@ -9783,7 +10109,7 @@ dependencies = [ name = "vortex-cuda-ffi" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-array", @@ -9805,8 +10131,8 @@ name = "vortex-cxx" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "cxx", "futures", @@ -9820,8 +10146,8 @@ name = "vortex-datafusion" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-trait", "datafusion 54.0.0", "datafusion-catalog 54.0.0", @@ -9920,7 +10246,7 @@ dependencies = [ name = "vortex-error" version = "0.1.0" dependencies = [ - "arrow-schema", + "arrow-schema 58.3.0", "flatbuffers", "jiff", "object_store", @@ -9955,8 +10281,8 @@ dependencies = [ name = "vortex-ffi" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "cbindgen", @@ -10076,12 +10402,13 @@ dependencies = [ name = "vortex-geo" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "geo", "geo-traits", "geo-types", "geoarrow", + "geoarrow-cast", "prost 0.14.4", "rstest", "vortex-array", @@ -10143,8 +10470,8 @@ dependencies = [ name = "vortex-jni" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-fs", "futures", "jni", @@ -10163,8 +10490,8 @@ dependencies = [ name = "vortex-json" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "vortex-array", "vortex-error", "vortex-session", @@ -10175,8 +10502,8 @@ name = "vortex-layout" version = "0.1.0" dependencies = [ "arcref", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "async-stream", "async-trait", "bit-vec", @@ -10266,9 +10593,9 @@ dependencies = [ name = "vortex-parquet-variant" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-buffer 58.3.0", + "arrow-schema 58.3.0", "chrono", "parquet-variant", "parquet-variant-compute", @@ -10312,9 +10639,9 @@ dependencies = [ name = "vortex-python" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-data 58.3.0", + "arrow-schema 58.3.0", "async-fs", "bytes", "itertools 0.14.0", @@ -10336,9 +10663,9 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-row", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-row 58.3.0", + "arrow-schema 58.3.0", "bytes", "codspeed-divan-compat", "mimalloc", @@ -10357,8 +10684,8 @@ name = "vortex-runend" version = "0.1.0" dependencies = [ "arbitrary", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "itertools 0.14.0", "num-traits", @@ -10456,8 +10783,8 @@ dependencies = [ name = "vortex-tensor" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "codspeed-divan-compat", "half", "itertools 0.14.0", @@ -10480,8 +10807,8 @@ dependencies = [ name = "vortex-test-e2e-cuda" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "futures", "vortex", "vortex-cuda", @@ -10492,8 +10819,8 @@ name = "vortex-tui" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-schema 58.3.0", "clap", "console_error_panic_hook", "crossterm", @@ -10506,7 +10833,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "js-sys", - "parquet", + "parquet 58.3.0", "ratatui", "ratzilla", "serde", @@ -10533,9 +10860,9 @@ dependencies = [ name = "vortex-web-wasm" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 58.3.0", + "arrow-ipc 58.3.0", + "arrow-schema 58.3.0", "console_error_panic_hook", "futures", "js-sys", @@ -11079,6 +11406,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "wkt" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54f7f1ff4ea4c18936d6cd26a6fd24f0003af37e951a8e0e8b9e9a2d0bd0a46d" +dependencies = [ + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "wkt" version = "0.14.0" diff --git a/Cargo.toml b/Cargo.toml index 4d04d36403e..e690aa88cf6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -162,6 +162,7 @@ geo = "0.31.0" geo-traits = "0.3.0" geo-types = "0.7.19" geoarrow = "0.8.0" +geoarrow-cast = "0.8.0" get_dir = "0.5.0" glob = "0.3.2" goldenfile = "1" @@ -241,6 +242,14 @@ similar = "3.0.0" sketches-ddsketch = "0.4.0" smallvec = "1.15.1" smol = "2.0.2" +spatialbench = "0.2" +spatialbench-arrow = "0.2" +# spatialbench still pins arrow 56, two majors behind the workspace arrow. Until upstream +# catches up, write its generated batches with a matching parquet instead of converting +# arrow versions at the boundary. +spatialbench-parquet = { package = "parquet", version = "56", features = [ + "async", +] } static_assertions = "1.1" strum = "0.28" syn = { version = "2.0.117", features = ["full"] } diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs index fed9f82b004..74f7a0cfa64 100644 --- a/benchmarks/duckdb-bench/src/lib.rs +++ b/benchmarks/duckdb-bench/src/lib.rs @@ -27,6 +27,9 @@ pub struct DuckClient { connection: Option, pub db_path: PathBuf, pub threads: Option, + /// Replayed on every (re)open, since extensions load per instance. Currently + /// `INSTALL spatial; LOAD spatial;` for SpatialBench. + init_sql: Vec, } impl DuckClient { @@ -68,9 +71,19 @@ impl DuckClient { connection: Some(connection), db_path, threads, + init_sql: Vec::new(), }) } + /// Run `statements` now and after every subsequent [`DuckClient::reopen`]. + pub fn set_init_sql(&mut self, statements: Vec) -> Result<()> { + for stmt in &statements { + self.connection().query(stmt)?; + } + self.init_sql = statements; + Ok(()) + } + pub fn open_and_setup_database( path: Option, threads: Option, @@ -118,6 +131,14 @@ impl DuckClient { self.db = Some(db); self.connection = Some(connection); + // Replay init SQL (e.g. LOAD spatial) — extensions are per-instance. + for stmt in &self.init_sql { + self.connection + .as_ref() + .vortex_expect("connection just opened") + .query(stmt)?; + } + Ok(()) } @@ -133,6 +154,7 @@ impl DuckClient { connection: Some(connection), db_path, threads: None, + init_sql: Vec::new(), }) } diff --git a/benchmarks/duckdb-bench/src/main.rs b/benchmarks/duckdb-bench/src/main.rs index 8ba4937f566..cf4fa071067 100644 --- a/benchmarks/duckdb-bench/src/main.rs +++ b/benchmarks/duckdb-bench/src/main.rs @@ -171,12 +171,13 @@ fn main() -> anyhow::Result<()> { &filtered_queries, mode, |format| { - let ctx = DuckClient::new( + let mut ctx = DuckClient::new( &*benchmark, format, args.delete_duckdb_database, args.threads, )?; + ctx.set_init_sql(benchmark.engine_init_sql(Engine::DuckDB))?; ctx.register_tables(&*benchmark, format)?; // Duckdb doesn't support octet_length for strings but we need this diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index 3b793c6124a..0187bdb986e 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -23,6 +23,7 @@ vortex = { workspace = true, features = [ "tokio", "zstd", ] } +vortex-geo = { workspace = true } vortex-tensor = { workspace = true } # TODO(connor): In the future, this might be inside vortex. anyhow = { workspace = true } @@ -33,6 +34,8 @@ async-trait = { workspace = true } bzip2 = { workspace = true } clap = { workspace = true, features = ["derive"] } futures = { workspace = true } +geoarrow = { workspace = true } +geoarrow-cast = { workspace = true } get_dir = { workspace = true } glob = { workspace = true } humansize = { workspace = true } @@ -48,6 +51,9 @@ regex = { workspace = true } reqwest = { workspace = true, features = ["stream"] } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } +spatialbench = { workspace = true } +spatialbench-arrow = { workspace = true } +spatialbench-parquet = { workspace = true } sysinfo = { workspace = true } tabled = { workspace = true, features = ["std"] } target-lexicon = { workspace = true } diff --git a/vortex-bench/spatialbench.sql b/vortex-bench/spatialbench.sql new file mode 100644 index 00000000000..a55ba851cd6 --- /dev/null +++ b/vortex-bench/spatialbench.sql @@ -0,0 +1,228 @@ +-- SpatialBench queries (Apache Sedona), WKB dialect. See sedona-spatialbench/docs/queries.md. +-- Numbered from Q0 (= SpatialBench Q1). Only Q0 is wired up today, the rest are not run yet. + +-- Q0: Find trips starting within 50km of the Sedona city center, ranked by distance. +SELECT + t_tripkey, + ST_X(ST_GeomFromWKB(t_pickuploc)) AS pickup_lon, + ST_Y(ST_GeomFromWKB(t_pickuploc)) AS pickup_lat, + t_pickuptime, + ST_Distance(ST_GeomFromWKB(t_pickuploc), ST_Point(-111.7610::double, 34.8697::double)) AS distance_to_center +FROM trip +WHERE ST_Distance(ST_GeomFromWKB(t_pickuploc), ST_Point(-111.7610::double, 34.8697::double)) <= 0.45::double +ORDER BY distance_to_center ASC, t_tripkey ASC; + +-- Q1: Count trips starting within Coconino County (Arizona) zone. +SELECT COUNT(*) AS trip_count_in_coconino_county +FROM trip t +WHERE ST_Intersects( + ST_GeomFromWKB(t.t_pickuploc), + ( + SELECT ST_GeomFromWKB(z.z_boundary) + FROM zone z + WHERE z.z_name = 'Coconino County' + LIMIT 1 + ) +); + +-- Q2: Monthly trip statistics within a 15km radius of the Sedona city center. +SELECT + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + COUNT(t.t_tripkey) AS total_trips, + AVG(t.t_distance) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_fare) AS avg_fare +FROM trip t +WHERE ST_DWithin( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromText('POLYGON(( + -111.9060 34.7347, -111.6160 34.7347, + -111.6160 35.0047, -111.9060 35.0047, + -111.9060 34.7347 + ))'), -- Bounding box around Sedona + 0.045 -- Additional 5km buffer in degrees +) +GROUP BY pickup_month +ORDER BY pickup_month; + +-- Q3: Zone distribution of top 1000 trips by tip amount. +SELECT + z.z_zonekey, + z.z_name, + COUNT(*) AS trip_count +FROM + zone z + JOIN ( + SELECT t.t_pickuploc + FROM trip t + ORDER BY t.t_tip DESC, t.t_tripkey ASC + LIMIT 1000 + ) top_trips + ON ST_Within( + ST_GeomFromWKB(top_trips.t_pickuploc), + ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY trip_count DESC, z.z_zonekey ASC; + +-- Q4: Monthly travel patterns for repeat customers (convex hull of dropoff locations). +SELECT + c.c_custkey, + c.c_name AS customer_name, + DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, + ST_Area( + ST_ConvexHull(ST_Collect(ST_GeomFromWKB(t.t_dropoffloc))) + ) AS monthly_travel_hull_area, + COUNT(*) as dropoff_count +FROM trip t +JOIN customer c + ON t.t_custkey = c.c_custkey +GROUP BY c.c_custkey, c.c_name, pickup_month +HAVING dropoff_count > 5 -- Only include repeat customers +ORDER BY monthly_travel_hull_area DESC, c.c_custkey ASC; + +-- Q5: Zone statistics for trips within a 50km radius of the Sedona city center. +SELECT + z.z_zonekey, + z.z_name, + COUNT(t.t_tripkey) AS total_pickups, + AVG(t.t_distance) AS avg_distance, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration +FROM trip t, zone z +WHERE ST_Intersects( + ST_GeomFromText('POLYGON(( + -112.2110 34.4197, -111.3110 34.4197, + -111.3110 35.3197, -112.2110 35.3197, + -112.2110 34.4197 + ))'), -- Bounding box around Sedona + ST_GeomFromWKB(z.z_boundary) + ) + AND ST_Within( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY total_pickups DESC, z.z_zonekey ASC; + +-- Q6: Detect potential route detours by comparing reported vs. geometric distances. +WITH trip_lengths AS ( + SELECT + t.t_tripkey, + t.t_distance AS reported_distance_m, + ST_Length( + ST_MakeLine( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(t.t_dropoffloc) + ) + ) * 111111 AS line_distance_m -- Approx. meters per degree + FROM trip t +) +SELECT + t.t_tripkey, + t.reported_distance_m, + t.line_distance_m, + t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio +FROM trip_lengths t +ORDER BY + detour_ratio DESC NULLS LAST, + reported_distance_m DESC, + t_tripkey ASC; + +-- Q7: Count nearby pickups for each building within a 500m radius. +SELECT b.b_buildingkey, b.b_name, COUNT(*) AS nearby_pickup_count +FROM trip t +JOIN building b +ON ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary), 0.0045) -- ~500m +GROUP BY b.b_buildingkey, b.b_name +ORDER BY nearby_pickup_count DESC, b.b_buildingkey ASC; + +-- Q8: Building conflation (duplicate/overlap detection via IoU). +WITH b1 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom + FROM building +), +b2 AS ( + SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom + FROM building +), +pairs AS ( + SELECT + b1.id AS building_1, + b2.id AS building_2, + ST_Area(b1.geom) AS area1, + ST_Area(b2.geom) AS area2, + ST_Area(ST_Intersection(b1.geom, b2.geom)) AS overlap_area + FROM b1 + JOIN b2 ON b1.id < b2.id AND ST_Intersects(b1.geom, b2.geom) +) +SELECT + building_1, + building_2, + area1, + area2, + overlap_area, + CASE + WHEN (area1 + area2 - overlap_area) = 0 THEN 1.0 + ELSE overlap_area / (area1 + area2 - overlap_area) + END AS iou +FROM pairs +ORDER BY iou DESC, building_1 ASC, building_2 ASC; + +-- Q9: Zone statistics for trips starting within each zone. +SELECT + z.z_zonekey, + z.z_name AS pickup_zone, + AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration, + AVG(t.t_distance) AS avg_distance, + COUNT(t.t_tripkey) AS num_trips +FROM + zone z + LEFT JOIN trip t + ON ST_Within( + ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary) + ) +GROUP BY z.z_zonekey, z.z_name +ORDER BY avg_duration DESC NULLS LAST, z.z_zonekey ASC; + +-- Q10: Count trips that cross between different zones. +SELECT COUNT(*) AS cross_zone_trip_count +FROM + trip t + JOIN zone pickup_zone + ON ST_Within( + ST_GeomFromWKB(t.t_pickuploc), + ST_GeomFromWKB(pickup_zone.z_boundary) + ) + JOIN zone dropoff_zone + ON ST_Within( + ST_GeomFromWKB(t.t_dropoffloc), + ST_GeomFromWKB(dropoff_zone.z_boundary) + ) +WHERE pickup_zone.z_zonekey != dropoff_zone.z_zonekey; + +-- Q11: Find five nearest buildings to each trip pickup location using KNN join. +WITH trip_with_geom AS ( + SELECT + t_tripkey, + t_pickuploc, + ST_GeomFromWKB(t_pickuploc) as pickup_geom + FROM trip +), +building_with_geom AS ( + SELECT + b_buildingkey, + b_name, + b_boundary, + ST_GeomFromWKB(b_boundary) as boundary_geom + FROM building +) +SELECT + t.t_tripkey, + t.t_pickuploc, + b.b_buildingkey, + b.b_name AS building_name, + ST_Distance(t.pickup_geom, b.boundary_geom) AS distance_to_building +FROM trip_with_geom t +JOIN building_with_geom b + ON ST_KNN(t.pickup_geom, b.boundary_geom, 5, FALSE) +ORDER BY t.t_tripkey ASC, distance_to_building ASC, b.b_buildingkey ASC; diff --git a/vortex-bench/src/benchmark.rs b/vortex-bench/src/benchmark.rs index 2872a02aa64..a7a76585b70 100644 --- a/vortex-bench/src/benchmark.rs +++ b/vortex-bench/src/benchmark.rs @@ -8,6 +8,7 @@ use glob::Pattern; use url::Url; use crate::BenchmarkDataset; +use crate::Engine; use crate::Format; /// Specification for a table in a benchmark dataset. @@ -32,6 +33,13 @@ pub trait Benchmark: Send + Sync { /// Get all available queries for this benchmark fn queries(&self) -> anyhow::Result>; + /// SQL an `engine` must run before this benchmark's queries (e.g. loading engine + /// extensions). Runners replay these after every (re)open. Default: none. + fn engine_init_sql(&self, engine: Engine) -> Vec { + let _ = engine; + Vec::new() + } + /// Generate or prepare base data for the benchmark (typically Parquet format). /// This is the canonical source data that can be converted to other formats. /// This should be idempotent - safe to call multiple times. diff --git a/vortex-bench/src/datasets/mod.rs b/vortex-bench/src/datasets/mod.rs index 3e72ba69e7f..caaaf88f749 100644 --- a/vortex-bench/src/datasets/mod.rs +++ b/vortex-bench/src/datasets/mod.rs @@ -69,6 +69,11 @@ pub enum BenchmarkDataset { ClickBench { flavor: Flavor }, #[serde(rename = "public-bi")] PublicBi { name: String }, + #[serde(rename = "spatialbench")] + SpatialBench { + scale_factor: String, + native_points: bool, + }, #[serde(rename = "statpopgen")] StatPopGen { n_rows: u64 }, #[serde(rename = "polarsignals")] @@ -87,6 +92,7 @@ impl BenchmarkDataset { BenchmarkDataset::TpcDS { .. } => "tpcds", BenchmarkDataset::ClickBench { .. } => "clickbench", BenchmarkDataset::PublicBi { .. } => "public-bi", + BenchmarkDataset::SpatialBench { .. } => "spatialbench", BenchmarkDataset::StatPopGen { .. } => "statpopgen", BenchmarkDataset::PolarSignals { .. } => "polarsignals", BenchmarkDataset::Fineweb => "fineweb", @@ -106,6 +112,17 @@ impl Display for BenchmarkDataset { Flavor::Single => write!(f, "clickbench-single"), }, BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"), + BenchmarkDataset::SpatialBench { + scale_factor, + native_points, + } => { + let points = if *native_points { + ", points=native" + } else { + "" + }; + write!(f, "spatialbench(sf={scale_factor}{points})") + } BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"), BenchmarkDataset::PolarSignals { n_rows } => { write!(f, "polarsignals(n_rows={n_rows})") @@ -163,6 +180,7 @@ impl BenchmarkDataset { "supplier", ], BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::PublicBi { .. } => todo!(), + BenchmarkDataset::SpatialBench { .. } => &["trip", "building"], BenchmarkDataset::StatPopGen { .. } => &["statpopgen"], BenchmarkDataset::PolarSignals { .. } => &["stacktraces"], BenchmarkDataset::Fineweb => &["fineweb"], diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 30ff45c97a8..b131906d85f 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -34,6 +34,8 @@ use vortex::file::VortexWriteOptions; use vortex::file::WriteStrategyBuilder; use vortex::utils::aliases::hash_map::HashMap; +use crate::spatialbench::SpatialBenchBenchmark; + pub mod appian; pub mod benchmark; pub mod clickbench; @@ -51,6 +53,7 @@ pub mod public_bi; pub mod random_access; pub mod realnest; pub mod runner; +pub mod spatialbench; pub mod statpopgen; pub mod tpcds; pub mod tpch; @@ -72,8 +75,11 @@ use vortex::session::VortexSession; #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; -pub static SESSION: LazyLock = - LazyLock::new(|| VortexSession::default().with_tokio()); +pub static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::default().with_tokio(); + vortex_geo::initialize(&session); + session +}); #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)] pub struct Target { @@ -265,6 +271,8 @@ pub enum BenchmarkArg { PolarSignals, #[clap(name = "public-bi")] PublicBi, + #[clap(name = "spatialbench")] + SpatialBench, } /// Default scale factor for TPC-related benchmarks @@ -326,6 +334,21 @@ pub fn create_benchmark(b: BenchmarkArg, opts: &Opts) -> anyhow::Result { + let scale_factor = opts.get(SCALE_FACTOR_KEY).unwrap_or(DEFAULT_SCALE_FACTOR); + let remote_data_dir = opts.get_as::(REMOTE_DATA_KEY); + let native_points = match opts.get("points") { + None | Some("wkb") => false, + Some("native") => true, + Some(other) => bail!("unknown points option {other:?}, expected wkb or native"), + }; + let benchmark = SpatialBenchBenchmark::new( + scale_factor.to_string(), + remote_data_dir, + native_points, + )?; + Ok(Box::new(benchmark) as _) + } } } diff --git a/vortex-bench/src/spatialbench/benchmark.rs b/vortex-bench/src/spatialbench/benchmark.rs new file mode 100644 index 00000000000..b1c76bf9705 --- /dev/null +++ b/vortex-bench/src/spatialbench/benchmark.rs @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench benchmark implementation + +use std::fs; + +use url::Url; + +use crate::Benchmark; +use crate::BenchmarkDataset; +use crate::Engine; +use crate::Format; +use crate::TableSpec; +use crate::spatialbench::datagen; +use crate::spatialbench::datagen::Table; +use crate::utils::file::resolve_data_url; +use crate::workspace_root; + +/// Data-dir subfolder for the native-geometry Vortex files (`points=native`). +pub const NATIVE_DIR: &str = "vortex-native"; + +/// Data-dir subfolder for the native-geometry GeoParquet files (`points=native`). +pub const PARQUET_NATIVE_DIR: &str = "parquet-native"; + +/// Queries wired up to run (0-based, `spatialbench.sql` order): Q0 (radius filter) and Q7 (building +/// join). The file holds the full suite; the rest need tables/functions not wired yet. +const SUPPORTED_QUERIES: &[usize] = &[0, 7]; + +/// SpatialBench geospatial benchmark (Apache Sedona): a `trip` point table and `building` polygons, +/// queried with spatial filters and joins. See . +pub struct SpatialBenchBenchmark { + pub scale_factor: String, + pub data_url: Url, + /// `--opt points=native`: store geometry as native `Point`/`Polygon` (not WKB), read the native + /// dirs, and strip `ST_GeomFromWKB` from the queries. + pub native_points: bool, +} + +impl SpatialBenchBenchmark { + pub fn new( + scale_factor: String, + use_remote_data_dir: Option, + native_points: bool, + ) -> anyhow::Result { + Ok(Self { + data_url: resolve_data_url( + use_remote_data_dir.as_deref(), + &format!("spatialbench/{scale_factor}"), + )?, + scale_factor, + native_points, + }) + } +} + +#[async_trait::async_trait] +impl Benchmark for SpatialBenchBenchmark { + fn queries(&self) -> anyhow::Result> { + // The file is the WKB dialect; `points=native` strips `ST_GeomFromWKB(..)`. Statements are + // `;`-separated, numbered 0-based in file order; only `SUPPORTED_QUERIES` run. + let queries_file = workspace_root() + .join("vortex-bench") + .join("spatialbench") + .with_extension("sql"); + let contents = fs::read_to_string(queries_file)?; + let contents = if self.native_points { + strip_wkb_wrappers(&contents) + } else { + contents + }; + Ok(contents + .split_terminator(';') + .map(str::trim) + .map(str::to_string) + .enumerate() + .filter(|(idx, _)| SUPPORTED_QUERIES.contains(idx)) + .collect()) + } + + async fn generate_base_data(&self) -> anyhow::Result<()> { + if self.data_url.scheme() != "file" { + return Ok(()); + } + + let base_data_dir = self + .data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", self.data_url.as_str()))?; + + datagen::generate_tables(&self.scale_factor, base_data_dir.clone()).await?; + + if self.native_points { + let parquet_dir = base_data_dir.join(Format::Parquet.name()); + let native_dir = base_data_dir.join(NATIVE_DIR); + let parquet_native_dir = base_data_dir.join(PARQUET_NATIVE_DIR); + // Natively encode every table with geometry columns (trip Point, building Polygon). + for table in [Table::Trip, Table::Building] { + datagen::write_native_vortex(table, &parquet_dir, &native_dir).await?; + datagen::write_native_parquet(table, &parquet_dir, &parquet_native_dir).await?; + } + } + Ok(()) + } + + fn format_path(&self, format: Format, base_url: &Url) -> anyhow::Result { + if self.native_points { + // points=native reads the native-geometry dirs (Vortex / GeoParquet); other formats + // would feed WKB to the stripped SQL, so bail. + let dir = match format { + Format::OnDiskVortex => NATIVE_DIR, + Format::Parquet => PARQUET_NATIVE_DIR, + other => anyhow::bail!( + "points=native only supports the vortex and parquet formats, got {other}" + ), + }; + return Ok(base_url.join(&format!("{dir}/"))?); + } + Ok(base_url.join(&format!("{}/", format.name()))?) + } + + fn expected_row_counts(&self) -> Option> { + // Q0 result count by scale factor (index 0), cross-checked against a brute-force WKB decode. + match self.scale_factor.as_str() { + "0.1" => Some(vec![6]), + "1.0" => Some(vec![94]), + "3.0" => Some(vec![267]), + _ => None, + } + } + + fn dataset(&self) -> BenchmarkDataset { + BenchmarkDataset::SpatialBench { + scale_factor: self.scale_factor.clone(), + native_points: self.native_points, + } + } + + fn dataset_name(&self) -> &str { + "spatialbench" + } + + fn dataset_display(&self) -> String { + format!("spatialbench(sf={})", self.scale_factor) + } + + fn data_url(&self) -> &Url { + &self.data_url + } + + fn table_specs(&self) -> Vec { + vec![ + TableSpec::new("trip", None), + TableSpec::new("building", None), + ] + } + + /// Scope each table to its own `{table}_*.{ext}` files; the default globs every file in the + /// format dir, conflating the `trip` and `building` schemas. + fn pattern(&self, table_name: &str, format: Format) -> Option { + Some( + format!("{}_*.{}", table_name, format.ext()) + .parse() + .expect("valid glob pattern"), + ) + } + + /// DuckDB needs the `spatial` extension for `ST_*`; the runner replays it on each (re)open. + /// First INSTALL needs network. + fn engine_init_sql(&self, engine: Engine) -> Vec { + match engine { + Engine::DuckDB => vec!["INSTALL spatial;".to_string(), "LOAD spatial;".to_string()], + _ => Vec::new(), + } + } +} + +/// Drop each `ST_GeomFromWKB(col)` wrapper down to `col`: native columns are already geometries. +fn strip_wkb_wrappers(sql: &str) -> String { + const OPEN: &str = "ST_GeomFromWKB("; + let mut out = String::with_capacity(sql.len()); + let mut rest = sql; + while let Some(pos) = rest.find(OPEN) { + out.push_str(&rest[..pos]); + let after = &rest[pos + OPEN.len()..]; + match after.find(')') { + Some(close) => { + out.push_str(&after[..close]); + rest = &after[close + 1..]; + } + // Unbalanced wrapper: emit it verbatim and stop rewriting. + None => { + out.push_str(OPEN); + rest = after; + } + } + } + out.push_str(rest); + out +} diff --git a/vortex-bench/src/spatialbench/datagen/mod.rs b/vortex-bench/src/spatialbench/datagen/mod.rs new file mode 100644 index 00000000000..671b87663a4 --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/mod.rs @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench data preparation. [`wkb`] generates the canonical WKB base tables; [`native`] +//! derives the native-Point encodings from them for `points=native`. The [`table`] catalog is the +//! single source of truth for the base tables both stages share. + +pub mod native; +pub mod table; +pub mod wkb; + +pub use native::write_native_parquet; +pub use native::write_native_vortex; +pub use table::Table; +pub use wkb::generate_tables; diff --git a/vortex-bench/src/spatialbench/datagen/native.rs b/vortex-bench/src/spatialbench/datagen/native.rs new file mode 100644 index 00000000000..975caf1712b --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/native.rs @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Native-geometry preparation for `points=native`: decode each table's WKB geometry columns to +//! native GeoArrow types in Arrow land (`geoarrow_cast`, so Vortex never decodes WKB), then write +//! them as a native Vortex file and a GeoParquet file. The decode is a one-time data-prep cost. + +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use arrow_array::RecordBatch; +use arrow_schema::Schema; +use futures::TryStreamExt; +use geoarrow::array::GenericWkbArray; +use geoarrow::array::GeoArrowArray; +use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::Crs; +use geoarrow::datatypes::Dimension; +use geoarrow::datatypes::GeoArrowType; +use geoarrow::datatypes::Metadata; +use geoarrow::datatypes::PointType; +use geoarrow::datatypes::PolygonType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; +use parquet::arrow::AsyncArrowWriter; +use parquet::arrow::ParquetRecordBatchStreamBuilder; +use parquet::arrow::ProjectionMask; +use parquet::basic::Compression; +use parquet::file::properties::WriterProperties; +use tokio::fs::File as TokioFile; +use vortex::array::ArrayRef; +use vortex::array::IntoArray; +use vortex::array::arrays::ChunkedArray; +use vortex::array::arrow::ArrowSessionExt; +use vortex::file::WriteOptionsSessionExt; + +use super::table::GeometryKind; +use super::table::Table; +use crate::SESSION; +use crate::utils::file::idempotent_async; + +/// EPSG:4326, the CRS the benchmark data and queries assume. +fn epsg_4326() -> Arc { + Arc::new(Metadata::new( + Crs::from_unknown_crs_type("EPSG:4326".to_string()), + None, + )) +} + +/// The native GeoArrow type for `kind`, separated-XY in EPSG:4326. +fn geoarrow_type(kind: GeometryKind) -> GeoArrowType { + match kind { + GeometryKind::Point => GeoArrowType::Point( + PointType::new(Dimension::XY, epsg_4326()).with_coord_type(CoordType::Separated), + ), + GeometryKind::Polygon => GeoArrowType::Polygon( + PolygonType::new(Dimension::XY, epsg_4326()).with_coord_type(CoordType::Separated), + ), + } +} + +/// Write `{native_dir}/{table}_0.vortex` with native geometry columns from the WKB parquet. Idempotent. +pub async fn write_native_vortex( + table: Table, + parquet_dir: &Path, + native_dir: &Path, +) -> anyhow::Result { + idempotent_async( + native_dir.join(format!("{}_0.vortex", table.name())), + |path| async move { + let chunks = map_source_batches(parquet_dir, table, |b| native_chunk(b, table)).await?; + + let dtype = chunks[0].dtype().clone(); + let chunked = ChunkedArray::try_new(chunks, dtype)?.into_array(); + let mut file = TokioFile::create(&path).await?; + SESSION + .write_options() + .write(&mut file, chunked.to_array_stream()) + .await?; + tracing::info!(path = %path.display(), table = table.name(), "wrote native geometry table"); + Ok(()) + }, + ) + .await +} + +/// Write `{out_dir}/{table}_0.parquet` with native GeoArrow geometry columns (separated XY, +/// `geoarrow.*` field metadata). Idempotent. +pub async fn write_native_parquet( + table: Table, + parquet_dir: &Path, + out_dir: &Path, +) -> anyhow::Result { + idempotent_async( + out_dir.join(format!("{}_0.parquet", table.name())), + |path| async move { + let batches = + map_source_batches(parquet_dir, table, |b| native_record_batch(b, table)).await?; + + let schema = batches.first().context("no batches to write")?.schema(); + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = + AsyncArrowWriter::try_new(TokioFile::create(&path).await?, schema, Some(props))?; + for batch in &batches { + writer.write(batch).await?; + } + writer.close().await?; + tracing::info!(path = %path.display(), table = table.name(), "wrote native geometry parquet table"); + Ok(()) + }, + ) + .await +} + +/// Apply `f` to every batch read from `table`'s base WKB parquet parts, projected to its columns. +async fn map_source_batches( + parquet_dir: &Path, + table: Table, + mut f: impl FnMut(RecordBatch) -> anyhow::Result, +) -> anyhow::Result> { + let pattern = parquet_dir.join(format!("{}_*.parquet", table.name())); + let mut files: Vec = + glob::glob(&pattern.to_string_lossy())?.collect::>()?; + files.sort(); + anyhow::ensure!(!files.is_empty(), "no parquet matching {pattern:?}"); + + let mut out = Vec::new(); + for file in files { + let builder = ParquetRecordBatchStreamBuilder::new(TokioFile::open(&file).await?).await?; + let mask = + ProjectionMask::columns(builder.parquet_schema(), table.columns().iter().copied()); + let mut stream = builder.with_projection(mask).build()?; + while let Some(batch) = stream.try_next().await? { + out.push(f(batch)?); + } + } + Ok(out) +} + +/// Decode each of `table`'s geometry columns from WKB to its native GeoArrow type, swapping the +/// column in so the field carries the matching `geoarrow.*` extension metadata. +fn native_record_batch(batch: RecordBatch, table: Table) -> anyhow::Result { + let schema = batch.schema(); + let mut fields = schema.fields().to_vec(); + let mut columns = batch.columns().to_vec(); + + for geom in table.geometry_columns() { + let idx = schema.index_of(geom.name)?; + let geo_type = geoarrow_type(geom.kind); + let wkb = GenericWkbArray::::try_from(( + batch.column(idx).as_ref(), + WkbType::new(epsg_4326()), + )) + .map_err(|e| anyhow::anyhow!("wrapping WKB column {}: {e}", geom.name))?; + columns[idx] = cast(&wkb, &geo_type) + .map_err(|e| anyhow::anyhow!("parsing WKB column {}: {e}", geom.name))? + .to_array_ref(); + fields[idx] = Arc::new(geo_type.to_field(geom.name, false)); + } + + Ok(RecordBatch::try_new( + Arc::new(Schema::new(fields)), + columns, + )?) +} + +/// Convert a WKB batch to a Vortex struct chunk with `table`'s geometry columns as native types. +fn native_chunk(batch: RecordBatch, table: Table) -> anyhow::Result { + let native_batch = native_record_batch(batch, table)?; + let native_schema = native_batch.schema(); + SESSION + .arrow() + .from_arrow_record_batch(native_batch, &native_schema) + .context("importing native batch") +} diff --git a/vortex-bench/src/spatialbench/datagen/table.rs b/vortex-bench/src/spatialbench/datagen/table.rs new file mode 100644 index 00000000000..bce58e98ad6 --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/table.rs @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The shared SpatialBench table catalog: one source of truth for the base tables, used by both +//! the WKB generation ([`super::wkb`]) and the native geometry conversion ([`super::native`]). + +/// A SpatialBench base table. +#[derive(Clone, Copy)] +pub enum Table { + Trip, + Building, +} + +/// Every base table. WKB generation emits all of them; native conversion handles those with +/// geometry columns. +pub(crate) const TABLES: &[Table] = &[Table::Trip, Table::Building]; + +/// A geometry column and the geometry type its WKB bytes decode to. +pub(crate) struct GeometryColumn { + pub(crate) name: &'static str, + pub(crate) kind: GeometryKind, +} + +/// Geometry types a column can hold. Add a variant (and the matching arm in [`super::native`]) as +/// tables with new geometry types are wired. +#[derive(Clone, Copy, Debug)] +pub(crate) enum GeometryKind { + Point, + Polygon, +} + +impl Table { + /// File stem under a format directory, e.g. `Trip` → `trip_{part}.parquet`. + pub(crate) fn name(self) -> &'static str { + match self { + Table::Trip => "trip", + Table::Building => "building", + } + } + + /// Columns the wired queries read — the projection applied when building native files. + pub(crate) fn columns(self) -> &'static [&'static str] { + match self { + Table::Trip => &["t_tripkey", "t_pickuptime", "t_pickuploc"], + Table::Building => &["b_buildingkey", "b_name", "b_boundary"], + } + } + + /// Geometry columns to decode from WKB to native, with their geometry type. Empty for tables + /// only used on the WKB lane (DuckDB reads WKB directly; no native conversion needed yet). + pub(crate) fn geometry_columns(self) -> &'static [GeometryColumn] { + match self { + Table::Trip => &[GeometryColumn { + name: "t_pickuploc", + kind: GeometryKind::Point, + }], + Table::Building => &[GeometryColumn { + name: "b_boundary", + kind: GeometryKind::Polygon, + }], + } + } +} diff --git a/vortex-bench/src/spatialbench/datagen/wkb.rs b/vortex-bench/src/spatialbench/datagen/wkb.rs new file mode 100644 index 00000000000..aea236f6333 --- /dev/null +++ b/vortex-bench/src/spatialbench/datagen/wkb.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench WKB base-table generation via the `spatialbench` crates (a tpchgen-rs fork). +//! Geometry is emitted as WKB; the native-Point encodings derive from these files in +//! [`super::native`]. + +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Result; +// spatialbench emits arrow-56 batches, so they must be written with its matching arrow-56 +// parquet crate, not the workspace's arrow-58 one. The parquet file itself is version-neutral. +use spatialbench::generators::BuildingGenerator; +use spatialbench::generators::TripGenerator; +use spatialbench_arrow::BuildingArrow; +use spatialbench_arrow::RecordBatchIterator; +use spatialbench_arrow::TripArrow; +use spatialbench_parquet::arrow::AsyncArrowWriter; +use spatialbench_parquet::basic::Compression; +use spatialbench_parquet::file::properties::WriterProperties; +use tokio::fs::File as TokioFile; +use tracing::info; + +use super::table::TABLES; +use super::table::Table; +use crate::Format; +use crate::utils::file::idempotent_async; + +/// Batch size matching the TPC-H generator's streaming batches. +const BATCH_SIZE: usize = 8192 * 64; + +/// Batch iterator for one partition of `table`, from the arrow-56 `spatialbench` crates. +fn iterator( + table: Table, + scale_factor: f64, + part: i32, + part_count: i32, +) -> Box { + match table { + Table::Trip => Box::new( + TripArrow::new(TripGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + Table::Building => Box::new( + BuildingArrow::new(BuildingGenerator::new(scale_factor, part, part_count)) + .with_batch_size(BATCH_SIZE), + ), + } +} + +/// Generate the SpatialBench base tables as parquet under `{output_dir}/parquet/`. +pub async fn generate_tables(scale_factor: &str, output_dir: PathBuf) -> Result<()> { + let scale_factor = scale_factor.parse::()?; + let parquet_dir = output_dir.join(Format::Parquet.name()); + fs::create_dir_all(&parquet_dir)?; + + // One part per unit of scale factor keeps each file near the ~350MB the trip generator + // produces at SF1. + #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let num_parts = (scale_factor.ceil() as usize).max(1); + let part_count = i32::try_from(num_parts)?; + + for &table in TABLES { + for part_idx in 0..num_parts { + let output_file = parquet_dir.join(format!("{}_{part_idx}.parquet", table.name())); + let part = i32::try_from(part_idx + 1)?; + + idempotent_async(output_file.to_string_lossy().as_ref(), |path| async move { + info!( + scale_factor, + part, + part_count, + table = table.name(), + "Generating SpatialBench table" + ); + + let iter = iterator(table, scale_factor, part, part_count); + let schema = Arc::clone(iter.schema()); + + let file = TokioFile::create(&path).await?; + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = AsyncArrowWriter::try_new(file, schema, Some(props))?; + for batch in iter { + writer.write(&batch).await?; + } + writer.close().await?; + + Ok::<(), anyhow::Error>(()) + }) + .await?; + } + } + + Ok(()) +} diff --git a/vortex-bench/src/spatialbench/mod.rs b/vortex-bench/src/spatialbench/mod.rs new file mode 100644 index 00000000000..bba06bd7ef9 --- /dev/null +++ b/vortex-bench/src/spatialbench/mod.rs @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! SpatialBench geospatial analytics benchmark. +//! +//! See . + +pub mod benchmark; +pub mod datagen; + +pub use benchmark::SpatialBenchBenchmark; diff --git a/vortex-bench/src/v3.rs b/vortex-bench/src/v3.rs index 48e8a7f1c94..7ac1d72365b 100644 --- a/vortex-bench/src/v3.rs +++ b/vortex-bench/src/v3.rs @@ -294,6 +294,7 @@ fn canonical_tpc_scale_factor(scale_factor: &str) -> String { /// | `GhArchive` | `gharchive` | `None` | `None` | | /// | `Appian` | `appian` | `None` | `None` | Static dataset; no scale factor. | /// | `PublicBi { name }` | `public-bi` | dataset name (e.g. `cms-provider`) | `None` | Sub-dataset name lives in `dataset_variant`. | +/// | `SpatialBench { scale_factor, native_points }` | `spatialbench` | `points-native` when native, else `None` | SF as string | Same canonicalization as TPC-H; no historical v2 records to merge with. | pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, Option) { match d { BenchmarkDataset::TpcH { scale_factor } => ( @@ -318,6 +319,14 @@ pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, // live). Drop it to keep live ingests merging into the migrated // group. The dataset-level `n_rows` is recoverable from the bench // matrix if ever needed. + BenchmarkDataset::SpatialBench { + scale_factor, + native_points, + } => ( + "spatialbench".to_string(), + native_points.then(|| "points-native".to_string()), + Some(canonical_tpc_scale_factor(scale_factor)), + ), BenchmarkDataset::StatPopGen { .. } => ("statpopgen".to_string(), None, None), BenchmarkDataset::PolarSignals { .. } => ("polarsignals".to_string(), None, None), BenchmarkDataset::Fineweb => ("fineweb".to_string(), None, None), diff --git a/vortex-duckdb/src/convert/dtype.rs b/vortex-duckdb/src/convert/dtype.rs index 4238b354182..b2bb9a5e962 100644 --- a/vortex-duckdb/src/convert/dtype.rs +++ b/vortex-duckdb/src/convert/dtype.rs @@ -58,6 +58,8 @@ use vortex::extension::datetime::Time; use vortex::extension::datetime::TimeUnit; use vortex::extension::datetime::Timestamp; use vortex_geo::extension::GeoMetadata; +use vortex_geo::extension::Point; +use vortex_geo::extension::Polygon; use vortex_geo::extension::WellKnownBinary; use crate::cpp::DUCKDB_TYPE; @@ -245,9 +247,14 @@ impl TryFrom<&DType> for LogicalType { return temporal_to_duckdb(temporal); } - if let Some(wkb) = ext_dtype.metadata_opt::() { - let crs = wkb.crs.as_ref(); - return LogicalType::geometry_type(crs.map(|crs| crs.as_str())); + // Native Point/Polygon and WKB all surface to DuckDB as GEOMETRY so `ST_*` bind; for + // native geometry the filter work then pushes down into the Vortex scan. + if let Some(geo) = ext_dtype + .metadata_opt::() + .or_else(|| ext_dtype.metadata_opt::()) + .or_else(|| ext_dtype.metadata_opt::()) + { + return LogicalType::geometry_type(geo.crs.as_deref()); } vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id()); diff --git a/vortex-duckdb/src/convert/expr.rs b/vortex-duckdb/src/convert/expr.rs index 324086e5775..5ffdbe60958 100644 --- a/vortex-duckdb/src/convert/expr.rs +++ b/vortex-duckdb/src/convert/expr.rs @@ -27,6 +27,7 @@ use vortex::expr::not; use vortex::expr::or_collect; use vortex::expr::root; use vortex::scalar::Scalar; +use vortex::scalar_fn::EmptyOptions; use vortex::scalar_fn::ScalarFnVTableExt; use vortex::scalar_fn::fns::between::Between; use vortex::scalar_fn::fns::between::BetweenOptions; @@ -36,6 +37,9 @@ use vortex::scalar_fn::fns::like::Like; use vortex::scalar_fn::fns::like::LikeOptions; use vortex::scalar_fn::fns::literal::Literal; use vortex::scalar_fn::fns::operators::Operator; +use vortex_geo::extension::WellKnownBinary; +use vortex_geo::extension::point_2d_scalar; +use vortex_geo::scalar_fn::distance::GeoDistance; use crate::cpp::DUCKDB_VX_EXPR_TYPE; use crate::duckdb; @@ -57,11 +61,86 @@ fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult { } } +/// Read an `f64` from a constant expression (e.g. an `ST_Point` coordinate literal). +fn from_bound_f64(value: &duckdb::ExpressionRef) -> VortexResult { + match value.as_class().vortex_expect("unknown class") { + BoundConstant(constant) => f64::try_from(&Scalar::try_from(constant.value)?), + _ => vortex_bail!("Expected f64 constant, got {:?}", value.as_class_id()), + } +} + +/// Convert an `ST_Distance` operand to a native geometry expression. A folded `ST_Point(..)` +/// constant arrives as WKB `GEOMETRY`; decode it once at plan time to a native `Point`, no per-row WKB. +fn geo_operand( + value: &duckdb::ExpressionRef, + col_sub: Option<&Expression>, +) -> VortexResult> { + if let Some(BoundConstant(constant)) = value.as_class() { + let scalar = Scalar::try_from(constant.value)?; + if let Some(point) = point_scalar_from_geometry_const(&scalar)? { + return Ok(Some(lit(point))); + } + } + try_from_expression_inner(value, col_sub) +} + +/// Decode a constant WKB `Point` into a native `Point` scalar. `None` if it isn't a WKB constant or +/// isn't a Point — those fall through to the general geo path rather than being misread. +fn point_scalar_from_geometry_const(scalar: &Scalar) -> VortexResult> { + let DType::Extension(ext_dtype) = scalar.dtype() else { + return Ok(None); + }; + if !ext_dtype.is::() { + return Ok(None); + } + let storage = scalar.as_extension().to_storage_scalar(); + let Some(buf) = storage.as_binary_opt().and_then(|b| b.value()) else { + return Ok(None); + }; + let Some((x, y)) = wkb_2d_point_xy(buf.as_slice()) else { + return Ok(None); + }; + Ok(Some(point_2d_scalar(x, y)?)) +} + +/// Read `(x, y)` from a bare 2D WKB Point: 1-byte endianness, geometry-type `u32 == 1`, two f64s. +/// `None` for anything else (SRID/Z/M flags or non-Point types shift these fixed offsets). +fn wkb_2d_point_xy(bytes: &[u8]) -> Option<(f64, f64)> { + if bytes.len() < 21 { + return None; + } + let le = bytes[0] == 1; + let read_u32 = |offset: usize| -> u32 { + let mut chunk = [0u8; 4]; + chunk.copy_from_slice(&bytes[offset..offset + 4]); + if le { + u32::from_le_bytes(chunk) + } else { + u32::from_be_bytes(chunk) + } + }; + let read_f64 = |offset: usize| -> f64 { + let mut chunk = [0u8; 8]; + chunk.copy_from_slice(&bytes[offset..offset + 8]); + if le { + f64::from_le_bytes(chunk) + } else { + f64::from_be_bytes(chunk) + } + }; + // Geometry-type code 1 == bare 2D Point; anything else shifts the coordinate offsets, so bail. + if read_u32(1) != 1 { + return None; + } + Some((read_f64(5), read_f64(13))) +} + fn try_from_bound_function( func: &BoundFunction, col_sub: Option<&Expression>, ) -> VortexResult> { - let expr = match func.scalar_function.name() { + let name = func.scalar_function.name(); + let expr = match name { "strlen" => { let children: Vec<_> = func.children().collect(); vortex_ensure!(children.len() == 1); @@ -115,15 +194,63 @@ fn try_from_bound_function( }; Like.new_expr(LikeOptions::default(), [value, lit(pattern)]) } - _ => { - debug!("bound function {}", func.scalar_function.name()); - return Ok(None); + // Geo UDFs (and any unsupported function) are handled here. + _ => return try_from_geo_function(name, func, col_sub), + }; + + Ok(Some(expr)) +} + +/// Lower the geospatial UDFs to native Vortex geo ops over `Point` storage, so the work runs in the +/// scan instead of materializing geometry for DuckDB. `None` for any other function. +fn try_from_geo_function( + name: &str, + func: &BoundFunction, + col_sub: Option<&Expression>, +) -> VortexResult> { + let children: Vec<_> = func.children().collect(); + let expr = match name.to_ascii_lowercase().as_str() { + "st_distance" => { + vortex_ensure!(children.len() == 2); + let Some(a) = geo_operand(children[0], col_sub)? else { + return Ok(None); + }; + let Some(b) = geo_operand(children[1], col_sub)? else { + return Ok(None); + }; + GeoDistance.new_expr(EmptyOptions, [a, b]) + } + "st_point" => { + vortex_ensure!(children.len() == 2); + lit(point_2d_scalar( + from_bound_f64(children[0])?, + from_bound_f64(children[1])?, + )?) } + coord @ ("st_x" | "st_y") => { + vortex_ensure!(children.len() == 1); + let Some(child) = try_from_expression_inner(children[0], col_sub)? else { + return Ok(None); + }; + // "st_x" -> "x", "st_y" -> "y" + get_item(&coord[3..], child) + } + _ => return Ok(None), }; Ok(Some(expr)) } +/// Whether `name` is a geo UDF that `try_from_geo_function` lowers — shared with +/// `can_push_expression` so the pushable and lowered sets can't drift. Case-insensitive since +/// DuckDB keeps the registered case (e.g. `ST_Distance`). +fn is_geo_function(name: &str) -> bool { + matches!( + name.to_ascii_lowercase().as_str(), + "st_distance" | "st_point" | "st_x" | "st_y" + ) +} + pub fn try_from_bound_expression( value: &duckdb::ExpressionRef, ) -> VortexResult> { @@ -166,13 +293,15 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool { BoundConjunction(conj) => conj.children().all(can_push_expression), ExpressionClass::BoundFunction(func) => { let name = func.scalar_function.name(); - name == "struct_extract" - || name == "contains" - || name == "prefix" - || name == "suffix" - || name == "~~" - || name == "!~~" - || name == "strlen" + // A geo UDF is pushable when all its operands are; `try_from_geo_function` lowers it. + // Built-in names are always lowercase; geo UDFs keep their registered case. + match name { + "struct_extract" | "contains" | "prefix" | "suffix" | "~~" | "!~~" | "strlen" => { + true + } + _ if is_geo_function(name) => func.children().all(can_push_expression), + _ => false, + } } ExpressionClass::BoundOperator(op) => { if !matches!( diff --git a/vortex-duckdb/src/exporter/extension.rs b/vortex-duckdb/src/exporter/extension.rs index 221dc92a85f..07be6d00dc0 100644 --- a/vortex-duckdb/src/exporter/extension.rs +++ b/vortex-duckdb/src/exporter/extension.rs @@ -8,6 +8,10 @@ use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::extension::datetime::AnyTemporal; use vortex::error::VortexResult; use vortex::error::vortex_bail; +use vortex_geo::extension::Point; +use vortex_geo::extension::PointData; +use vortex_geo::extension::Polygon; +use vortex_geo::extension::PolygonData; use vortex_geo::extension::WellKnownBinary; use vortex_geo::extension::WellKnownBinaryData; @@ -27,5 +31,13 @@ pub(crate) fn new_exporter( return geo::new_wkb_exporter(WellKnownBinaryData::try_from(ext)?, ctx); } + if ext.ext_dtype().is::() { + return geo::new_point_exporter(PointData::try_from(ext)?, ctx); + } + + if ext.ext_dtype().is::() { + return geo::new_polygon_exporter(PolygonData::try_from(ext)?, ctx); + } + vortex_bail!("no non-temporal extension exporter") } diff --git a/vortex-duckdb/src/exporter/geo.rs b/vortex-duckdb/src/exporter/geo.rs index 1287ed019e2..6137c56fd5a 100644 --- a/vortex-duckdb/src/exporter/geo.rs +++ b/vortex-duckdb/src/exporter/geo.rs @@ -4,6 +4,8 @@ use vortex::array::ExecutionCtx; use vortex::array::arrays::VarBinViewArray; use vortex::error::VortexResult; +use vortex_geo::extension::PointData; +use vortex_geo::extension::PolygonData; use vortex_geo::extension::WellKnownBinaryData; use crate::exporter::ColumnExporter; @@ -17,3 +19,24 @@ pub(crate) fn new_wkb_exporter( let values = array.wkb_values().clone().execute::(ctx)?; new_exporter(values, ctx) } + +/// Create an exporter for a native `Point` column. DuckDB `GEOMETRY` vectors carry WKB, so the +/// points are serialized to WKB via [`PointData::to_wkb`] (only for rows DuckDB materializes — +/// with predicate pushdown that's just the survivors). +pub(crate) fn new_point_exporter( + point: PointData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let values = point.to_wkb(ctx)?.execute::(ctx)?; + new_exporter(values, ctx) +} + +/// Create an exporter for a native `Polygon` column. DuckDB `GEOMETRY` vectors carry WKB, so the +/// polygons are serialized to WKB via [`PolygonData::to_wkb`] (only for rows DuckDB materializes). +pub(crate) fn new_polygon_exporter( + polygon: PolygonData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let values = polygon.to_wkb(ctx)?.execute::(ctx)?; + new_exporter(values, ctx) +} diff --git a/vortex-geo/Cargo.toml b/vortex-geo/Cargo.toml index e2f7e4dc10f..2f0583b49e6 100644 --- a/vortex-geo/Cargo.toml +++ b/vortex-geo/Cargo.toml @@ -20,6 +20,7 @@ geo = { workspace = true } geo-traits = { workspace = true } geo-types = { workspace = true } geoarrow = { workspace = true } +geoarrow-cast = { workspace = true } prost = { workspace = true } vortex-array = { workspace = true } vortex-error = { workspace = true } diff --git a/vortex-geo/src/extension/point.rs b/vortex-geo/src/extension/point.rs index 19e33c212f5..499352eb086 100644 --- a/vortex-geo/src/extension/point.rs +++ b/vortex-geo/src/extension/point.rs @@ -12,11 +12,15 @@ use arrow_schema::Field; use arrow_schema::extension::ExtensionType; use geo_traits::to_geo::ToGeoGeometry; use geo_types::Geometry; +use geoarrow::array::GeoArrowArray; use geoarrow::array::GeoArrowArrayAccessor; use geoarrow::array::IntoArrow; use geoarrow::array::PointArray; use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::GeoArrowType; use geoarrow::datatypes::PointType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; use prost::Message; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; @@ -31,12 +35,14 @@ use vortex_array::arrow::ArrowSession; use vortex_array::arrow::ArrowSessionExt; use vortex_array::arrow::FromArrowArray; use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; use vortex_array::dtype::arrow::FromArrowType; use vortex_array::dtype::extension::ExtDType; use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; use vortex_array::scalar::Scalar; use vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_error::vortex_err; @@ -96,20 +102,24 @@ fn point_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> PointType { PointType::new(dimension.into(), geoarrow_metadata(geo_metadata)) } -/// Decode `Point` storage to `geo_types` points, for the geo scalar functions. -pub(crate) fn point_geometries( - storage: &ArrayRef, - ctx: &mut ExecutionCtx, -) -> VortexResult>> { +/// Build the GeoArrow [`PointArray`] from `Point` storage — shared by geometry decode and WKB export. +fn point_array(storage: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { let point_type = point_type( &GeoMetadata::default(), coordinate_dimension(storage.dtype())?, ); let session = ctx.session().clone(); let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; - let points = PointArray::try_from((arrow.as_ref(), point_type)) - .map_err(|e| vortex_err!("failed to construct PointArray: {e}"))?; - points + PointArray::try_from((arrow.as_ref(), point_type)) + .map_err(|e| vortex_err!("failed to construct PointArray: {e}")) +} + +/// Decode `Point` storage to `geo_types` points, for the geo scalar functions. +pub(crate) fn point_geometries( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + point_array(storage, ctx)? .iter() .map(|geometry| -> VortexResult> { Ok(geometry @@ -120,6 +130,45 @@ pub(crate) fn point_geometries( .collect() } +/// A validated `Point` array (`try_from` checks the extension type) — the entry point for WKB export. +pub struct PointData(ExtensionArray); + +impl TryFrom for PointData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + vortex_ensure!( + ext.ext_dtype().is::(), + "expected a Point extension array" + ); + Ok(PointData(ext)) + } +} + +impl PointData { + /// Serialize points to WKB (a view array) via geoarrow's cast — the form DuckDB `GEOMETRY` takes. + pub fn to_wkb(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let points = point_array(&self.0.storage_array().clone(), ctx)?; + let wkb_type = + GeoArrowType::WkbView(WkbType::new(geoarrow_metadata(&GeoMetadata::default()))); + let wkb = cast(&points, &wkb_type) + .map_err(|e| vortex_err!("failed to cast points to WKB: {e}"))?; + ArrayRef::from_arrow(wkb.to_array_ref().as_ref(), false) + } +} + +/// A constant 2D `Point` scalar at `(x, y)`, no CRS (`2d`: builds only the `Xy` dimension). Lowers a +/// folded `ST_Point(x, y)` literal to a native operand for pushed-down geo functions. +pub fn point_2d_scalar(x: f64, y: f64) -> VortexResult { + let storage_dtype = coordinate_storage_dtype(Dimension::Xy, Nullability::NonNullable); + let storage = Scalar::struct_( + storage_dtype.clone(), + vec![Scalar::from(x), Scalar::from(y)], + ); + let ext = ExtDType::try_with_vtable(Point, GeoMetadata::default(), storage_dtype)?.erased(); + Scalar::try_new(DType::Extension(ext), storage.into_value()) +} + impl ArrowExportVTable for Point { fn arrow_ext_id(&self) -> Id { *ARROW_POINT diff --git a/vortex-geo/src/extension/polygon.rs b/vortex-geo/src/extension/polygon.rs index fc06ce59bd3..2eced0c962f 100644 --- a/vortex-geo/src/extension/polygon.rs +++ b/vortex-geo/src/extension/polygon.rs @@ -13,11 +13,15 @@ use arrow_schema::Field; use arrow_schema::extension::ExtensionType; use geo_traits::to_geo::ToGeoGeometry; use geo_types::Geometry; +use geoarrow::array::GeoArrowArray; use geoarrow::array::GeoArrowArrayAccessor; use geoarrow::array::IntoArrow; use geoarrow::array::PolygonArray; use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::GeoArrowType; use geoarrow::datatypes::PolygonType; +use geoarrow::datatypes::WkbType; +use geoarrow_cast::cast::cast; use prost::Message; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; @@ -38,6 +42,7 @@ use vortex_array::dtype::extension::ExtDType; use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; use vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; @@ -111,18 +116,21 @@ fn polygon_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> PolygonType PolygonType::new(dimension.into(), geoarrow_metadata(geo_metadata)) } -/// Decode `Polygon` storage (`List>`) to `geo_types` polygons, for the geo scalar -/// functions. CRS does not affect planar geometry ops, so default metadata is used. +/// Build the GeoArrow [`PolygonArray`] from `Polygon` storage — shared by geometry decode and WKB export. +fn polygon_array(storage: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + let polygon_type = polygon_type(&GeoMetadata::default(), polygon_dimension(storage.dtype())?); + let session = ctx.session().clone(); + let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; + PolygonArray::try_from((arrow.as_ref(), polygon_type)) + .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}")) +} + +/// Decode `Polygon` storage to `geo_types` polygons, for the geo scalar functions. pub(crate) fn polygon_geometries( storage: &ArrayRef, ctx: &mut ExecutionCtx, ) -> VortexResult>> { - let polygon_type = polygon_type(&GeoMetadata::default(), polygon_dimension(storage.dtype())?); - let session = ctx.session().clone(); - let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; - let polygons = PolygonArray::try_from((arrow.as_ref(), polygon_type)) - .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}"))?; - polygons + polygon_array(storage, ctx)? .iter() .map(|geometry| -> VortexResult> { Ok(geometry @@ -133,6 +141,33 @@ pub(crate) fn polygon_geometries( .collect() } +/// A validated `Polygon` array (`try_from` checks the extension type) — the entry point for WKB export. +pub struct PolygonData(ExtensionArray); + +impl TryFrom for PolygonData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + vortex_ensure!( + ext.ext_dtype().is::(), + "expected a Polygon extension array" + ); + Ok(PolygonData(ext)) + } +} + +impl PolygonData { + /// Serialize polygons to WKB (a view array) via geoarrow's cast — the form DuckDB `GEOMETRY` takes. + pub fn to_wkb(&self, ctx: &mut ExecutionCtx) -> VortexResult { + let polygons = polygon_array(&self.0.storage_array().clone(), ctx)?; + let wkb_type = + GeoArrowType::WkbView(WkbType::new(geoarrow_metadata(&GeoMetadata::default()))); + let wkb = cast(&polygons, &wkb_type) + .map_err(|e| vortex_err!("failed to cast polygons to WKB: {e}"))?; + ArrayRef::from_arrow(wkb.to_array_ref().as_ref(), false) + } +} + impl ArrowExportVTable for Polygon { fn arrow_ext_id(&self) -> Id { *ARROW_POLYGON diff --git a/vortex-geo/src/scalar_fn/distance.rs b/vortex-geo/src/scalar_fn/distance.rs index feb7ea833aa..645cd64f41a 100644 --- a/vortex-geo/src/scalar_fn/distance.rs +++ b/vortex-geo/src/scalar_fn/distance.rs @@ -10,8 +10,12 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::Constant; use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::ScalarFnArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; @@ -27,6 +31,8 @@ use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_session::VortexSession; +use crate::extension::Point; +use crate::extension::coordinate::coordinate_from_struct; use crate::extension::geometries; use crate::extension::single_geometry; @@ -99,14 +105,25 @@ impl ScalarFnVTable for GeoDistance { (Some(query), None) => distances_to_constant(&b, query.scalar(), ctx), (None, Some(query)) => distances_to_constant(&a, query.scalar(), ctx), (None, None) => { - let ag = geometries(&a, ctx)?; - let bg = geometries(&b, ctx)?; vortex_ensure!( - ag.len() == bg.len(), + a.len() == b.len(), "geo distance: operand length mismatch {} vs {}", - ag.len(), - bg.len() + a.len(), + b.len() ); + // Fast path: two Point columns — distance over their `x`/`y` f64 buffers directly. + if is_point(a.dtype()) && is_point(b.dtype()) { + let (xa, ya) = point_xy(&a, ctx)?; + let (xb, yb) = point_xy(&b, ctx)?; + return Ok(point_distances( + xa.as_slice::().iter().copied(), + ya.as_slice::().iter().copied(), + xb.as_slice::().iter().copied(), + yb.as_slice::().iter().copied(), + )); + } + let ag = geometries(&a, ctx)?; + let bg = geometries(&b, ctx)?; let distances = ag.iter().zip(&bg).map(|(x, y)| Euclidean.distance(x, y)); Ok(PrimitiveArray::from_iter(distances).into_array()) } @@ -121,12 +138,70 @@ fn distances_to_constant( query: &Scalar, ctx: &mut ExecutionCtx, ) -> VortexResult { + // Fast path: Point column to a constant Point — distance over the column's `x`/`y` f64 buffers, + // broadcasting the constant, skipping the per-row `geo::Geometry` the general path materializes. + // (Polygons / other geometries fall through.) + if is_point(operand.dtype()) && is_point(query.dtype()) { + let q = coordinate_from_struct(&query.as_extension().to_storage_scalar())?; + let (xs, ys) = point_xy(operand, ctx)?; + return Ok(point_distances( + xs.as_slice::().iter().copied(), + ys.as_slice::().iter().copied(), + std::iter::repeat(q.x), + std::iter::repeat(q.y), + )); + } + let query = single_geometry(query, ctx)?; let geoms = geometries(operand, ctx)?; let distances = geoms.iter().map(|g| Euclidean.distance(g, &query)); Ok(PrimitiveArray::from_iter(distances).into_array()) } +/// Extract the `x` and `y` `f64` columns from a native `Point` operand, for the columnar fast paths. +fn point_xy( + operand: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { + let storage = operand + .clone() + .execute::(ctx)? + .storage_array() + .clone() + .execute::(ctx)?; + let xs = storage + .unmasked_field_by_name("x")? + .clone() + .execute::(ctx)?; + let ys = storage + .unmasked_field_by_name("y")? + .clone() + .execute::(ctx)?; + Ok((xs, ys)) +} + +/// Per-row planar distance `sqrt(dx² + dy²)` between two streams of `(x, y)` f64 coordinates. Shared +/// by the point fast paths; a constant operand feeds its side as `repeat(c)` (zero-alloc broadcast). +fn point_distances( + xa: impl Iterator, + ya: impl Iterator, + xb: impl Iterator, + yb: impl Iterator, +) -> ArrayRef { + let distances = xa.zip(ya).zip(xb.zip(yb)).map(|((xa, ya), (xb, yb))| { + let (dx, dy) = (xa - xb, ya - yb); + (dx * dx + dy * dy).sqrt() + }); + PrimitiveArray::from_iter(distances).into_array() +} + +/// Whether `dtype` is the native `Point` extension (eligible for the columnar fast path). +fn is_point(dtype: &DType) -> bool { + dtype + .as_extension_opt() + .is_some_and(|ext| ext.is::()) +} + #[cfg(test)] mod tests { use vortex_array::ArrayRef;