From 4626f9b32d8d45cb8e3f0d4891c53fa6ee1d0d70 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 27 Mar 2025 17:30:28 -0400 Subject: [PATCH 01/20] Temp pin to datafusion main --- Cargo.toml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index b6164f89d31e8..c282d2283cfcb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -206,3 +206,20 @@ used_underscore_binding = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_qualifications = "deny" + + +## Temporary arrow-rs patch until 55 + +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } From 98d62e0d0e448818de8a2bf01c35e5214e31534f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Mar 2025 18:41:06 -0400 Subject: [PATCH 02/20] Update cargo lock --- Cargo.lock | 64 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 16fccdb2865b7..8a7181675972d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -301,9 +301,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e899dade2c3b7f5642eb8366cfd898958bcca099cde6dfea543c7e8d3ad88d4" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "bytes", "half", @@ -349,9 +348,8 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a329fb064477c9ec5f0870d2f5130966f91055c7c5bce2b3a084f116bc28c3b" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-buffer", "arrow-schema", @@ -448,9 +446,8 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85934a9d0261e0fa5d4e2a5295107d743b543a6e0484a835d4b8db2da15306f9" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "bitflags 2.8.0", "serde", @@ -3897,7 +3894,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -7046,7 +7043,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] [[package]] @@ -7501,3 +7498,48 @@ dependencies = [ "cc", "pkg-config", ] + +[[patch.unused]] +name = "arrow" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "arrow-array" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "arrow-cast" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "arrow-flight" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "arrow-ipc" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "arrow-ord" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "arrow-select" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "arrow-string" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" + +[[patch.unused]] +name = "parquet" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" From fc1f7a58a44a86d749913e71300cfe28e442479f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Mar 2025 18:45:10 -0400 Subject: [PATCH 03/20] update pyo3 --- Cargo.lock | 167 +++++++++++++---------------------- Cargo.toml | 14 +-- datafusion/common/Cargo.toml | 2 +- 3 files changed, 69 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a7181675972d..1c59c62bbcd35 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,9 +246,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc208515aa0151028e464cc94a692156e945ce5126abd3537bb7fd6ba2143ed1" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-arith", "arrow-array", @@ -265,14 +264,13 @@ dependencies = [ "arrow-string", "half", "pyo3", - "rand 0.8.5", + "rand 0.9.0", ] [[package]] name = "arrow-arith" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e07e726e2b3f7816a85c6a45b6ec118eeeabf0b2a8c208122ad949437181f49a" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-buffer", @@ -284,9 +282,8 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2262eba4f16c78496adfd559a29fe4b24df6088efc9985a873d58e92be022d5" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "ahash 0.8.11", "arrow-buffer", @@ -311,9 +308,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4103d88c5b441525ed4ac23153be7458494c2b0c9a11115848fdb9b81f6f886a" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-buffer", @@ -332,9 +328,8 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d3cb0914486a3cae19a5cad2598e44e225d53157926d0ada03c20521191a65" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-cast", @@ -359,9 +354,8 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7408f2bf3b978eddda272c7699f439760ebc4ac70feca25fefa82c5b8ce808d" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-arith", "arrow-array", @@ -386,9 +380,8 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddecdeab02491b1ce88885986e25002a3da34dd349f682c7cfe67bab7cc17b86" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-buffer", @@ -400,9 +393,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d03b9340013413eb84868682ace00a1098c81a5ebc96d279f7ebf9a4cac3c0fd" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-buffer", @@ -413,16 +405,17 @@ dependencies = [ "half", "indexmap 2.8.0", "lexical-core", + "memchr", "num", "serde", "serde_json", + "simdutf8", ] [[package]] name = "arrow-ord" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f841bfcc1997ef6ac48ee0305c4dfceb1f7c786fe31e67c1186edf775e1f1160" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-buffer", @@ -433,9 +426,8 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eeb55b0a0a83851aa01f2ca5ee5648f607e8506ba6802577afdda9d75cdedcd" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-buffer", @@ -455,9 +447,8 @@ dependencies = [ [[package]] name = "arrow-select" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e2932aece2d0c869dd2125feb9bd1709ef5c445daa3838ac4112dcfa0fda52c" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -469,9 +460,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "912e38bd6a7a7714c1d9b61df80315685553b7455e8a6045c27531d8ecd5b458" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "arrow-array", "arrow-buffer", @@ -1344,9 +1334,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.39" +version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1354,7 +1344,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.6", + "windows-link", ] [[package]] @@ -2900,11 +2890,11 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "24.12.23" +version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.8.0", "rustc_version", ] @@ -3982,7 +3972,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ - "twox-hash", + "twox-hash 1.6.3", ] [[package]] @@ -4343,9 +4333,8 @@ dependencies = [ [[package]] name = "parquet" -version = "54.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f88838dca3b84d41444a0341b19f347e8098a3898b0f21536654b8b799e11abd" +version = "54.3.1" +source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -4373,9 +4362,8 @@ dependencies = [ "snap", "thrift", "tokio", - "twox-hash", + "twox-hash 2.1.0", "zstd", - "zstd-sys", ] [[package]] @@ -4822,9 +4810,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.23.5" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +checksum = "7f1c6c3591120564d64db2261bec5f910ae454f01def849b9c22835a84695e86" dependencies = [ "cfg-if", "indoc", @@ -4840,9 +4828,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.23.5" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +checksum = "e9b6c2b34cf71427ea37c7001aefbaeb85886a074795e35f161f5aecc7620a7a" dependencies = [ "once_cell", "target-lexicon", @@ -4850,9 +4838,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.23.5" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +checksum = "5507651906a46432cdda02cd02dd0319f6064f1374c9147c45b978621d2c3a9c" dependencies = [ "libc", "pyo3-build-config", @@ -4860,9 +4848,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.5" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +checksum = "b0d394b5b4fd8d97d48336bb0dd2aebabad39f1d294edd6bcd2cccf2eefe6f42" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -4872,9 +4860,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.23.5" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +checksum = "fd72da09cfa943b1080f621f024d2ef7e2773df7badd51aa30a2be1f8caa7c8e" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -6110,9 +6098,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.12.16" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" [[package]] name = "tempfile" @@ -6613,6 +6601,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "twox-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908" + [[package]] name = "typed-arena" version = "2.0.2" @@ -7105,6 +7099,12 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "windows-link" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + [[package]] name = "windows-registry" version = "0.2.0" @@ -7498,48 +7498,3 @@ dependencies = [ "cc", "pkg-config", ] - -[[patch.unused]] -name = "arrow" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "arrow-array" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "arrow-cast" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "arrow-flight" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "arrow-ipc" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "arrow-ord" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "arrow-select" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "arrow-string" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" - -[[patch.unused]] -name = "parquet" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" diff --git a/Cargo.toml b/Cargo.toml index c282d2283cfcb..4c06b418820c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,19 +87,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.17", default-features = false } -arrow = { version = "54.2.1", features = [ +arrow = { version = "54.3.1", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "54.1.0", default-features = false } -arrow-flight = { version = "54.2.1", features = [ +arrow-buffer = { version = "54.3.1", default-features = false } +arrow-flight = { version = "54.3.1", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "54.2.0", default-features = false, features = [ +arrow-ipc = { version = "54.3.1", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "54.1.0", default-features = false } -arrow-schema = { version = "54.1.0", default-features = false } +arrow-ord = { version = "54.3.1", default-features = false } +arrow-schema = { version = "54.3.1", default-features = false } async-trait = "0.1.88" bigdecimal = "0.4.7" bytes = "1.10" @@ -149,7 +149,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "54.2.1", default-features = false, features = [ +parquet = { version = "54.3.1", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 39b47a96bccf3..74e99163955e9 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -63,7 +63,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.23.5", optional = true } +pyo3 = { version = "0.24.0", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true } tokio = { workspace = true } From 9299b4bea974e1f09b1325338d88fe2cf240c31d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Mar 2025 18:52:21 -0400 Subject: [PATCH 04/20] vendor random generation --- .../functions-aggregate/benches/array_agg.rs | 28 +++++++++++++++++-- datafusion/functions/benches/chr.rs | 10 +++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-aggregate/benches/array_agg.rs b/datafusion/functions-aggregate/benches/array_agg.rs index fb605e87ed0cc..e22be611d8d76 100644 --- a/datafusion/functions-aggregate/benches/array_agg.rs +++ b/datafusion/functions-aggregate/benches/array_agg.rs @@ -19,17 +19,23 @@ use std::sync::Arc; use arrow::array::{ Array, ArrayRef, ArrowPrimitiveType, AsArray, ListArray, NullBufferBuilder, + PrimitiveArray, }; use arrow::datatypes::{Field, Int64Type}; -use arrow::util::bench_util::create_primitive_array; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_expr::Accumulator; use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator; use arrow::buffer::OffsetBuffer; -use arrow::util::test_util::seedable_rng; use rand::distributions::{Distribution, Standard}; +use rand::prelude::StdRng; use rand::Rng; +use rand::SeedableRng; + +/// Returns fixed seedable RNG +pub fn seedable_rng() -> StdRng { + StdRng::seed_from_u64(42) +} fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { let list_item_data_type = values.as_list::().values().data_type().clone(); @@ -46,6 +52,24 @@ fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { }); } +pub fn create_primitive_array(size: usize, null_density: f32) -> PrimitiveArray +where + T: ArrowPrimitiveType, + Standard: Distribution, +{ + let mut rng = seedable_rng(); + + (0..size) + .map(|_| { + if rng.gen::() < null_density { + None + } else { + Some(rng.gen()) + } + }) + .collect() +} + /// Create List array with the given item data type, null density, null locations and zero length lists density /// Creates an random (but fixed-seeded) array of a given size and null density pub fn create_list_array( diff --git a/datafusion/functions/benches/chr.rs b/datafusion/functions/benches/chr.rs index 4750fb4666532..8575809c21c8b 100644 --- a/datafusion/functions/benches/chr.rs +++ b/datafusion/functions/benches/chr.rs @@ -17,15 +17,21 @@ extern crate criterion; -use arrow::{array::PrimitiveArray, datatypes::Int64Type, util::test_util::seedable_rng}; +use arrow::{array::PrimitiveArray, datatypes::Int64Type}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string::chr; -use rand::Rng; +use rand::{Rng, SeedableRng}; use arrow::datatypes::DataType; +use rand::rngs::StdRng; use std::sync::Arc; +/// Returns fixed seedable RNG +pub fn seedable_rng() -> StdRng { + StdRng::seed_from_u64(42) +} + fn criterion_benchmark(c: &mut Criterion) { let cot_fn = chr(); let size = 1024; From 7b8320eee948396f7e76db694e769ed22481805a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Mar 2025 18:54:50 -0400 Subject: [PATCH 05/20] Update error message --- datafusion/sqllogictest/test_files/dates.slt | 2 +- datafusion/sqllogictest/test_files/timestamps.slt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/dates.slt b/datafusion/sqllogictest/test_files/dates.slt index 4425eee333735..148f0dfe64bb7 100644 --- a/datafusion/sqllogictest/test_files/dates.slt +++ b/datafusion/sqllogictest/test_files/dates.slt @@ -183,7 +183,7 @@ query error input contains invalid characters SELECT to_date('2020-09-08 12/00/00+00:00', '%c', '%+') # to_date with broken formatting -query error bad or unsupported format string +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input SELECT to_date('2020-09-08 12/00/00+00:00', '%q') statement ok diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index dcbcfbfa439d5..e3f8d2e4c8bb2 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -2241,23 +2241,23 @@ query error input contains invalid characters SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+') # to_timestamp with broken formatting -query error bad or unsupported format string +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_nanos with broken formatting -query error bad or unsupported format string +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_millis with broken formatting -query error bad or unsupported format string +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_micros with broken formatting -query error bad or unsupported format string +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_seconds with broken formatting -query error bad or unsupported format string +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q') # Create string timestamp table with different formats From cdc55e5c30ee68dc7e1f4f450d6ad8c7af551ee8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Mar 2025 18:56:46 -0400 Subject: [PATCH 06/20] Update for extraction --- datafusion/sqllogictest/test_files/expr/date_part.slt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/expr/date_part.slt b/datafusion/sqllogictest/test_files/expr/date_part.slt index dec796aa59cb5..39c42cbe1e97f 100644 --- a/datafusion/sqllogictest/test_files/expr/date_part.slt +++ b/datafusion/sqllogictest/test_files/expr/date_part.slt @@ -884,7 +884,7 @@ SELECT extract(day from arrow_cast('14400 minutes', 'Interval(DayTime)')) query I SELECT extract(minute from arrow_cast('14400 minutes', 'Interval(DayTime)')) ---- -14400 +0 query I SELECT extract(second from arrow_cast('5.1 seconds', 'Interval(DayTime)')) @@ -894,7 +894,7 @@ SELECT extract(second from arrow_cast('5.1 seconds', 'Interval(DayTime)')) query I SELECT extract(second from arrow_cast('14400 minutes', 'Interval(DayTime)')) ---- -864000 +0 query I SELECT extract(second from arrow_cast('2 months', 'Interval(MonthDayNano)')) @@ -954,7 +954,7 @@ from t order by id; ---- 0 0 5 -1 0 15 +1 0 3 2 0 0 3 2 0 4 0 8 From 53ec3530128546cf77006499cb3a9fb0d4111d3d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 10:38:17 -0400 Subject: [PATCH 07/20] Update pin --- Cargo.lock | 165 ++++++++++++++++++++++++++++++++--------------------- Cargo.toml | 39 +++++++------ 2 files changed, 121 insertions(+), 83 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1dcca0dccb708..c778e8a7b846b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,8 +246,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-arith", "arrow-array", @@ -269,8 +269,8 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-buffer", @@ -282,8 +282,8 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "ahash 0.8.11", "arrow-buffer", @@ -298,8 +298,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "bytes", "half", @@ -308,8 +308,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-buffer", @@ -328,8 +328,8 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-cast", @@ -343,8 +343,8 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-buffer", "arrow-schema", @@ -354,8 +354,8 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-arith", "arrow-array", @@ -380,8 +380,8 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-buffer", @@ -393,8 +393,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-buffer", @@ -414,8 +414,8 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-buffer", @@ -426,8 +426,8 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-buffer", @@ -438,8 +438,8 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "bitflags 2.8.0", "serde", @@ -447,8 +447,8 @@ dependencies = [ [[package]] name = "arrow-select" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -460,8 +460,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "arrow-array", "arrow-buffer", @@ -1841,7 +1841,7 @@ dependencies = [ "itertools 0.14.0", "log", "nix", - "object_store", + "object_store 0.11.2", "parking_lot", "parquet", "paste", @@ -1874,7 +1874,7 @@ dependencies = [ "futures", "log", "mimalloc", - "object_store", + "object_store 0.11.2", "parquet", "rand 0.8.5", "serde", @@ -1905,7 +1905,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store", + "object_store 0.11.2", "parking_lot", "tokio", ] @@ -1927,7 +1927,7 @@ dependencies = [ "datafusion-session", "futures", "log", - "object_store", + "object_store 0.11.2", "tempfile", "tokio", ] @@ -1950,7 +1950,7 @@ dependencies = [ "insta", "insta-cmd", "mimalloc", - "object_store", + "object_store 0.11.2", "parking_lot", "parquet", "predicates", @@ -1977,7 +1977,7 @@ dependencies = [ "insta", "libc", "log", - "object_store", + "object_store 0.11.2", "parquet", "paste", "pyo3", @@ -2020,7 +2020,7 @@ dependencies = [ "glob", "itertools 0.14.0", "log", - "object_store", + "object_store 0.11.2", "parquet", "rand 0.8.5", "tempfile", @@ -2050,7 +2050,7 @@ dependencies = [ "datafusion-session", "futures", "num-traits", - "object_store", + "object_store 0.11.2", "rstest", "serde_json", "tokio", @@ -2074,7 +2074,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store", + "object_store 0.11.2", "regex", "tokio", ] @@ -2097,7 +2097,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store", + "object_store 0.11.2", "serde_json", "tokio", ] @@ -2125,7 +2125,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store", + "object_store 0.11.2", "parking_lot", "parquet", "rand 0.8.5", @@ -2153,7 +2153,7 @@ dependencies = [ "log", "mimalloc", "nix", - "object_store", + "object_store 0.11.2", "prost", "tempfile", "test-utils", @@ -2176,7 +2176,7 @@ dependencies = [ "datafusion-expr", "futures", "log", - "object_store", + "object_store 0.11.2", "parking_lot", "rand 0.8.5", "tempfile", @@ -2493,7 +2493,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-proto-common", "doc-comment", - "object_store", + "object_store 0.11.2", "pbjson", "prost", "serde", @@ -2532,7 +2532,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store", + "object_store 0.11.2", "parking_lot", "tokio", ] @@ -2578,7 +2578,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "log", - "object_store", + "object_store 0.11.2", "postgres-protocol", "postgres-types", "rust_decimal", @@ -2603,7 +2603,7 @@ dependencies = [ "datafusion-functions-aggregate", "insta", "itertools 0.14.0", - "object_store", + "object_store 0.11.2", "pbjson-types", "prost", "serde_json", @@ -2627,7 +2627,7 @@ dependencies = [ "datafusion-sql", "getrandom 0.2.15", "insta", - "object_store", + "object_store 0.11.2", "tokio", "url", "wasm-bindgen", @@ -2922,6 +2922,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", + "libz-rs-sys", "miniz_oxide", ] @@ -3943,6 +3944,15 @@ dependencies = [ "escape8259", ] +[[package]] +name = "libz-rs-sys" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "902bc563b5d65ad9bba616b490842ef0651066a1a1dc3ce1087113ffcb873c8d" +dependencies = [ + "zlib-rs", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -4274,6 +4284,27 @@ dependencies = [ "walkdir", ] +[[package]] +name = "object_store" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http 1.2.0", + "humantime", + "itertools 0.14.0", + "parking_lot", + "percent-encoding", + "thiserror 2.0.12", + "tokio", + "tracing", + "url", +] + [[package]] name = "once_cell" version = "1.20.3" @@ -4350,8 +4381,8 @@ dependencies = [ [[package]] name = "parquet" -version = "54.3.1" -source = "git+https://github.com/apache/arrow-rs.git?rev=b30336da8c1318f3f45e22f0a377ca21830fddac#b30336da8c1318f3f45e22f0a377ca21830fddac" +version = "55.0.0" +source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -4372,7 +4403,7 @@ dependencies = [ "lz4_flex", "num", "num-bigint", - "object_store", + "object_store 0.12.0", "paste", "seq-macro", "simdutf8", @@ -4752,7 +4783,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck 0.5.0", - "itertools 0.14.0", + "itertools 0.13.0", "log", "multimap", "once_cell", @@ -4772,7 +4803,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.100", @@ -4827,9 +4858,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.24.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f1c6c3591120564d64db2261bec5f910ae454f01def849b9c22835a84695e86" +checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229" dependencies = [ "cfg-if", "indoc", @@ -4845,9 +4876,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.24.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9b6c2b34cf71427ea37c7001aefbaeb85886a074795e35f161f5aecc7620a7a" +checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1" dependencies = [ "once_cell", "target-lexicon", @@ -4855,9 +4886,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.24.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5507651906a46432cdda02cd02dd0319f6064f1374c9147c45b978621d2c3a9c" +checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc" dependencies = [ "libc", "pyo3-build-config", @@ -4865,9 +4896,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.24.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0d394b5b4fd8d97d48336bb0dd2aebabad39f1d294edd6bcd2cccf2eefe6f42" +checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -4877,9 +4908,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.24.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd72da09cfa943b1080f621f024d2ef7e2773df7badd51aa30a2be1f8caa7c8e" +checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -7054,7 +7085,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] @@ -7488,6 +7519,12 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "zlib-rs" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b20717f0917c908dc63de2e44e97f1e6b126ca58d0e391cee86d504eb8fbd05" + [[package]] name = "zstd" version = "0.13.3" diff --git a/Cargo.toml b/Cargo.toml index 7181d8f87ee36..593ff94ec9dd3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,19 +87,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.17", default-features = false } -arrow = { version = "54.3.1", features = [ +arrow = { version = "55.0.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "54.3.1", default-features = false } -arrow-flight = { version = "54.3.1", features = [ +arrow-buffer = { version = "55.0.0", default-features = false } +arrow-flight = { version = "55.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "54.3.1", default-features = false, features = [ +arrow-ipc = { version = "55.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "54.3.1", default-features = false } -arrow-schema = { version = "54.3.1", default-features = false } +arrow-ord = { version = "55.0.0", default-features = false } +arrow-schema = { version = "55.0.0", default-features = false } async-trait = "0.1.88" bigdecimal = "0.4.7" bytes = "1.10" @@ -149,7 +149,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "54.3.1", default-features = false, features = [ +parquet = { version = "55.0.0", default-features = false, features = [ "arrow", "async", "object_store", @@ -216,17 +216,18 @@ unused_qualifications = "deny" ## Temporary arrow-rs patch until 55 +## https://github.com/apache/arrow-rs/pull/7391 [patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } -parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "b30336da8c1318f3f45e22f0a377ca21830fddac" } +arrow = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-array = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-buffer = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-cast = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-data = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-ipc = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-schema = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-select = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-string = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-ord = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +parquet = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } From c103a03e2d6a5d1e6f424709d7fb3357a3838174 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 10:46:14 -0400 Subject: [PATCH 08/20] Upgrade object_store --- Cargo.lock | 91 ++++++++++++++++-------------------------------------- Cargo.toml | 2 +- 2 files changed, 27 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f74e6f0a7a14..177d779c43f60 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1841,7 +1841,7 @@ dependencies = [ "itertools 0.14.0", "log", "nix", - "object_store 0.11.2", + "object_store", "parking_lot", "parquet", "paste", @@ -1874,7 +1874,7 @@ dependencies = [ "futures", "log", "mimalloc", - "object_store 0.11.2", + "object_store", "parquet", "rand 0.8.5", "serde", @@ -1905,7 +1905,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store 0.11.2", + "object_store", "parking_lot", "tokio", ] @@ -1927,7 +1927,7 @@ dependencies = [ "datafusion-session", "futures", "log", - "object_store 0.11.2", + "object_store", "tempfile", "tokio", ] @@ -1950,7 +1950,7 @@ dependencies = [ "insta", "insta-cmd", "mimalloc", - "object_store 0.11.2", + "object_store", "parking_lot", "parquet", "predicates", @@ -1977,7 +1977,7 @@ dependencies = [ "insta", "libc", "log", - "object_store 0.11.2", + "object_store", "parquet", "paste", "pyo3", @@ -2021,7 +2021,7 @@ dependencies = [ "glob", "itertools 0.14.0", "log", - "object_store 0.11.2", + "object_store", "parquet", "rand 0.8.5", "tempfile", @@ -2051,7 +2051,7 @@ dependencies = [ "datafusion-session", "futures", "num-traits", - "object_store 0.11.2", + "object_store", "rstest", "serde_json", "tokio", @@ -2075,7 +2075,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store 0.11.2", + "object_store", "regex", "tokio", ] @@ -2098,7 +2098,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store 0.11.2", + "object_store", "serde_json", "tokio", ] @@ -2126,7 +2126,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store 0.11.2", + "object_store", "parking_lot", "parquet", "rand 0.8.5", @@ -2154,7 +2154,7 @@ dependencies = [ "log", "mimalloc", "nix", - "object_store 0.11.2", + "object_store", "prost", "tempfile", "test-utils", @@ -2177,7 +2177,7 @@ dependencies = [ "datafusion-expr", "futures", "log", - "object_store 0.11.2", + "object_store", "parking_lot", "rand 0.8.5", "tempfile", @@ -2495,7 +2495,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-proto-common", "doc-comment", - "object_store 0.11.2", + "object_store", "pbjson", "prost", "serde", @@ -2534,7 +2534,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store 0.11.2", + "object_store", "parking_lot", "tokio", ] @@ -2580,7 +2580,7 @@ dependencies = [ "indicatif", "itertools 0.14.0", "log", - "object_store 0.11.2", + "object_store", "postgres-protocol", "postgres-types", "rust_decimal", @@ -2605,7 +2605,7 @@ dependencies = [ "datafusion-functions-aggregate", "insta", "itertools 0.14.0", - "object_store 0.11.2", + "object_store", "pbjson-types", "prost", "serde_json", @@ -2629,7 +2629,7 @@ dependencies = [ "datafusion-sql", "getrandom 0.2.15", "insta", - "object_store 0.11.2", + "object_store", "tokio", "url", "wasm-bindgen", @@ -4257,18 +4257,21 @@ dependencies = [ [[package]] name = "object_store" -version = "0.11.2" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" +checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", + "form_urlencoded", "futures", + "http 1.2.0", + "http-body-util", "humantime", "hyper", - "itertools 0.13.0", + "itertools 0.14.0", "md-5", "parking_lot", "percent-encoding", @@ -4279,28 +4282,7 @@ dependencies = [ "rustls-pemfile", "serde", "serde_json", - "snafu", - "tokio", - "tracing", - "url", - "walkdir", -] - -[[package]] -name = "object_store" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" -dependencies = [ - "async-trait", - "bytes", - "chrono", - "futures", - "http 1.2.0", - "humantime", - "itertools 0.14.0", - "parking_lot", - "percent-encoding", + "serde_urlencoded", "thiserror 2.0.12", "tokio", "tracing", @@ -4405,7 +4387,7 @@ dependencies = [ "lz4_flex", "num", "num-bigint", - "object_store 0.12.0", + "object_store", "paste", "seq-macro", "simdutf8", @@ -5810,27 +5792,6 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" -[[package]] -name = "snafu" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" -dependencies = [ - "snafu-derive", -] - -[[package]] -name = "snafu-derive" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "quote", - "syn 2.0.100", -] - [[package]] name = "snap" version = "1.1.1" diff --git a/Cargo.toml b/Cargo.toml index 593ff94ec9dd3..f65d23169ad5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -147,7 +147,7 @@ hashbrown = { version = "0.14.5", features = ["raw"] } indexmap = "2.8.0" itertools = "0.14" log = "^0.4" -object_store = { version = "0.11.0", default-features = false } +object_store = { version = "0.12.0", default-features = false } parking_lot = "0.12" parquet = { version = "55.0.0", default-features = false, features = [ "arrow", From 586851bcee0f5daed336c3815fa5093a84d3c5ba Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 10:49:05 -0400 Subject: [PATCH 09/20] fix feature --- Cargo.lock | 1 + datafusion/execution/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 177d779c43f60..357d3a922456c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4287,6 +4287,7 @@ dependencies = [ "tokio", "tracing", "url", + "walkdir", ] [[package]] diff --git a/datafusion/execution/Cargo.toml b/datafusion/execution/Cargo.toml index 8f642f3384d2e..20e507e98b68a 100644 --- a/datafusion/execution/Cargo.toml +++ b/datafusion/execution/Cargo.toml @@ -44,7 +44,7 @@ datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } futures = { workspace = true } log = { workspace = true } -object_store = { workspace = true } +object_store = { workspace = true, features = ["fs"] } parking_lot = { workspace = true } rand = { workspace = true } tempfile = { workspace = true } From cabfb589f52b40f3de6d2946025f85f99baf58bf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 10:58:10 -0400 Subject: [PATCH 10/20] Update file size handling --- datafusion/datasource/src/file_groups.rs | 13 +++++++------ datafusion/datasource/src/mod.rs | 24 +++++++++++++++--------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/datafusion/datasource/src/file_groups.rs b/datafusion/datasource/src/file_groups.rs index a1f966c22f35f..15c86427ed00a 100644 --- a/datafusion/datasource/src/file_groups.rs +++ b/datafusion/datasource/src/file_groups.rs @@ -224,10 +224,11 @@ impl FileGroupPartitioner { return None; } - let target_partition_size = (total_size as usize).div_ceil(target_partitions); + let target_partition_size = + (total_size as u64).div_ceil(target_partitions as u64); let current_partition_index: usize = 0; - let current_partition_size: usize = 0; + let current_partition_size: u64 = 0; // Partition byte range evenly for all `PartitionedFile`s let repartitioned_files = flattened_files @@ -497,15 +498,15 @@ struct ToRepartition { /// the index from which the original file will be taken source_index: usize, /// the size of the original file - file_size: usize, + file_size: u64, /// indexes of which group(s) will this be distributed to (including `source_index`) new_groups: Vec, } impl ToRepartition { - // how big will each file range be when this file is read in its new groups? - fn range_size(&self) -> usize { - self.file_size / self.new_groups.len() + /// How big will each file range be when this file is read in its new groups? + fn range_size(&self) -> u64 { + self.file_size / (self.new_groups.len() as u64) } } diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index c02f84c74d64b..1322fefdda505 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -52,7 +52,7 @@ pub use self::url::ListingTableUrl; use crate::file_groups::FileGroup; use chrono::TimeZone; use datafusion_common::stats::Precision; -use datafusion_common::{ColumnStatistics, Result}; +use datafusion_common::{exec_datafusion_err, ColumnStatistics, Result}; use datafusion_common::{ScalarValue, Statistics}; use file_meta::FileMeta; use futures::{Stream, StreamExt}; @@ -123,7 +123,7 @@ impl PartitionedFile { object_meta: ObjectMeta { location: Path::from(path.into()), last_modified: chrono::Utc.timestamp_nanos(0), - size: size as usize, + size, e_tag: None, version: None, }, @@ -141,7 +141,7 @@ impl PartitionedFile { object_meta: ObjectMeta { location: Path::from(path), last_modified: chrono::Utc.timestamp_nanos(0), - size: size as usize, + size, e_tag: None, version: None, }, @@ -224,7 +224,7 @@ impl From for PartitionedFile { /// Indicates that the range calculation determined no further action is /// necessary, possibly because the calculated range is empty or invalid. pub enum RangeCalculation { - Range(Option>), + Range(Option>), TerminateEarly, } @@ -250,7 +250,12 @@ pub async fn calculate_range( match file_meta.range { None => Ok(RangeCalculation::Range(None)), Some(FileRange { start, end }) => { - let (start, end) = (start as usize, end as usize); + let start: u64 = start.try_into().map_err(|_| { + exec_datafusion_err!("Expect start range to fit in u64, got {start}") + })?; + let end: u64 = end.try_into().map_err(|_| { + exec_datafusion_err!("Expect end range to fit in u64, got {end}") + })?; let start_delta = if start != 0 { find_first_newline(store, location, start - 1, file_size, newline).await? @@ -289,10 +294,10 @@ pub async fn calculate_range( async fn find_first_newline( object_store: &Arc, location: &Path, - start: usize, - end: usize, + start: u64, + end: u64, newline: u8, -) -> Result { +) -> Result { let options = GetOptions { range: Some(GetRange::Bounded(start..end)), ..Default::default() @@ -305,10 +310,11 @@ async fn find_first_newline( while let Some(chunk) = result_stream.next().await.transpose()? { if let Some(position) = chunk.iter().position(|&byte| byte == newline) { + let position = position as u64; return Ok(index + position); } - index += chunk.len(); + index += chunk.len() as u64; } Ok(index) From d980c009a65420e5221b5ba1b95ef3a3ae7dba0f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 11:39:06 -0400 Subject: [PATCH 11/20] bash for object store API changes --- .../datasource/physical_plan/arrow_file.rs | 20 +++++++++---------- datafusion/core/src/test/object_store.rs | 4 ++-- .../datasource-parquet/src/file_format.rs | 3 ++- datafusion/datasource-parquet/src/reader.rs | 16 ++++++++------- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 5dcf4df73f57a..21d962e6d11fb 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -305,7 +305,7 @@ impl FileOpener for ArrowOpener { )?; // read footer according to footer_len let get_option = GetOptions { - range: Some(GetRange::Suffix(10 + footer_len)), + range: Some(GetRange::Suffix(10 + (footer_len as u64))), ..Default::default() }; let get_result = object_store @@ -332,9 +332,9 @@ impl FileOpener for ArrowOpener { .iter() .flatten() .map(|block| { - let block_len = block.bodyLength() as usize - + block.metaDataLength() as usize; - let block_offset = block.offset() as usize; + let block_len = + block.bodyLength() as u64 + block.metaDataLength() as u64; + let block_offset = block.offset() as u64; block_offset..block_offset + block_len }) .collect_vec(); @@ -354,9 +354,9 @@ impl FileOpener for ArrowOpener { .iter() .flatten() .filter(|block| { - let block_offset = block.offset() as usize; - block_offset >= range.start as usize - && block_offset < range.end as usize + let block_offset = block.offset() as u64; + block_offset >= range.start as u64 + && block_offset < range.end as u64 }) .copied() .collect_vec(); @@ -364,9 +364,9 @@ impl FileOpener for ArrowOpener { let recordbatch_ranges = recordbatches .iter() .map(|block| { - let block_len = block.bodyLength() as usize - + block.metaDataLength() as usize; - let block_offset = block.offset() as usize; + let block_len = + block.bodyLength() as u64 + block.metaDataLength() as u64; + let block_offset = block.offset() as u64; block_offset..block_offset + block_len }) .collect_vec(); diff --git a/datafusion/core/src/test/object_store.rs b/datafusion/core/src/test/object_store.rs index e1328770cabdd..8b19658bb1473 100644 --- a/datafusion/core/src/test/object_store.rs +++ b/datafusion/core/src/test/object_store.rs @@ -66,7 +66,7 @@ pub fn local_unpartitioned_file(path: impl AsRef) -> ObjectMeta ObjectMeta { location, last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(), - size: metadata.len() as usize, + size: metadata.len(), e_tag: None, version: None, } @@ -166,7 +166,7 @@ impl ObjectStore for BlockingObjectStore { fn list( &self, prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + ) -> BoxStream<'static, object_store::Result> { self.inner.list(prefix) } diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 1d9a67fd2eb6d..f534d6769b9f1 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -739,6 +739,7 @@ impl MetadataFetch for ObjectStoreFetch<'_> { &mut self, range: Range, ) -> BoxFuture<'_, Result> { + let range = range.start as u64..range.end as u64; async { self.store .get_range(&self.meta.location, range) @@ -765,7 +766,7 @@ pub async fn fetch_parquet_metadata( ParquetMetaDataReader::new() .with_prefetch_hint(size_hint) - .load_and_finish(fetch, file_size) + .load_and_finish(fetch, file_size as usize) .await .map_err(DataFusionError::from) } diff --git a/datafusion/datasource-parquet/src/reader.rs b/datafusion/datasource-parquet/src/reader.rs index 5924a5b5038fc..141b30d431379 100644 --- a/datafusion/datasource-parquet/src/reader.rs +++ b/datafusion/datasource-parquet/src/reader.rs @@ -18,19 +18,19 @@ //! [`ParquetFileReaderFactory`] and [`DefaultParquetFileReaderFactory`] for //! low level control of parquet file readers +use crate::ParquetFileMetrics; use bytes::Bytes; use datafusion_datasource::file_meta::FileMeta; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use futures::future::BoxFuture; use object_store::ObjectStore; +use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader}; use parquet::file::metadata::ParquetMetaData; use std::fmt::Debug; use std::ops::Range; use std::sync::Arc; -use crate::ParquetFileMetrics; - /// Interface for reading parquet files. /// /// The combined implementations of [`ParquetFileReaderFactory`] and @@ -114,10 +114,11 @@ impl AsyncFileReader for ParquetFileReader { self.inner.get_byte_ranges(ranges) } - fn get_metadata( - &mut self, - ) -> BoxFuture<'_, parquet::errors::Result>> { - self.inner.get_metadata() + fn get_metadata<'a>( + &'a mut self, + options: Option<&'a ArrowReaderOptions>, + ) -> BoxFuture<'a, parquet::errors::Result>> { + self.inner.get_metadata(options) } } @@ -135,7 +136,8 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory { metrics, ); let store = Arc::clone(&self.store); - let mut inner = ParquetObjectReader::new(store, file_meta.object_meta); + let mut inner = ParquetObjectReader::new(store, file_meta.object_meta.location) + .with_file_size(file_meta.object_meta.size as usize); if let Some(hint) = metadata_size_hint { inner = inner.with_footer_size_hint(hint) From 2dd1827bff47eaa7210356e1bbb6622986ee124b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 11:41:05 -0400 Subject: [PATCH 12/20] few more --- datafusion/core/src/test_util/parquet.rs | 2 +- datafusion/proto/src/physical_plan/from_proto.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 084554eecbdb0..f5753af64d93f 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -102,7 +102,7 @@ impl TestParquetFile { println!("Generated test dataset with {num_rows} rows"); - let size = std::fs::metadata(&path)?.len() as usize; + let size = std::fs::metadata(&path)?.len(); let mut canonical_path = path.canonicalize()?; diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index c949e3c9f8cb1..88963ed6bba52 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -555,7 +555,7 @@ impl TryFrom<&protobuf::PartitionedFile> for PartitionedFile { object_meta: ObjectMeta { location: Path::from(val.path.as_str()), last_modified: Utc.timestamp_nanos(val.last_modified_ns as i64), - size: val.size as usize, + size: val.size, e_tag: None, version: None, }, From dff9490ee720ae5f9139a3813bdef1b192e8a9bc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 15:06:41 -0400 Subject: [PATCH 13/20] Update APIs more --- .../examples/advanced_parquet_index.rs | 5 ++++- .../core/src/datasource/file_format/arrow.rs | 4 ++-- .../core/src/datasource/file_format/csv.rs | 17 +++++++++-------- .../core/src/datasource/file_format/parquet.rs | 6 +++--- datafusion/core/src/datasource/mod.rs | 2 +- .../src/datasource/physical_plan/parquet.rs | 4 ++-- datafusion/core/tests/parquet/custom_reader.rs | 5 ++++- datafusion/core/tests/parquet/page_pruning.rs | 2 +- datafusion/core/tests/sql/path_partition.rs | 18 ++++++++++-------- .../datasource-parquet/src/row_group_filter.rs | 7 +++++-- datafusion/proto/src/physical_plan/to_proto.rs | 2 +- .../substrait/src/physical_plan/producer.rs | 2 +- 12 files changed, 43 insertions(+), 31 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index b8c303e221618..113c43dc13236 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -571,7 +571,9 @@ impl ParquetFileReaderFactory for CachedParquetFileReaderFactory { .to_string(); let object_store = Arc::clone(&self.object_store); - let mut inner = ParquetObjectReader::new(object_store, file_meta.object_meta); + let mut inner = + ParquetObjectReader::new(object_store, file_meta.object_meta.location) + .with_file_size(file_meta.object_meta.size as usize); if let Some(hint) = metadata_size_hint { inner = inner.with_footer_size_hint(hint) @@ -618,6 +620,7 @@ impl AsyncFileReader for ParquetReaderWithCache { fn get_metadata( &mut self, + _options: Option<&ArrowReaderOptions>, ) -> BoxFuture<'_, datafusion::parquet::errors::Result>> { println!("get_metadata: {} returning cached metadata", self.filename); diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 6c7c9463cf3b7..ea027ae91a061 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -442,7 +442,7 @@ mod tests { let object_meta = ObjectMeta { location, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; @@ -485,7 +485,7 @@ mod tests { let object_meta = ObjectMeta { location, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 309458975ab6c..9fa4c00e6af25 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -72,7 +72,7 @@ mod tests { #[derive(Debug)] struct VariableStream { bytes_to_repeat: Bytes, - max_iterations: usize, + max_iterations: u64, iterations_detected: Arc>, } @@ -103,14 +103,15 @@ mod tests { async fn get(&self, location: &Path) -> object_store::Result { let bytes = self.bytes_to_repeat.clone(); - let range = 0..bytes.len() * self.max_iterations; + let len = bytes.len() as u64; + let range = 0..len * self.max_iterations; let arc = self.iterations_detected.clone(); let stream = futures::stream::repeat_with(move || { let arc_inner = arc.clone(); *arc_inner.lock().unwrap() += 1; Ok(bytes.clone()) }) - .take(self.max_iterations) + .take(self.max_iterations as usize) .boxed(); Ok(GetResult { @@ -138,7 +139,7 @@ mod tests { async fn get_ranges( &self, _location: &Path, - _ranges: &[Range], + _ranges: &[Range], ) -> object_store::Result> { unimplemented!() } @@ -154,7 +155,7 @@ mod tests { fn list( &self, _prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + ) -> BoxStream<'static, object_store::Result> { unimplemented!() } @@ -179,7 +180,7 @@ mod tests { } impl VariableStream { - pub fn new(bytes_to_repeat: Bytes, max_iterations: usize) -> Self { + pub fn new(bytes_to_repeat: Bytes, max_iterations: u64) -> Self { Self { bytes_to_repeat, max_iterations, @@ -371,7 +372,7 @@ mod tests { let object_meta = ObjectMeta { location: Path::parse("/")?, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; @@ -429,7 +430,7 @@ mod tests { let object_meta = ObjectMeta { location: Path::parse("/")?, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 67a7ba8dc776e..76009ccd80b05 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -331,7 +331,7 @@ mod tests { fn list( &self, _prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + ) -> BoxStream<'static, object_store::Result> { Box::pin(futures::stream::once(async { Err(object_store::Error::NotImplemented) })) @@ -408,7 +408,7 @@ mod tests { ))); // Use the file size as the hint so we can get the full metadata from the first fetch - let size_hint = meta[0].size; + let size_hint = meta[0].size as usize; fetch_parquet_metadata(store.upcast().as_ref(), &meta[0], Some(size_hint)) .await @@ -443,7 +443,7 @@ mod tests { ))); // Use the a size hint larger than the file size to make sure we don't panic - let size_hint = meta[0].size + 100; + let size_hint = (meta[0].size + 100) as usize; fetch_parquet_metadata(store.upcast().as_ref(), &meta[0], Some(size_hint)) .await diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index a15b2b6ffe137..25a89644cd2a4 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -106,7 +106,7 @@ mod tests { let meta = ObjectMeta { location, last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(), - size: metadata.len() as usize, + size: metadata.len(), e_tag: None, version: None, }; diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 5c06c3902c1c8..5986460cb539a 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -1786,13 +1786,13 @@ mod tests { path: &str, store: Arc, batch: RecordBatch, - ) -> usize { + ) -> u64 { let mut writer = ArrowWriter::try_new(BytesMut::new().writer(), batch.schema(), None).unwrap(); writer.write(&batch).unwrap(); writer.flush().unwrap(); let bytes = writer.into_inner().unwrap().into_inner().freeze(); - let total_size = bytes.len(); + let total_size = bytes.len() as u64; let path = Path::from(path); let payload = object_store::PutPayload::from_bytes(bytes); store diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index ce5c0d720174d..c06da2e75a461 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -44,6 +44,7 @@ use insta::assert_snapshot; use object_store::memory::InMemory; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; +use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; @@ -186,7 +187,7 @@ async fn store_parquet_in_memory( location: Path::parse(format!("file-{offset}.parquet")) .expect("creating path"), last_modified: chrono::DateTime::from(SystemTime::now()), - size: buf.len(), + size: buf.len() as u64, e_tag: None, version: None, }; @@ -222,6 +223,7 @@ impl AsyncFileReader for ParquetFileReader { ) -> BoxFuture<'_, parquet::errors::Result> { self.metrics.bytes_scanned.add(range.end - range.start); + let range = range.start as u64..range.end as u64; self.store .get_range(&self.meta.location, range) .map_err(|e| { @@ -232,6 +234,7 @@ impl AsyncFileReader for ParquetFileReader { fn get_metadata( &mut self, + _options: Option<&ArrowReaderOptions>, ) -> BoxFuture<'_, parquet::errors::Result>> { Box::pin(async move { let metadata = fetch_parquet_metadata( diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 7006bf083eeed..f693485cbe018 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -52,7 +52,7 @@ async fn get_parquet_exec(state: &SessionState, filter: Expr) -> DataSourceExec let meta = ObjectMeta { location, last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(), - size: metadata.len() as usize, + size: metadata.len(), e_tag: None, version: None, }; diff --git a/datafusion/core/tests/sql/path_partition.rs b/datafusion/core/tests/sql/path_partition.rs index bf8466d849f25..fa6c7432413f1 100644 --- a/datafusion/core/tests/sql/path_partition.rs +++ b/datafusion/core/tests/sql/path_partition.rs @@ -712,7 +712,7 @@ impl ObjectStore for MirroringObjectStore { let meta = ObjectMeta { location: location.clone(), last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(), - size: metadata.len() as usize, + size: metadata.len(), e_tag: None, version: None, }; @@ -728,14 +728,15 @@ impl ObjectStore for MirroringObjectStore { async fn get_range( &self, location: &Path, - range: Range, + range: Range, ) -> object_store::Result { self.files.iter().find(|x| *x == location).unwrap(); let path = std::path::PathBuf::from(&self.mirrored_file); let mut file = File::open(path).unwrap(); - file.seek(SeekFrom::Start(range.start as u64)).unwrap(); + file.seek(SeekFrom::Start(range.start)).unwrap(); let to_read = range.end - range.start; + let to_read: usize = to_read.try_into().unwrap(); let mut data = Vec::with_capacity(to_read); let read = file.take(to_read as u64).read_to_end(&mut data).unwrap(); assert_eq!(read, to_read); @@ -750,9 +751,10 @@ impl ObjectStore for MirroringObjectStore { fn list( &self, prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + ) -> BoxStream<'static, object_store::Result> { let prefix = prefix.cloned().unwrap_or_default(); - Box::pin(stream::iter(self.files.iter().filter_map( + let size = self.file_size; + Box::pin(stream::iter(self.files.clone().into_iter().filter_map( move |location| { // Don't return for exact prefix match let filter = location @@ -762,9 +764,9 @@ impl ObjectStore for MirroringObjectStore { filter.then(|| { Ok(ObjectMeta { - location: location.clone(), + location, last_modified: Utc.timestamp_nanos(0), - size: self.file_size as usize, + size, e_tag: None, version: None, }) @@ -802,7 +804,7 @@ impl ObjectStore for MirroringObjectStore { let object = ObjectMeta { location: k.clone(), last_modified: Utc.timestamp_nanos(0), - size: self.file_size as usize, + size: self.file_size, e_tag: None, version: None, }; diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 9d5f9fa16b6eb..96b21c8703e73 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -1513,7 +1513,7 @@ mod tests { let object_meta = ObjectMeta { location: object_store::path::Path::parse(file_name).expect("creating path"), last_modified: chrono::DateTime::from(std::time::SystemTime::now()), - size: data.len(), + size: data.len() as u64, e_tag: None, version: None, }; @@ -1526,8 +1526,11 @@ mod tests { let metrics = ExecutionPlanMetricsSet::new(); let file_metrics = ParquetFileMetrics::new(0, object_meta.location.as_ref(), &metrics); + let inner = ParquetObjectReader::new(Arc::new(in_memory), object_meta.location) + .with_file_size(object_meta.size as usize); + let reader = ParquetFileReader { - inner: ParquetObjectReader::new(Arc::new(in_memory), object_meta), + inner, file_metrics: file_metrics.clone(), }; let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap(); diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index f6546ff3f2a64..bcd67bac59aeb 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -441,7 +441,7 @@ impl TryFrom<&PartitionedFile> for protobuf::PartitionedFile { })? as u64; Ok(protobuf::PartitionedFile { path: pf.object_meta.location.as_ref().to_owned(), - size: pf.object_meta.size as u64, + size: pf.object_meta.size, last_modified_ns, partition_values: pf .partition_values diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index 9ba0e0c964e9e..cb725a7277fd3 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -61,7 +61,7 @@ pub fn to_substrait_rel( substrait_files.push(FileOrFiles { partition_index: partition_index.try_into().unwrap(), start: 0, - length: file.object_meta.size as u64, + length: file.object_meta.size, path_type: Some(PathType::UriPath( file.object_meta.location.as_ref().to_string(), )), From a8b3c4dc03edc7a4b1811a3fe79e1ac9385fe015 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Apr 2025 15:35:30 -0400 Subject: [PATCH 14/20] update expected message --- datafusion/core/src/datasource/file_format/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index e921f0158e540..ad8c0bdb5680e 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -127,7 +127,7 @@ mod tests { .write_parquet(out_dir_url, DataFrameWriteOptions::new(), None) .await .expect_err("should fail because input file does not match inferred schema"); - assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value d for column 0 at line 4"); + assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"); Ok(()) } } From 84725ff50982f1165fc24ac7faa3ccce3c3237b2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 8 Apr 2025 09:24:08 -0400 Subject: [PATCH 15/20] update error messages --- datafusion/core/src/datasource/physical_plan/csv.rs | 2 +- datafusion/core/src/datasource/physical_plan/json.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 5914924797dce..3ef4030134520 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -658,7 +658,7 @@ mod tests { ) .await .expect_err("should fail because input file does not match inferred schema"); - assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value d for column 0 at line 4"); + assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"); Ok(()) } diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 910c4316d9734..736248fbd95df 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -495,7 +495,7 @@ mod tests { .write_json(out_dir_url, DataFrameWriteOptions::new(), None) .await .expect_err("should fail because input file does not match inferred schema"); - assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value d for column 0 at line 4"); + assert_eq!(e.strip_backtrace(), "Arrow error: Parser error: Error while parsing value 'd' as type 'Int64' for column 0 at line 4. Row data: '[d,4]'"); Ok(()) } From 9bfc8a324a571083a3d722b15f8d76d67e8eee10 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 8 Apr 2025 11:58:41 -0400 Subject: [PATCH 16/20] Update to apache --- Cargo.lock | 32 ++++++++++++++++---------------- Cargo.toml | 27 +++++++++++++-------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b6d474b57e3d..51cb3411ce97a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,7 +247,7 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-arith", "arrow-array", @@ -270,7 +270,7 @@ dependencies = [ [[package]] name = "arrow-arith" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,7 +283,7 @@ dependencies = [ [[package]] name = "arrow-array" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "ahash 0.8.11", "arrow-buffer", @@ -299,7 +299,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "bytes", "half", @@ -309,7 +309,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-buffer", @@ -329,7 +329,7 @@ dependencies = [ [[package]] name = "arrow-csv" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-cast", @@ -344,7 +344,7 @@ dependencies = [ [[package]] name = "arrow-data" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-buffer", "arrow-schema", @@ -355,7 +355,7 @@ dependencies = [ [[package]] name = "arrow-flight" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-arith", "arrow-array", @@ -381,7 +381,7 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-buffer", @@ -394,7 +394,7 @@ dependencies = [ [[package]] name = "arrow-json" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-buffer", @@ -415,7 +415,7 @@ dependencies = [ [[package]] name = "arrow-ord" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-buffer", @@ -427,7 +427,7 @@ dependencies = [ [[package]] name = "arrow-row" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-buffer", @@ -439,7 +439,7 @@ dependencies = [ [[package]] name = "arrow-schema" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "bitflags 2.8.0", "serde", @@ -448,7 +448,7 @@ dependencies = [ [[package]] name = "arrow-select" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -461,7 +461,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "arrow-array", "arrow-buffer", @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "parquet" version = "55.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?branch=alamb/prepare_for_55.0.0#8c911537c385c60bd6870206f5267735d6223007" +source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" dependencies = [ "ahash 0.8.11", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index 86f81f379ea7d..05bf187e5de72 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -214,20 +214,19 @@ used_underscore_binding = "warn" unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_qualifications = "deny" - -## Temporary arrow-rs patch until 55 +## Temporary arrow-rs patch until 55 is released ## https://github.com/apache/arrow-rs/pull/7391 [patch.crates-io] -arrow = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-array = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-buffer = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-cast = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-data = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-ipc = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-schema = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-select = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-string = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-ord = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } -parquet = { git = "https://github.com/alamb/arrow-rs.git", branch = "alamb/prepare_for_55.0.0" } +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } From 3d646d7a4784663e6de1ddd022f2b1cc7fac1442 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 8 Apr 2025 10:41:45 -0400 Subject: [PATCH 17/20] Update API for nicer parquet u64s --- .../examples/advanced_parquet_index.rs | 6 +++--- datafusion/core/tests/parquet/custom_reader.rs | 6 +++--- datafusion/datasource-parquet/src/file_format.rs | 8 ++------ datafusion/datasource-parquet/src/reader.rs | 13 +++++++------ .../datasource-parquet/src/row_group_filter.rs | 2 +- 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 113c43dc13236..03ef3d66f9d71 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -573,7 +573,7 @@ impl ParquetFileReaderFactory for CachedParquetFileReaderFactory { let object_store = Arc::clone(&self.object_store); let mut inner = ParquetObjectReader::new(object_store, file_meta.object_meta.location) - .with_file_size(file_meta.object_meta.size as usize); + .with_file_size(file_meta.object_meta.size); if let Some(hint) = metadata_size_hint { inner = inner.with_footer_size_hint(hint) @@ -601,7 +601,7 @@ struct ParquetReaderWithCache { impl AsyncFileReader for ParquetReaderWithCache { fn get_bytes( &mut self, - range: Range, + range: Range, ) -> BoxFuture<'_, datafusion::parquet::errors::Result> { println!("get_bytes: {} Reading range {:?}", self.filename, range); self.inner.get_bytes(range) @@ -609,7 +609,7 @@ impl AsyncFileReader for ParquetReaderWithCache { fn get_byte_ranges( &mut self, - ranges: Vec>, + ranges: Vec>, ) -> BoxFuture<'_, datafusion::parquet::errors::Result>> { println!( "get_byte_ranges: {} Reading ranges {:?}", diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index c06da2e75a461..761a78a29fd3a 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -219,11 +219,11 @@ struct ParquetFileReader { impl AsyncFileReader for ParquetFileReader { fn get_bytes( &mut self, - range: Range, + range: Range, ) -> BoxFuture<'_, parquet::errors::Result> { - self.metrics.bytes_scanned.add(range.end - range.start); + let bytes_scanned = range.end - range.start; + self.metrics.bytes_scanned.add(bytes_scanned as usize); - let range = range.start as u64..range.end as u64; self.store .get_range(&self.meta.location, range) .map_err(|e| { diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index f534d6769b9f1..7617d4d70ceed 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -735,11 +735,7 @@ impl<'a> ObjectStoreFetch<'a> { } impl MetadataFetch for ObjectStoreFetch<'_> { - fn fetch( - &mut self, - range: Range, - ) -> BoxFuture<'_, Result> { - let range = range.start as u64..range.end as u64; + fn fetch(&mut self, range: Range) -> BoxFuture<'_, Result> { async { self.store .get_range(&self.meta.location, range) @@ -766,7 +762,7 @@ pub async fn fetch_parquet_metadata( ParquetMetaDataReader::new() .with_prefetch_hint(size_hint) - .load_and_finish(fetch, file_size as usize) + .load_and_finish(fetch, file_size) .await .map_err(DataFusionError::from) } diff --git a/datafusion/datasource-parquet/src/reader.rs b/datafusion/datasource-parquet/src/reader.rs index 141b30d431379..27ec843c1991d 100644 --- a/datafusion/datasource-parquet/src/reader.rs +++ b/datafusion/datasource-parquet/src/reader.rs @@ -96,21 +96,22 @@ pub(crate) struct ParquetFileReader { impl AsyncFileReader for ParquetFileReader { fn get_bytes( &mut self, - range: Range, + range: Range, ) -> BoxFuture<'_, parquet::errors::Result> { - self.file_metrics.bytes_scanned.add(range.end - range.start); + let bytes_scanned = range.end - range.start; + self.file_metrics.bytes_scanned.add(bytes_scanned as usize); self.inner.get_bytes(range) } fn get_byte_ranges( &mut self, - ranges: Vec>, + ranges: Vec>, ) -> BoxFuture<'_, parquet::errors::Result>> where Self: Send, { - let total = ranges.iter().map(|r| r.end - r.start).sum(); - self.file_metrics.bytes_scanned.add(total); + let total: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + self.file_metrics.bytes_scanned.add(total as usize); self.inner.get_byte_ranges(ranges) } @@ -137,7 +138,7 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory { ); let store = Arc::clone(&self.store); let mut inner = ParquetObjectReader::new(store, file_meta.object_meta.location) - .with_file_size(file_meta.object_meta.size as usize); + .with_file_size(file_meta.object_meta.size); if let Some(hint) = metadata_size_hint { inner = inner.with_footer_size_hint(hint) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 96b21c8703e73..13418cdeee223 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -1527,7 +1527,7 @@ mod tests { let file_metrics = ParquetFileMetrics::new(0, object_meta.location.as_ref(), &metrics); let inner = ParquetObjectReader::new(Arc::new(in_memory), object_meta.location) - .with_file_size(object_meta.size as usize); + .with_file_size(object_meta.size); let reader = ParquetFileReader { inner, From 86aab05cf3b13a1eaf741c6a245447c15a541e11 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 8 Apr 2025 12:56:06 -0400 Subject: [PATCH 18/20] Fix wasm build --- datafusion/core/src/datasource/file_format/arrow.rs | 1 + datafusion/core/src/datasource/physical_plan/arrow_file.rs | 1 + datafusion/datasource-csv/src/source.rs | 1 + datafusion/datasource-json/src/file_format.rs | 1 + datafusion/datasource-json/src/source.rs | 1 + 5 files changed, 5 insertions(+) diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index ea027ae91a061..7fc27453d1ad5 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -144,6 +144,7 @@ impl FileFormat for ArrowFormat { for object in objects { let r = store.as_ref().get(&object.location).await?; let schema = match r.payload { + #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(mut file, _) => { let reader = FileReader::try_new(&mut file, None)?; reader.schema() diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 21d962e6d11fb..f0a1f94d87e1f 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -273,6 +273,7 @@ impl FileOpener for ArrowOpener { None => { let r = object_store.get(file_meta.location()).await?; match r.payload { + #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(file, _) => { let arrow_reader = arrow::ipc::reader::FileReader::try_new( file, projection, diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index 6db4d18703204..f5d45cd3fc881 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -704,6 +704,7 @@ impl FileOpener for CsvOpener { let result = store.get_opts(file_meta.location(), options).await?; match result.payload { + #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(mut file, _) => { let is_whole_file_scanned = file_meta.range.is_none(); let decoder = if is_whole_file_scanned { diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index a6c52312e4127..8d0515804fc7b 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -209,6 +209,7 @@ impl FileFormat for JsonFormat { let r = store.as_ref().get(&object.location).await?; let schema = match r.payload { + #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(file, _) => { let decoder = file_compression_type.convert_read(file)?; let mut reader = BufReader::new(decoder); diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs index f1adccf9ded7d..ee96d050966d6 100644 --- a/datafusion/datasource-json/src/source.rs +++ b/datafusion/datasource-json/src/source.rs @@ -355,6 +355,7 @@ impl FileOpener for JsonOpener { let result = store.get_opts(file_meta.location(), options).await?; match result.payload { + #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(mut file, _) => { let bytes = match file_meta.range { None => file_compression_type.convert_read(file)?, From 2a30ca3bbdffecc1cb828a7cda05adff84da2c85 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Apr 2025 13:27:17 -0400 Subject: [PATCH 19/20] Remove pin --- Cargo.lock | 52 ++++++++++++++++++++++++++++++++++------------------ Cargo.toml | 17 ----------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1444ea39e24aa..753bb53f2c352 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,7 +247,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3095aaf545942ff5abd46654534f15b03a90fba78299d661e045e5d587222f0d" dependencies = [ "arrow-arith", "arrow-array", @@ -270,7 +271,8 @@ dependencies = [ [[package]] name = "arrow-arith" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00752064ff47cee746e816ddb8450520c3a52cbad1e256f6fa861a35f86c45e7" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,7 +285,8 @@ dependencies = [ [[package]] name = "arrow-array" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cebfe926794fbc1f49ddd0cdaf898956ca9f6e79541efce62dabccfd81380472" dependencies = [ "ahash 0.8.11", "arrow-buffer", @@ -299,7 +302,8 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0303c7ec4cf1a2c60310fc4d6bbc3350cd051a17bf9e9c0a8e47b4db79277824" dependencies = [ "bytes", "half", @@ -309,7 +313,8 @@ dependencies = [ [[package]] name = "arrow-cast" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335f769c5a218ea823d3760a743feba1ef7857cba114c01399a891c2fff34285" dependencies = [ "arrow-array", "arrow-buffer", @@ -329,7 +334,8 @@ dependencies = [ [[package]] name = "arrow-csv" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "510db7dfbb4d5761826516cc611d97b3a68835d0ece95b034a052601109c0b1b" dependencies = [ "arrow-array", "arrow-cast", @@ -344,7 +350,8 @@ dependencies = [ [[package]] name = "arrow-data" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8affacf3351a24039ea24adab06f316ded523b6f8c3dbe28fbac5f18743451b" dependencies = [ "arrow-buffer", "arrow-schema", @@ -355,7 +362,8 @@ dependencies = [ [[package]] name = "arrow-flight" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e0fad280f41a918d53ba48288a246ff04202d463b3b380fbc0edecdcb52cfd" dependencies = [ "arrow-arith", "arrow-array", @@ -381,7 +389,8 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69880a9e6934d9cba2b8630dd08a3463a91db8693b16b499d54026b6137af284" dependencies = [ "arrow-array", "arrow-buffer", @@ -394,7 +403,8 @@ dependencies = [ [[package]] name = "arrow-json" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8dafd17a05449e31e0114d740530e0ada7379d7cb9c338fd65b09a8130960b0" dependencies = [ "arrow-array", "arrow-buffer", @@ -415,7 +425,8 @@ dependencies = [ [[package]] name = "arrow-ord" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "895644523af4e17502d42c3cb6b27cb820f0cb77954c22d75c23a85247c849e1" dependencies = [ "arrow-array", "arrow-buffer", @@ -427,7 +438,8 @@ dependencies = [ [[package]] name = "arrow-row" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9be8a2a4e5e7d9c822b2b8095ecd77010576d824f654d347817640acfc97d229" dependencies = [ "arrow-array", "arrow-buffer", @@ -439,7 +451,8 @@ dependencies = [ [[package]] name = "arrow-schema" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7450c76ab7c5a6805be3440dc2e2096010da58f7cab301fdc996a4ee3ee74e49" dependencies = [ "bitflags 2.8.0", "serde", @@ -448,7 +461,8 @@ dependencies = [ [[package]] name = "arrow-select" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa5f5a93c75f46ef48e4001535e7b6c922eeb0aa20b73cf58d09e13d057490d8" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -461,7 +475,8 @@ dependencies = [ [[package]] name = "arrow-string" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e7005d858d84b56428ba2a98a107fe88c0132c61793cf6b8232a1f9bfc0452b" dependencies = [ "arrow-array", "arrow-buffer", @@ -4368,7 +4383,8 @@ dependencies = [ [[package]] name = "parquet" version = "55.0.0" -source = "git+https://github.com/apache/arrow-rs.git?rev=9322547590ab32efeff8c0486e4a3a2cb5887a26#9322547590ab32efeff8c0486e4a3a2cb5887a26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd31a8290ac5b19f09ad77ee7a1e6a541f1be7674ad410547d5f1eef6eef4a9c" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -4769,7 +4785,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck 0.5.0", - "itertools 0.14.0", + "itertools 0.13.0", "log", "multimap", "once_cell", @@ -4789,7 +4805,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.100", diff --git a/Cargo.toml b/Cargo.toml index 05bf187e5de72..de53b7df50d98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -213,20 +213,3 @@ used_underscore_binding = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_qualifications = "deny" - -## Temporary arrow-rs patch until 55 is released -## https://github.com/apache/arrow-rs/pull/7391 - -[patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-cast = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } -parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "9322547590ab32efeff8c0486e4a3a2cb5887a26" } From 1f0711e6f33903226d835808e6b178032f38e178 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Apr 2025 13:33:08 -0400 Subject: [PATCH 20/20] Fix signature --- datafusion/core/tests/tracing/traceable_object_store.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/tracing/traceable_object_store.rs b/datafusion/core/tests/tracing/traceable_object_store.rs index e979200c8d9bf..dfcafc3a63da1 100644 --- a/datafusion/core/tests/tracing/traceable_object_store.rs +++ b/datafusion/core/tests/tracing/traceable_object_store.rs @@ -96,7 +96,7 @@ impl ObjectStore for TraceableObjectStore { fn list( &self, prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + ) -> BoxStream<'static, object_store::Result> { futures::executor::block_on(assert_traceability()); self.inner.list(prefix) }