Skip to content

Commit 980c948

Browse files
authored
Upgrade to arrow 56.1.0 (#17275)
* Update to arrow/parquet 56.1.0
* Adjust for new parquet sizes, update for deprecated API
* Thread through max_predicate_cache_size, add test
1 parent 4528f2f commit 980c948

File tree

21 files changed

+311
-69
lines changed

21 files changed

+311
-69
lines changed

Cargo.lock

Lines changed: 35 additions & 34 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,19 +90,19 @@ ahash = { version = "0.8", default-features = false, features = [
9090
"runtime-rng",
9191
] }
9292
apache-avro = { version = "0.20", default-features = false }
93-
arrow = { version = "56.0.0", features = [
93+
arrow = { version = "56.1.0", features = [
9494
"prettyprint",
9595
"chrono-tz",
9696
] }
97-
arrow-buffer = { version = "56.0.0", default-features = false }
98-
arrow-flight = { version = "56.0.0", features = [
97+
arrow-buffer = { version = "56.1.0", default-features = false }
98+
arrow-flight = { version = "56.1.0", features = [
9999
"flight-sql-experimental",
100100
] }
101-
arrow-ipc = { version = "56.0.0", default-features = false, features = [
101+
arrow-ipc = { version = "56.1.0", default-features = false, features = [
102102
"lz4",
103103
] }
104-
arrow-ord = { version = "56.0.0", default-features = false }
105-
arrow-schema = { version = "56.0.0", default-features = false }
104+
arrow-ord = { version = "56.1.0", default-features = false }
105+
arrow-schema = { version = "56.1.0", default-features = false }
106106
async-trait = "0.1.89"
107107
bigdecimal = "0.4.8"
108108
bytes = "1.10"
@@ -157,7 +157,7 @@ itertools = "0.14"
157157
log = "^0.4"
158158
object_store = { version = "0.12.3", default-features = false }
159159
parking_lot = "0.12"
160-
parquet = { version = "56.0.0", default-features = false, features = [
160+
parquet = { version = "56.1.0", default-features = false, features = [
161161
"arrow",
162162
"async",
163163
"object_store",

datafusion-cli/src/main.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -571,15 +571,15 @@ mod tests {
571571
let df = ctx.sql(sql).await?;
572572
let rbs = df.collect().await?;
573573

574-
assert_snapshot!(batches_to_string(&rbs),@r#"
574+
assert_snapshot!(batches_to_string(&rbs),@r"
575575
+-----------------------------------+-----------------+---------------------+------+------------------+
576576
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
577577
+-----------------------------------+-----------------+---------------------+------+------------------+
578578
| alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false |
579-
| alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true |
579+
| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
580580
| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false |
581581
+-----------------------------------+-----------------+---------------------+------+------------------+
582-
"#);
582+
");
583583

584584
// increase the number of hits
585585
ctx.sql("select * from alltypes_plain")
@@ -602,15 +602,15 @@ mod tests {
602602
let df = ctx.sql(sql).await?;
603603
let rbs = df.collect().await?;
604604

605-
assert_snapshot!(batches_to_string(&rbs),@r#"
605+
assert_snapshot!(batches_to_string(&rbs),@r"
606606
+-----------------------------------+-----------------+---------------------+------+------------------+
607607
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
608608
+-----------------------------------+-----------------+---------------------+------+------------------+
609609
| alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false |
610-
| alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true |
610+
| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
611611
| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false |
612612
+-----------------------------------+-----------------+---------------------+------+------------------+
613-
"#);
613+
");
614614

615615
Ok(())
616616
}

datafusion/common/src/config.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,14 @@ config_namespace! {
566566
/// (reading) Use any available bloom filters when reading parquet files
567567
pub bloom_filter_on_read: bool, default = true
568568

569+
/// (reading) The maximum predicate cache size, in bytes. When
570+
/// `pushdown_filters` is enabled, sets the maximum memory used to cache
571+
/// the results of predicate evaluation between filter evaluation and
572+
/// output generation. Decreasing this value will reduce memory usage,
573+
/// but may increase IO and CPU usage. None means use the default
574+
/// parquet reader setting. 0 means no caching.
575+
pub max_predicate_cache_size: Option<usize>, default = None
576+
569577
// The following options affect writing to parquet files
570578
// and map to parquet::file::properties::WriterProperties
571579

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ impl ParquetOptions {
208208
binary_as_string: _, // not used for writer props
209209
coerce_int96: _, // not used for writer props
210210
skip_arrow_metadata: _,
211+
max_predicate_cache_size: _,
211212
} = self;
212213

213214
let mut builder = WriterProperties::builder()
@@ -400,6 +401,10 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result<EnabledStatis
400401
#[cfg(feature = "parquet")]
401402
#[cfg(test)]
402403
mod tests {
404+
use super::*;
405+
use crate::config::{ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions};
406+
#[cfg(feature = "parquet_encryption")]
407+
use crate::encryption::map_encryption_to_config_encryption;
403408
use parquet::{
404409
basic::Compression,
405410
file::properties::{
@@ -409,11 +414,6 @@ mod tests {
409414
};
410415
use std::collections::HashMap;
411416

412-
use super::*;
413-
use crate::config::{ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions};
414-
#[cfg(feature = "parquet_encryption")]
415-
use crate::encryption::map_encryption_to_config_encryption;
416-
417417
const COL_NAME: &str = "configured";
418418

419419
/// Take the column defaults provided in [`ParquetOptions`], and generate a non-default col config.
@@ -475,6 +475,7 @@ mod tests {
475475
binary_as_string: defaults.binary_as_string,
476476
skip_arrow_metadata: defaults.skip_arrow_metadata,
477477
coerce_int96: None,
478+
max_predicate_cache_size: defaults.max_predicate_cache_size,
478479
}
479480
}
480481

@@ -581,6 +582,8 @@ mod tests {
581582
maximum_buffered_record_batches_per_stream: global_options_defaults
582583
.maximum_buffered_record_batches_per_stream,
583584
bloom_filter_on_read: global_options_defaults.bloom_filter_on_read,
585+
max_predicate_cache_size: global_options_defaults
586+
.max_predicate_cache_size,
584587
schema_force_view_types: global_options_defaults.schema_force_view_types,
585588
binary_as_string: global_options_defaults.binary_as_string,
586589
skip_arrow_metadata: global_options_defaults.skip_arrow_metadata,

0 commit comments

Comments
 (0)