-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Remove redundant collect_stat and target_partitions on ListingOptions
#22969
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -533,15 +533,15 @@ impl TableProvider for ListingTable { | |
| &self.table_schema, | ||
| &partitioned_file_lists, | ||
| output_ordering, | ||
| self.options.target_partitions, | ||
| state.config().target_partitions(), | ||
| ) | ||
| }) | ||
| }) | ||
| .flatten() | ||
| { | ||
| Some(Err(e)) => log::debug!("failed to split file groups by statistics: {e}"), | ||
| Some(Ok(new_groups)) => { | ||
| if new_groups.len() <= self.options.target_partitions { | ||
| if new_groups.len() <= state.config().target_partitions() { | ||
| partitioned_file_lists = new_groups; | ||
| } else { | ||
| log::debug!( | ||
|
|
@@ -724,7 +724,7 @@ impl ListingTable { | |
| let files = file_list | ||
| .map(|part_file| async { | ||
| let part_file = part_file?; | ||
| let (statistics, ordering) = if self.options.collect_stat { | ||
| let (statistics, ordering) = if ctx.config().collect_statistics() { | ||
| self.do_collect_statistics_and_ordering(ctx, &store, &part_file) | ||
| .await? | ||
| } else { | ||
|
|
@@ -738,7 +738,7 @@ impl ListingTable { | |
| .buffer_unordered(ctx.config_options().execution.meta_fetch_concurrency); | ||
|
|
||
| let (file_group, inexact_stats) = | ||
| get_files_with_limit(files, limit, self.options.collect_stat).await?; | ||
| get_files_with_limit(files, limit, ctx.config().collect_statistics()).await?; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I double checked that this is just a reference (not a deep copy) |
||
|
|
||
| // Threshold: 0 = disabled, N > 0 = enabled when distinct_keys >= N | ||
| // | ||
|
|
@@ -747,32 +747,32 @@ impl ListingTable { | |
| // hash repartitioning for aggregates and joins on partition columns. | ||
| let threshold = ctx.config_options().optimizer.preserve_file_partitions; | ||
|
|
||
| let (file_groups, grouped_by_partition) = if threshold > 0 | ||
| && !self.options.table_partition_cols.is_empty() | ||
| { | ||
| let grouped = | ||
| file_group.group_by_partition_values(self.options.target_partitions); | ||
| if grouped.len() >= threshold { | ||
| (grouped, true) | ||
| let (file_groups, grouped_by_partition) = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it is weird that rustfmt reformattted this as it lookslike the code is mostly the same up here
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. indeed, it shifted this whole block even though only line changed 🤷 |
||
| if threshold > 0 && !self.options.table_partition_cols.is_empty() { | ||
| let grouped = file_group | ||
| .group_by_partition_values(ctx.config().target_partitions()); | ||
| if grouped.len() >= threshold { | ||
| (grouped, true) | ||
| } else { | ||
| let all_files: Vec<_> = | ||
| grouped.into_iter().flat_map(|g| g.into_inner()).collect(); | ||
| ( | ||
| FileGroup::new(all_files) | ||
| .split_files(ctx.config().target_partitions()), | ||
| false, | ||
| ) | ||
| } | ||
| } else { | ||
| let all_files: Vec<_> = | ||
| grouped.into_iter().flat_map(|g| g.into_inner()).collect(); | ||
| ( | ||
| FileGroup::new(all_files).split_files(self.options.target_partitions), | ||
| file_group.split_files(ctx.config().target_partitions()), | ||
| false, | ||
| ) | ||
| } | ||
| } else { | ||
| ( | ||
| file_group.split_files(self.options.target_partitions), | ||
| false, | ||
| ) | ||
| }; | ||
| }; | ||
|
|
||
| let (file_groups, stats) = compute_all_files_statistics( | ||
| file_groups, | ||
| self.schema(), | ||
| self.options.collect_stat, | ||
| ctx.config().collect_statistics(), | ||
| inexact_stats, | ||
| )?; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
technically speaking I can see someone wanting to wanting to control statistics collection on a per table (rather than a per session) basis -- I think you could still get the same effect by changing the session config settings before creating the table, rather than explicitly setting it on the ListingOptions
So while this is a brekaing change I think users can use the same pattern
I think the note in the upgrade guide looks good to me