1 change: 1 addition & 0 deletions Cargo.toml
@@ -154,4 +154,5 @@ rpath = false
large_futures = "warn"

[workspace.lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
unused_imports = "deny"
1 change: 1 addition & 0 deletions datafusion/common/Cargo.toml
@@ -39,6 +39,7 @@ path = "src/lib.rs"
avro = ["apache-avro"]
backtrace = []
pyarrow = ["pyo3", "arrow/pyarrow", "parquet"]
force_hash_collisions = []

[dependencies]
ahash = { workspace = true }
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
@@ -1207,7 +1207,7 @@ impl ConfigField for TableOptions {
/// # Parameters
///
/// * `key`: The configuration key specifying which setting to adjust, prefixed with the format (e.g., "format.delimiter")
/// for CSV format.
/// for CSV format.
/// * `value`: The value to set for the specified configuration key.
///
/// # Returns
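The doc comment above describes the key/value setter on `TableOptions`, where format-specific keys carry a `format.` prefix. A minimal hedged sketch of how such a call might look; the `Default` construction, the `set_config_format` call, and the `ConfigFileType` variant are assumptions made for illustration, not taken from this diff:

```rust
use datafusion_common::config::{ConfigFileType, TableOptions};
use datafusion_common::Result;

fn set_csv_delimiter() -> Result<()> {
    let mut options = TableOptions::default();
    // Select a format first so that "format."-prefixed keys know where to go,
    // then set the CSV delimiter through the documented key.
    options.set_config_format(ConfigFileType::CSV);
    options.set("format.delimiter", ";")?;
    Ok(())
}
```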
20 changes: 18 additions & 2 deletions datafusion/common/src/hash_utils.rs
@@ -17,22 +17,27 @@

//! Functionality used both on logical and physical plans

#[cfg(not(feature = "force_hash_collisions"))]
use std::sync::Arc;

use ahash::RandomState;
use arrow::array::*;
use arrow::datatypes::*;
use arrow::row::Rows;
#[cfg(not(feature = "force_hash_collisions"))]
use arrow::{downcast_dictionary_array, downcast_primitive_array};
use arrow_buffer::IntervalDayTime;
use arrow_buffer::IntervalMonthDayNano;

#[cfg(not(feature = "force_hash_collisions"))]
use crate::cast::{
as_boolean_array, as_fixed_size_list_array, as_generic_binary_array,
as_large_list_array, as_list_array, as_map_array, as_primitive_array,
as_string_array, as_struct_array,
};
use crate::error::{Result, _internal_err};
use crate::error::Result;
#[cfg(not(feature = "force_hash_collisions"))]
use crate::error::_internal_err;

// Combines two hashes into one hash
#[inline]
@@ -41,6 +46,7 @@ pub fn combine_hashes(l: u64, r: u64) -> u64 {
hash.wrapping_mul(37).wrapping_add(r)
}

#[cfg(not(feature = "force_hash_collisions"))]
fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
if mul_col {
hashes_buffer.iter_mut().for_each(|hash| {
@@ -90,6 +96,7 @@ hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
/// Builds hash values of PrimitiveArray and writes them into `hashes_buffer`
/// If `rehash==true` this combines the previous hash value in the buffer
/// with the new hash using `combine_hashes`
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_array_primitive<T>(
array: &PrimitiveArray<T>,
random_state: &RandomState,
@@ -135,6 +142,7 @@ fn hash_array_primitive<T>(
/// Hashes one array into the `hashes_buffer`
/// If `rehash==true` this combines the previous hash value in the buffer
/// with the new hash using `combine_hashes`
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_array<T>(
array: T,
random_state: &RandomState,
@@ -180,6 +188,7 @@ }
}

/// Hash the values in a dictionary array
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_dictionary<K: ArrowDictionaryKeyType>(
array: &DictionaryArray<K>,
random_state: &RandomState,
@@ -210,6 +219,7 @@ fn hash_dictionary<K: ArrowDictionaryKeyType>(
Ok(())
}

#[cfg(not(feature = "force_hash_collisions"))]
fn hash_struct_array(
array: &StructArray,
random_state: &RandomState,
@@ -270,6 +280,7 @@ fn hash_map_array(
Ok(())
}

#[cfg(not(feature = "force_hash_collisions"))]
fn hash_list_array<OffsetSize>(
array: &GenericListArray<OffsetSize>,
random_state: &RandomState,
@@ -303,6 +314,7 @@ where
Ok(())
}

#[cfg(not(feature = "force_hash_collisions"))]
fn hash_fixed_list_array(
array: &FixedSizeListArray,
random_state: &RandomState,
@@ -488,7 +500,11 @@ pub fn create_row_hashes_v2<'a>(

#[cfg(test)]
mod tests {
use arrow::{array::*, datatypes::*};
use std::sync::Arc;

use arrow::array::*;
#[cfg(not(feature = "force_hash_collisions"))]
use arrow::datatypes::*;

use super::*;

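All of the `#[cfg(not(feature = "force_hash_collisions"))]` attributes added above exist so that a test-only sibling implementation can take over when the feature is enabled. A minimal sketch of what such a collision-forcing variant can look like; the exact body and visibility in DataFusion may differ, this only illustrates the cfg pairing:

```rust
use ahash::RandomState;
use arrow::array::ArrayRef;
use datafusion_common::Result;

/// Test-only stand-in: with `force_hash_collisions` enabled, every row hashes
/// to the same value, so hash-collision handling paths are exercised.
#[cfg(feature = "force_hash_collisions")]
pub fn create_hashes<'a>(
    _arrays: &[ArrayRef],
    _random_state: &RandomState,
    hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
    for hash in hashes_buffer.iter_mut() {
        *hash = 0;
    }
    Ok(hashes_buffer)
}
```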
8 changes: 4 additions & 4 deletions datafusion/common/src/tree_node.rs
@@ -43,14 +43,14 @@ macro_rules! handle_transform_recursion {
/// There are three categories of TreeNode APIs:
///
/// 1. "Inspecting" APIs to traverse a tree of `&TreeNodes`:
/// [`apply`], [`visit`], [`exists`].
/// [`apply`], [`visit`], [`exists`].
///
/// 2. "Transforming" APIs that traverse and consume a tree of `TreeNode`s
/// producing possibly changed `TreeNode`s: [`transform`], [`transform_up`],
/// [`transform_down`], [`transform_down_up`], and [`rewrite`].
/// producing possibly changed `TreeNode`s: [`transform`], [`transform_up`],
/// [`transform_down`], [`transform_down_up`], and [`rewrite`].
///
/// 3. Internal APIs used to implement the `TreeNode` API: [`apply_children`],
/// and [`map_children`].
/// and [`map_children`].
///
/// | Traversal Order | Inspecting | Transforming |
/// | --- | --- | --- |
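As a concrete illustration of the "transforming" category listed above, here is a hedged sketch of rewriting an expression tree bottom-up with `transform_up`; the `Expr` usage, the `Transformed::yes`/`Transformed::no` helpers, and the `data` field are assumed from the general `TreeNode` API rather than shown in this diff:

```rust
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::Result;
use datafusion_expr::{lit, Expr};

/// Replace every literal in the expression with the literal 42, bottom-up.
fn rewrite_literals(expr: Expr) -> Result<Expr> {
    let transformed = expr.transform_up(|e| {
        Ok(match e {
            Expr::Literal(_) => Transformed::yes(lit(42i64)),
            other => Transformed::no(other),
        })
    })?;
    // `transformed.data` carries the possibly rewritten tree.
    Ok(transformed.data)
}
```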
2 changes: 1 addition & 1 deletion datafusion/common/src/utils/memory.rs
@@ -24,7 +24,7 @@ use crate::{DataFusionError, Result};
/// # Parameters
/// - `num_elements`: The number of elements expected in the hash table.
/// - `fixed_size`: A fixed overhead size associated with the collection
/// (e.g., HashSet or HashTable).
/// (e.g., HashSet or HashTable).
/// - `T`: The type of elements stored in the hash table.
///
/// # Details
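The parameters above feed a hashbrown-style capacity estimate. A rough, self-contained sketch of that kind of calculation; the 7/8 load factor and the one control byte per bucket are assumptions about hashbrown's layout, and DataFusion's actual helper may use a different formula:

```rust
use std::mem::size_of;

/// Rough upper bound on the memory used by a hash table holding
/// `num_elements` values of type `T`, plus a fixed per-collection overhead.
fn rough_hash_table_estimate<T>(num_elements: usize, fixed_size: usize) -> usize {
    // hashbrown resizes at roughly 7/8 occupancy, so round the bucket count
    // up to the next power of two that keeps the load factor under that.
    let estimated_buckets = ((num_elements * 8) / 7).next_power_of_two();
    // Each bucket stores one `T` plus one control byte.
    fixed_size + estimated_buckets * (size_of::<T>() + 1)
}

fn main() {
    // E.g. a HashSet<u64>-like table with ~1000 entries and 48 bytes of
    // fixed overhead.
    println!("{} bytes", rough_hash_table_estimate::<u64>(1000, 48));
}
```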
2 changes: 1 addition & 1 deletion datafusion/common/src/utils/mod.rs
@@ -335,7 +335,7 @@ pub fn get_at_indices<T: Clone, I: Borrow<usize>>(
/// This function finds the longest prefix of the form 0, 1, 2, ... within the
/// collection `sequence`. Examples:
/// - For 0, 1, 2, 4, 5; we would produce 3, meaning 0, 1, 2 is the longest satisfying
/// prefix.
/// prefix.
/// - For 1, 2, 3, 4; we would produce 0, meaning there is no such prefix.
pub fn longest_consecutive_prefix<T: Borrow<usize>>(
sequence: impl IntoIterator<Item = T>,
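A minimal sketch that matches the documented behavior of this helper (it mirrors the examples above; the real implementation may be written differently):

```rust
use std::borrow::Borrow;

/// Length of the longest prefix of `sequence` equal to 0, 1, 2, ...
fn longest_consecutive_prefix<T: Borrow<usize>>(
    sequence: impl IntoIterator<Item = T>,
) -> usize {
    let mut count = 0;
    for item in sequence {
        if *item.borrow() != count {
            break;
        }
        count += 1;
    }
    count
}

fn main() {
    assert_eq!(longest_consecutive_prefix([0usize, 1, 2, 4, 5]), 3);
    assert_eq!(longest_consecutive_prefix([1usize, 2, 3, 4]), 0);
}
```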
2 changes: 1 addition & 1 deletion datafusion/core/Cargo.toml
@@ -62,7 +62,7 @@ default = [
]
encoding_expressions = ["datafusion-functions/encoding_expressions"]
# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
force_hash_collisions = []
force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"]
math_expressions = ["datafusion-functions/math_expressions"]
parquet = ["datafusion-common/parquet", "dep:parquet"]
pyarrow = ["datafusion-common/pyarrow", "parquet"]
2 changes: 1 addition & 1 deletion datafusion/core/benches/sort.rs
@@ -21,7 +21,7 @@
//! 1. Creates a list of tuples (sorted if necessary)
//!
//! 2. Divides those tuples across some number of streams of [`RecordBatch`]
//! preserving any ordering
//! preserving any ordering
//!
//! 3. Times how long it takes for a given sort plan to process the input
//!
4 changes: 2 additions & 2 deletions datafusion/core/src/catalog/mod.rs
@@ -141,12 +141,12 @@ pub trait CatalogList: CatalogProviderList {}
/// Here are some examples of how to implement custom catalogs:
///
/// * [`datafusion-cli`]: [`DynamicFileCatalogProvider`] catalog provider
/// that treats files and directories on a filesystem as tables.
/// that treats files and directories on a filesystem as tables.
///
/// * The [`catalog.rs`]: a simple directory based catalog.
///
/// * [delta-rs]: [`UnityCatalogProvider`] implementation that can
/// read from Delta Lake tables
/// read from Delta Lake tables
///
/// [`datafusion-cli`]: https://datafusion.apache.org/user-guide/cli/index.html
/// [`DynamicFileCatalogProvider`]: https://github.com/apache/datafusion/blob/31b9b48b08592b7d293f46e75707aad7dadd7cbc/datafusion-cli/src/catalog.rs#L75
6 changes: 3 additions & 3 deletions datafusion/core/src/dataframe/mod.rs
@@ -114,15 +114,15 @@ impl Default for DataFrameWriteOptions {
/// The typical workflow using DataFrames looks like
///
/// 1. Create a DataFrame via methods on [SessionContext], such as [`read_csv`]
/// and [`read_parquet`].
/// and [`read_parquet`].
///
/// 2. Build a desired calculation by calling methods such as [`filter`],
/// [`select`], [`aggregate`], and [`limit`]
/// [`select`], [`aggregate`], and [`limit`]
///
/// 3. Execute into [`RecordBatch`]es by calling [`collect`]
///
/// A `DataFrame` is a wrapper around a [`LogicalPlan`] and the [`SessionState`]
/// required for execution.
/// required for execution.
///
/// DataFrames are "lazy" in the sense that most methods do not actually compute
/// anything, they just build up a plan. Calling [`collect`] executes the plan
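The three workflow steps above map directly onto code. A hedged sketch; the file path and column names are placeholders, and the tokio runtime is assumed as in DataFusion's own examples:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // 1. Create a DataFrame from a data source via the SessionContext.
    let ctx = SessionContext::new();
    let df = ctx.read_csv("example.csv", CsvReadOptions::new()).await?;

    // 2. Lazily build up the desired calculation.
    let df = df
        .filter(col("a").gt(lit(10)))?
        .select(vec![col("a"), col("b")])?
        .limit(0, Some(100))?;

    // 3. Execute the plan and collect the resulting RecordBatches.
    let batches = df.collect().await?;
    println!("{} batches", batches.len());
    Ok(())
}
```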
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/listing/helpers.rs
@@ -49,7 +49,7 @@ use object_store::{ObjectMeta, ObjectStore};
/// This means that if this function returns true:
/// - the table provider can filter the table partition values with this expression
/// - the expression can be marked as `TableProviderFilterPushDown::Exact` once this filtering
/// was performed
/// was performed
pub fn expr_applicable_for_cols(col_names: &[String], expr: &Expr) -> bool {
let mut is_applicable = true;
expr.apply(|expr| {
14 changes: 7 additions & 7 deletions datafusion/core/src/datasource/listing/table.rs
@@ -287,17 +287,17 @@ impl ListingOptions {
///# Notes
///
/// - If only one level (e.g. `year` in the example above) is
/// specified, the other levels are ignored but the files are
/// still read.
/// specified, the other levels are ignored but the files are
/// still read.
///
/// - Files that don't follow this partitioning scheme will be
/// ignored.
/// ignored.
///
/// - Since the columns have the same value for all rows read from
/// each individual file (such as dates), they are typically
/// dictionary encoded for efficiency. You may use
/// [`wrap_partition_type_in_dict`] to request a
/// dictionary-encoded type.
/// each individual file (such as dates), they are typically
/// dictionary encoded for efficiency. You may use
/// [`wrap_partition_type_in_dict`] to request a
/// dictionary-encoded type.
///
/// - The partition columns are solely extracted from the file path. Especially they are NOT part of the parquet files itself.
///
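To make the partition-column notes above concrete, here is a hedged sketch of declaring hive-style partition columns on a listing table. The `year`/`month` names and the `.parquet` extension are placeholders, and plain `DataType::Utf8` is used for brevity even though the doc above recommends a dictionary-encoded type via `wrap_partition_type_in_dict`:

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::DataType;
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;

fn partitioned_listing_options() -> ListingOptions {
    // Files laid out as /table/year=2024/month=06/data.parquet are exposed
    // with extra `year` and `month` columns taken from the file path.
    ListingOptions::new(Arc::new(ParquetFormat::default()))
        .with_file_extension(".parquet")
        .with_table_partition_cols(vec![
            ("year".to_string(), DataType::Utf8),
            ("month".to_string(), DataType::Utf8),
        ])
}
```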
40 changes: 20 additions & 20 deletions datafusion/core/src/datasource/physical_plan/parquet/mod.rs
@@ -119,32 +119,32 @@ pub use writer::plan_to_parquet;
/// Supports the following optimizations:
///
/// * Concurrent reads: Can read from one or more files in parallel as multiple
/// partitions, including concurrently reading multiple row groups from a single
/// file.
/// partitions, including concurrently reading multiple row groups from a single
/// file.
///
/// * Predicate push down: skips row groups and pages based on
/// min/max/null_counts in the row group metadata, the page index and bloom
/// filters.
/// min/max/null_counts in the row group metadata, the page index and bloom
/// filters.
///
/// * Projection pushdown: reads and decodes only the columns required.
///
/// * Limit pushdown: stop execution early after some number of rows are read.
///
/// * Custom readers: customize reading parquet files, e.g. to cache metadata,
/// coalesce I/O operations, etc. See [`ParquetFileReaderFactory`] for more
/// details.
/// coalesce I/O operations, etc. See [`ParquetFileReaderFactory`] for more
/// details.
///
/// * Schema adapters: read parquet files with different schemas into a unified
/// table schema. This can be used to implement "schema evolution". See
/// [`SchemaAdapterFactory`] for more details.
/// table schema. This can be used to implement "schema evolution". See
/// [`SchemaAdapterFactory`] for more details.
///
/// * metadata_size_hint: controls the number of bytes read from the end of the
/// file in the initial I/O when the default [`ParquetFileReaderFactory`]. If a
/// custom reader is used, it supplies the metadata directly and this parameter
/// is ignored. [`ParquetExecBuilder::with_metadata_size_hint`] for more details.
/// file in the initial I/O when the default [`ParquetFileReaderFactory`]. If a
/// custom reader is used, it supplies the metadata directly and this parameter
/// is ignored. [`ParquetExecBuilder::with_metadata_size_hint`] for more details.
///
/// * User provided [`ParquetAccessPlan`]s to skip row groups and/or pages
/// based on external information. See "Implementing External Indexes" below
/// based on external information. See "Implementing External Indexes" below
///
/// # Implementing External Indexes
///
@@ -191,22 +191,22 @@ pub use writer::plan_to_parquet;
/// # Execution Overview
///
/// * Step 1: [`ParquetExec::execute`] is called, returning a [`FileStream`]
/// configured to open parquet files with a [`ParquetOpener`].
/// configured to open parquet files with a [`ParquetOpener`].
///
/// * Step 2: When the stream is polled, the [`ParquetOpener`] is called to open
/// the file.
/// the file.
///
/// * Step 3: The `ParquetOpener` gets the [`ParquetMetaData`] (file metadata)
/// via [`ParquetFileReaderFactory`], creating a [`ParquetAccessPlan`] by
/// applying predicates to metadata. The plan and projections are used to
/// determine what pages must be read.
/// via [`ParquetFileReaderFactory`], creating a [`ParquetAccessPlan`] by
/// applying predicates to metadata. The plan and projections are used to
/// determine what pages must be read.
///
/// * Step 4: The stream begins reading data, fetching the required pages
/// and incrementally decoding them.
/// and incrementally decoding them.
///
/// * Step 5: As each [`RecordBatch]` is read, it may be adapted by a
/// [`SchemaAdapter`] to match the table schema. By default missing columns are
/// filled with nulls, but this can be customized via [`SchemaAdapterFactory`].
/// [`SchemaAdapter`] to match the table schema. By default missing columns are
/// filled with nulls, but this can be customized via [`SchemaAdapterFactory`].
///
/// [`RecordBatch`]: arrow::record_batch::RecordBatch
/// [`SchemaAdapter`]: crate::datasource::schema_adapter::SchemaAdapter
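The optimizations listed above are applied when a parquet scan is planned, without any special configuration from the caller. A hedged, high-level sketch of triggering such a scan; the file path and filter are placeholders, and this goes through `SessionContext` rather than constructing `ParquetExec` directly:

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // Planning this query produces a ParquetExec under the hood: the filter
    // can be used to prune row groups and pages, and only the projected
    // column is decoded.
    let df = ctx
        .read_parquet("data.parquet", ParquetReadOptions::default())
        .await?
        .filter(col("id").gt(lit(100)))?
        .select(vec![col("id")])?;

    let batches = df.collect().await?;
    println!("{} batches", batches.len());
    Ok(())
}
```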
@@ -1358,14 +1358,14 @@ impl<'a> StatisticsConverter<'a> {
/// # Parameters:
///
/// * `column_page_index`: The parquet column page indices, read from
/// `ParquetMetaData` column_index
/// `ParquetMetaData` column_index
///
/// * `column_offset_index`: The parquet column offset indices, read from
/// `ParquetMetaData` offset_index
/// `ParquetMetaData` offset_index
///
/// * `row_group_indices`: The indices of the row groups, that are used to
/// extract the column page index and offset index on a per row group
/// per column basis.
/// extract the column page index and offset index on a per row group
/// per column basis.
///
/// # Return Value
///
@@ -1486,13 +1486,13 @@ impl<'a> StatisticsConverter<'a> {
/// # Parameters:
///
/// * `column_offset_index`: The parquet column offset indices, read from
/// `ParquetMetaData` offset_index
/// `ParquetMetaData` offset_index
///
/// * `row_group_metadatas`: The metadata slice of the row groups, read
/// from `ParquetMetaData` row_groups
/// from `ParquetMetaData` row_groups
///
/// * `row_group_indices`: The indices of the row groups, that are used to
/// extract the column offset index on a per row group per column basis.
/// extract the column offset index on a per row group per column basis.
///
/// See docs on [`Self::data_page_mins`] for details.
pub fn data_page_row_counts<I>(