@@ -1046,39 +1046,41 @@ where
10461046
10471047/// Extracts Parquet statistics as Arrow arrays
10481048///
1049- /// This is used to convert Parquet statistics to Arrow arrays, with proper type
1050- /// conversions. This information can be used for pruning parquet files or row
1051- /// groups based on the statistics embedded in parquet files
1049+ /// This is used to convert Parquet statistics to Arrow [`ArrayRef`], with
1050+ /// proper type conversions. This information can be used for pruning Parquet
1051+ /// files, row groups, and data pages based on the statistics embedded in
1052+ /// Parquet metadata.
10521053///
10531054/// # Schemas
10541055///
1055- /// The schema of the parquet file and the arrow schema are used to convert the
1056- /// underlying statistics value (stored as a parquet value) into the
1057- /// corresponding Arrow value. For example, Decimals are stored as binary in
1058- /// parquet files.
1056+ /// The converter uses the schema of the Parquet file and the Arrow schema to
1057+ /// convert the underlying statistics value (stored as a parquet value) into the
1058+ /// corresponding Arrow value. For example, Decimals are stored as binary in
1059+ /// parquet files and this structure handles mapping them to the `i128`
1060+ /// representation used in Arrow.
10591061///
1060- /// The parquet_schema and arrow_schema do not have to be identical (for
1062+ /// Note: The Parquet schema and Arrow schema do not have to be identical (for
10611063/// example, the columns may be in different orders and one or the other schemas
10621064/// may have additional columns). The function [`parquet_column`] is used to
1063- /// match the column in the parquet file to the column in the arrow schema.
1065+ /// match the column in the Parquet schema to the column in the Arrow schema.
10641066#[ derive( Debug ) ]
10651067pub struct StatisticsConverter < ' a > {
1066- /// the index of the matched column in the parquet schema
1068+ /// the index of the matched column in the Parquet schema
10671069 parquet_column_index : Option < usize > ,
1068- /// The field (with data type) of the column in the arrow schema
1070+ /// The field (with data type) of the column in the Arrow schema
10691071 arrow_field : & ' a Field ,
10701072}
10711073
10721074impl < ' a > StatisticsConverter < ' a > {
1073- /// Return the index of the column in the parquet schema, if any
1075+ /// Return the index of the column in the Parquet schema, if any
10741076 ///
10751077 /// Returns `None` if the column was present in the Arrow schema, but not
10761078 /// present in the parquet file
10771079 pub fn parquet_column_index ( & self ) -> Option < usize > {
10781080 self . parquet_column_index
10791081 }
10801082
1081- /// Return the arrow schema's [`Field]` of the column in the arrow schema
1083+ /// Return the Arrow schema's [`Field`] of the column in the Arrow schema
10821084 pub fn arrow_field ( & self ) -> & ' a Field {
10831085 self . arrow_field
10841086 }
@@ -1093,7 +1095,7 @@ impl<'a> StatisticsConverter<'a> {
10931095 /// # Example
10941096 /// ```no_run
10951097 /// # use arrow::datatypes::Schema;
1096- /// # use arrow_array::ArrayRef;
1098+ /// # use arrow_array::{ ArrayRef, UInt64Array} ;
10971099 /// # use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
10981100 /// # use parquet::file::metadata::ParquetMetaData;
10991101 /// # fn get_parquet_metadata() -> ParquetMetaData { unimplemented!() }
@@ -1109,7 +1111,9 @@ impl<'a> StatisticsConverter<'a> {
11091111 /// let row_counts = converter.row_group_row_counts(metadata
11101112 /// .row_groups()
11111113 /// .iter()
1112- /// );
1114+ /// ).unwrap();
1115+ /// // file had 2 row groups, with 1024 and 23 rows respectively
1116+ /// assert_eq!(row_counts, Some(UInt64Array::from(vec![1024, 23])));
11131117 /// ```
11141118 pub fn row_group_row_counts < I > ( & self , metadatas : I ) -> Result < Option < UInt64Array > >
11151119 where
@@ -1199,8 +1203,9 @@ impl<'a> StatisticsConverter<'a> {
11991203 ///
12001204 /// # Example
12011205 /// ```no_run
1206+ /// # use std::sync::Arc;
12021207 /// # use arrow::datatypes::Schema;
1203- /// # use arrow_array::ArrayRef;
1208+ /// # use arrow_array::{ ArrayRef, Float64Array} ;
12041209 /// # use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
12051210 /// # use parquet::file::metadata::ParquetMetaData;
12061211 /// # fn get_parquet_metadata() -> ParquetMetaData { unimplemented!() }
@@ -1216,6 +1221,8 @@ impl<'a> StatisticsConverter<'a> {
12161221 /// let min_values: ArrayRef = converter
12171222 /// .row_group_mins(metadata.row_groups().iter())
12181223 /// .unwrap();
1224+ /// // if "foo" is a Float64 value, the returned array will contain Float64 values
1225+ /// assert_eq!(min_values, Arc::new(Float64Array::from(vec![Some(1.0), Some(2.0)])) as _);
12191226 /// ```
12201227 pub fn row_group_mins < I > ( & self , metadatas : I ) -> Result < ArrayRef >
12211228 where
0 commit comments