Skip to content

Commit 93c7a12

Browse files
authored
Mark Encoding::BIT_PACKED as deprecated and document its compatibility issues (#5348)
* Mark Encoding::BIT_PACKED as deprecated and document its compatibility issues
* Allow deprecated BIT_PACKED in parquet-layout binary
1 parent 31cf5ce commit 93c7a12

File tree

9 files changed

+27
-2
lines changed

9 files changed

+27
-2
lines changed

parquet/src/arrow/record_reader/definition_levels.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ impl PackedDecoder {
276276
self.packed_offset = 0;
277277
self.packed_count = match encoding {
278278
Encoding::RLE => 0,
279+
#[allow(deprecated)]
279280
Encoding::BIT_PACKED => data.len() * 8,
280281
_ => unreachable!("invalid level encoding: {}", encoding),
281282
};

parquet/src/basic.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,21 @@ pub enum Encoding {
256256
/// Usable for definition/repetition levels encoding and boolean values.
257257
RLE,
258258

259-
/// Bit packed encoding.
259+
/// **Deprecated** Bit-packed encoding.
260260
///
261261
/// This can only be used if the data has a known max width.
262262
/// Usable for definition/repetition levels encoding.
263+
///
264+
/// There are compatibility issues with files using this encoding.
265+
/// The parquet standard specifies the bits to be packed starting from the
266+
/// most-significant bit, but several implementations do not follow this bit order.
267+
/// Several other implementations also have issues reading this encoding
268+
/// because of incorrect assumptions about the length of the encoded data.
269+
///
270+
/// The RLE/bit-packing hybrid is more CPU and memory efficient and should be used instead.
271+
#[deprecated(
272+
note = "Please see documentation for compatibility issues and use the RLE/bit-packing hybrid encoding instead"
273+
)]
263274
BIT_PACKED,
264275

265276
/// Delta encoding for integers, either INT32 or INT64.
@@ -301,6 +312,7 @@ impl FromStr for Encoding {
301312
"PLAIN" | "plain" => Ok(Encoding::PLAIN),
302313
"PLAIN_DICTIONARY" | "plain_dictionary" => Ok(Encoding::PLAIN_DICTIONARY),
303314
"RLE" | "rle" => Ok(Encoding::RLE),
315+
#[allow(deprecated)]
304316
"BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED),
305317
"DELTA_BINARY_PACKED" | "delta_binary_packed" => Ok(Encoding::DELTA_BINARY_PACKED),
306318
"DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => {
@@ -910,6 +922,7 @@ impl TryFrom<parquet::Encoding> for Encoding {
910922
parquet::Encoding::PLAIN => Encoding::PLAIN,
911923
parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY,
912924
parquet::Encoding::RLE => Encoding::RLE,
925+
#[allow(deprecated)]
913926
parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED,
914927
parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED,
915928
parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => Encoding::DELTA_LENGTH_BYTE_ARRAY,
@@ -927,6 +940,7 @@ impl From<Encoding> for parquet::Encoding {
927940
Encoding::PLAIN => parquet::Encoding::PLAIN,
928941
Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY,
929942
Encoding::RLE => parquet::Encoding::RLE,
943+
#[allow(deprecated)]
930944
Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED,
931945
Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED,
932946
Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
@@ -1114,6 +1128,7 @@ impl str::FromStr for LogicalType {
11141128
}
11151129

11161130
#[cfg(test)]
1131+
#[allow(deprecated)] // allow BIT_PACKED encoding for the whole test module
11171132
mod tests {
11181133
use super::*;
11191134

parquet/src/bin/parquet-layout.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ fn encoding(encoding: parquet::format::Encoding) -> &'static str {
200200
Ok(Encoding::PLAIN) => "plain",
201201
Ok(Encoding::PLAIN_DICTIONARY) => "plain_dictionary",
202202
Ok(Encoding::RLE) => "rle",
203+
#[allow(deprecated)]
203204
Ok(Encoding::BIT_PACKED) => "bit_packed",
204205
Ok(Encoding::DELTA_BINARY_PACKED) => "delta_binary_packed",
205206
Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) => "delta_length_byte_array",

parquet/src/column/reader.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,7 @@ fn parse_v1_level(
580580
buf.slice(i32_size..i32_size + data_size),
581581
))
582582
}
583+
#[allow(deprecated)]
583584
Encoding::BIT_PACKED => {
584585
let bit_width = num_required_bits(max_level as u64);
585586
let num_bytes = ceil(num_buffered_values as usize * bit_width as usize, 8);

parquet/src/column/reader/decoder.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ impl LevelDecoder {
268268
decoder.set_data(data);
269269
Self::Rle(decoder)
270270
}
271+
#[allow(deprecated)]
271272
Encoding::BIT_PACKED => Self::Packed(BitReader::new(data), bit_width),
272273
_ => unreachable!("invalid level encoding: {}", encoding),
273274
}

parquet/src/encodings/decoding.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1136,6 +1136,7 @@ mod tests {
11361136
);
11371137

11381138
// unsupported
1139+
#[allow(deprecated)]
11391140
create_and_check_decoder::<Int32Type>(
11401141
Encoding::BIT_PACKED,
11411142
Some(nyi_err!("Encoding BIT_PACKED is not supported")),

parquet/src/encodings/encoding/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,7 @@ mod tests {
754754
);
755755

756756
// unsupported
757+
#[allow(deprecated)]
757758
create_and_check_encoder::<Int32Type>(
758759
Encoding::BIT_PACKED,
759760
Some(nyi_err!("Encoding BIT_PACKED is not supported")),

parquet/src/encodings/levels.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ pub fn max_buffer_size(
3535
let bit_width = num_required_bits(max_level as u64);
3636
match encoding {
3737
Encoding::RLE => RleEncoder::max_buffer_size(bit_width, num_buffered_values),
38+
#[allow(deprecated)]
3839
Encoding::BIT_PACKED => ceil(num_buffered_values * bit_width as usize, 8),
3940
_ => panic!("Unsupported encoding type {encoding}"),
4041
}
@@ -66,6 +67,7 @@ impl LevelEncoder {
6667
buffer.extend_from_slice(&[0; 4]);
6768
LevelEncoder::Rle(RleEncoder::new_from_buf(bit_width, buffer))
6869
}
70+
#[allow(deprecated)]
6971
Encoding::BIT_PACKED => {
7072
// Here we set full byte buffer without adjusting for num_buffered_values,
7173
// because byte buffer will already be allocated with size from

parquet/src/file/serialized_reader.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,9 @@ mod tests {
963963
assert_eq!(num_values, 8);
964964
assert_eq!(encoding, Encoding::PLAIN_DICTIONARY);
965965
assert_eq!(def_level_encoding, Encoding::RLE);
966-
assert_eq!(rep_level_encoding, Encoding::BIT_PACKED);
966+
#[allow(deprecated)]
967+
let expected_rep_level_encoding = Encoding::BIT_PACKED;
968+
assert_eq!(rep_level_encoding, expected_rep_level_encoding);
967969
assert!(statistics.is_none());
968970
true
969971
}

0 commit comments

Comments
 (0)